compress 16 bit: II

Grok Compression · Grok Compression · commit 5c1088cbbff5 · 2026-04-27T09:02:09.000-04:00
diff --git a/src/lib/core/scheduling/standard/CompressScheduler.cpp b/src/lib/core/scheduling/standard/CompressScheduler.cpp
@@ -119,6 +119,7 @@ bool CompressScheduler::scheduleT1(ITileProcessor* proc)
             block->mct_norms = mct_norms_;
             block->mct_numcomps = mct_numcomps_;
             block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());
+            block->use16BitDwt = tilec->is16BitDwt();
             blocks.push_back(block);
           }
         }
@@ -206,6 +207,7 @@ bool CompressScheduler::populateT1Flow(FlowComponent* flow)
             block->mct_norms = mct_norms_;
             block->mct_numcomps = mct_numcomps_;
             block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());
+            block->use16BitDwt = tilec->is16BitDwt();
             blocks.push_back(block);
           }
         }
diff --git a/src/lib/core/t1/BlockExec.h b/src/lib/core/t1/BlockExec.h
@@ -153,6 +153,7 @@ struct CompressBlockExec : public BlockExec
   int32_t* unencodedData = nullptr;
 #endif
   uint16_t mct_numcomps = 0;
+  bool use16BitDwt = false;
 
   // Delete copy constructor and assignment operator
   CompressBlockExec(const CompressBlockExec&) = delete;
diff --git a/src/lib/core/t1/part1/Coder.cpp b/src/lib/core/t1/part1/Coder.cpp
@@ -71,6 +71,25 @@ bool Coder::preCompress(CompressBlockExec* block, uint32_t& maximum)
       tile_index += tileLineAdvance;
     }
   }
+  else if(block->use16BitDwt)
+  {
+    // 16-bit DWT produces int32_t values (int16 range) — read directly, not as float
+    double quant = 1.0 / block->stepsize;
+    for(auto j = 0U; j < h; ++j)
+    {
+      for(auto i = 0U; i < w; ++i)
+      {
+        int32_t temp = (int32_t)grk_lrintf((float)(((double)block->tiledp[tile_index++] * quant)) *
+                                           (1 << T1_NMSEDEC_FRACBITS));
+        int32_t mag = temp * ((temp > 0) - (temp < 0));
+        if((uint32_t)mag > maximum)
+          maximum = (uint32_t)mag;
+        int32_t sgn = int32_t((uint32_t)(mag != temp) * 0x80000000);
+        uncompressedData[cblk_index++] = sgn | mag;
+      }
+      tile_index += tileLineAdvance;
+    }
+  }
   else
   {
     const auto* const tiledp = (float*)block->tiledp;
diff --git a/src/lib/core/t1/part15/CoderOJPH.cpp b/src/lib/core/t1/part15/CoderOJPH.cpp
@@ -137,6 +137,24 @@ bool T1OJPH::preCompress([[maybe_unused]] CompressBlockExec* block)
       tiledp += tileLineAdvance;
     }
   }
+  else if(block->use16BitDwt)
+  {
+    // 16-bit DWT produces int32_t values (int16 range) — read directly, not as float
+    auto tiledp = block->tiledp;
+    for(auto j = 0U; j < h; ++j)
+    {
+      for(auto i = 0U; i < w; ++i)
+      {
+        int32_t t = (int32_t)((float)*tiledp++ * block->inv_step_ht * (float)(1 << shift));
+        uint32_t val = t >= 0 ? (uint32_t)t : -(uint32_t)t;
+        uint32_t sign = t >= 0 ? 0U : 0x80000000U;
+        int32_t res = (int32_t)(sign | val);
+        unencoded_data[cblk_index] = res;
+        cblk_index++;
+      }
+      tiledp += tileLineAdvance;
+    }
+  }
   else
   {
     auto tiledp = (float*)block->tiledp;
diff --git a/src/lib/core/tile_processor/TileProcessorCompress.cpp b/src/lib/core/tile_processor/TileProcessorCompress.cpp
@@ -123,6 +123,49 @@ bool TileProcessorCompress::preCompressTile([[maybe_unused]] size_t thread_id)
       return false;
     auto unreducedTileComp = tileComp;
     tileComp->createWindow(Rect32(unreducedTileComp));
+
+    // 16-bit forward DWT eligibility.
+    //
+    // Reversible 5/3 (ITU-T T.800 Annex F.3.4):
+    //   The 5/3 analysis lifting steps are:
+    //     D[n] -= floor((S[n] + S[n+1]) / 2)         (prediction)
+    //     S[n] += floor((D[n-1] + D[n] + 2) / 4)     (update)
+    //   BIBO (Bounded-Input Bounded-Output) gain analysis shows intermediate
+    //   values can grow by at most 2^3 (≤6 levels) or 2^4 (>6 levels), plus
+    //   1 extra bit when the reversible colour transform (RCT, ITU-T T.800
+    //   Annex G.2) is applied.  The update step uses an overflow-safe
+    //   averaging operator (see WaveletFwd.cpp) so only the prediction step's
+    //   pre-accumulation headroom limits the working precision:
+    //     prec + headroom ≤ 16
+    //   where headroom = 4 (no MCT) or 5 (MCT, RCT component).
+    //
+    // Irreversible 9/7 (ITU-T T.800 Annex F.3.5):
+    //   The 9/7 analysis uses four lifting steps with coefficients
+    //   α=-1.586, β=-0.053, γ=0.883, δ=0.444 followed by K-scaling.
+    //   Because the lowpass BIBO gain per level ≈ 6× (dominated by the large
+    //   |α| coefficient), intermediate values compound across decomposition
+    //   levels.  Fixed-point 16-bit processing is feasible only when
+    //   prec + 6 ≤ 16  →  prec ≤ 10.
+    //   The implementation uses an odd-branch (high-pass) halving strategy
+    //   that stores D samples at half magnitude through the lifting chain,
+    //   with adjusted coefficients and a normalizing factor computed from
+    //   BIBO gains (see WaveletFwd.cpp).
+    //   MCT components are excluded because the irreversible colour transform
+    //   (ICT) operates on float buffers.
+    auto tccp = tcp_->tccps_ + compno;
+    if(tccp->qmfbid_ == 1)
+    {
+      bool isMctComp = needsMctDecompress(compno) && tcp_->mct_ == 1;
+      uint32_t headroom = isMctComp ? 5 : 4;
+      if(imageComp->prec + headroom <= 16)
+        tileComp->setUse16BitDwt(true);
+    }
+    else if(tccp->qmfbid_ == 0)
+    {
+      bool isMctComp = needsMctDecompress(compno) && tcp_->mct_ == 1;
+      if(!isMctComp && imageComp->prec + 6 <= 16)
+        tileComp->setUse16BitDwt(true);
+    }
   }
   uint32_t numTiles = (uint32_t)cp_->t_grid_height_ * cp_->t_grid_width_;
 
diff --git a/src/lib/core/wavelet/WaveletCommon.h b/src/lib/core/wavelet/WaveletCommon.h
@@ -50,6 +50,19 @@ class dwt97
                 uint32_t rows, float dcShift = 0.0f);
 };
 
+class dwt97_16
+{
+public:
+  // Adapter exposing the int16_t 9/7 forward DWT routines through the int32_t template interface.
+  // The template hands in an int32_t* scratch buffer, which is cast to int16_t* internally;
+  // that reuse is safe because sizeof(int32_t) >= sizeof(int16_t), so the buffer is big enough.
+  void encode_v(int32_t* res, int32_t* scratch, uint32_t height, uint8_t parity, uint32_t stride,
+                uint32_t cols, int32_t dcShift = 0, bool intInput = false);
+
+  void encode_h(int32_t* row, int32_t* scratch, uint32_t width, uint8_t parity, uint32_t stride,
+                uint32_t rows, int32_t dcShift = 0);
+};
+
 template<typename T, size_t N>
 struct vec
 {
diff --git a/src/lib/core/wavelet/WaveletFwd.cpp b/src/lib/core/wavelet/WaveletFwd.cpp

Original file line number	Diff line number	Diff line change
`@@ -119,6 +119,7 @@ bool CompressScheduler::scheduleT1(ITileProcessor* proc)`
`119`	`119`	`block->mct_norms = mct_norms_;`
`120`	`120`	`block->mct_numcomps = mct_numcomps_;`
`121`	`121`	`block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());`
	`122`	`+ block->use16BitDwt = tilec->is16BitDwt();`
`122`	`123`	`blocks.push_back(block);`
`123`	`124`	`}`
`124`	`125`	`}`
`@@ -206,6 +207,7 @@ bool CompressScheduler::populateT1Flow(FlowComponent* flow)`
`206`	`207`	`block->mct_norms = mct_norms_;`
`207`	`208`	`block->mct_numcomps = mct_numcomps_;`
`208`	`209`	`block->k_msbs = (uint8_t)(band->maxBitPlanes_ - cblk->numbps());`
	`210`	`+ block->use16BitDwt = tilec->is16BitDwt();`
`209`	`211`	`blocks.push_back(block);`
`210`	`212`	`}`
`211`	`213`	`}`