Skip to content

Commit a8235c8

Browse files
committed
add ability to do a random rotation before scalar quantization, inspired by recent works, but cite the original paper by Chee et al. from Cornell
1 parent f4ff5d1 commit a8235c8

9 files changed

Lines changed: 66 additions & 78 deletions

File tree

.github/workflows/build.yml

Lines changed: 0 additions & 14 deletions
This file was deleted.

.github/workflows/python-publish.yml

Lines changed: 0 additions & 36 deletions
This file was deleted.

.github/workflows/test.yml

Lines changed: 0 additions & 19 deletions
This file was deleted.

README.md

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -815,3 +815,15 @@ assert loss.item() >= 0
815815
url = {https://arxiv.org/abs/2509.10140},
816816
}
817817
```
818+
819+
```bibtex
820+
@misc{chee2024quip2bitquantizationlarge,
821+
title = {QuIP: 2-Bit Quantization of Large Language Models With Guarantees},
822+
author = {Jerry Chee and Yaohui Cai and Volodymyr Kuleshov and Christopher De Sa},
823+
year = {2024},
824+
eprint = {2307.13304},
825+
archivePrefix = {arXiv},
826+
primaryClass = {cs.LG},
827+
url = {https://arxiv.org/abs/2307.13304},
828+
}
829+
```

examples/autoencoder_fsq.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,15 +32,15 @@ def default(val, d):
3232

3333
# classes
3434

35-
def SimpleFSQAutoEncoder(levels: list[int]):
35+
def SimpleFSQAutoEncoder(levels: list[int], orthogonal_rotation: bool = False):
3636
return Sequential(
3737
nn.Conv2d(1, 16, kernel_size = 3, stride = 1, padding = 1),
3838
nn.MaxPool2d(kernel_size = 2, stride = 2),
3939
nn.GELU(),
4040
nn.Conv2d(16, 32, kernel_size = 3, stride = 1, padding = 1),
4141
nn.MaxPool2d(kernel_size = 2, stride = 2),
4242
nn.Conv2d(32, len(levels), kernel_size = 1),
43-
FSQ(levels),
43+
FSQ(levels, orthogonal_rotation = orthogonal_rotation),
4444
nn.Conv2d(len(levels), 32, kernel_size = 3, stride = 1, padding = 1),
4545
nn.Upsample(scale_factor = 2, mode = "nearest"),
4646
nn.Conv2d(32, 16, kernel_size = 3, stride = 1, padding = 1),
@@ -54,14 +54,15 @@ def train(
5454
lr = 3e-4,
5555
levels = [8, 6, 5],
5656
seed = 1234,
57-
batch_size = 256
57+
batch_size = 256,
58+
orthogonal_rotation = False
5859
):
5960
torch.random.manual_seed(seed)
6061
device = "cuda" if torch.cuda.is_available() else "cpu"
6162

6263
num_codes = math.prod(levels)
6364

64-
model = SimpleFSQAutoEncoder(levels).to(device)
65+
model = SimpleFSQAutoEncoder(levels, orthogonal_rotation = orthogonal_rotation).to(device)
6566

6667
opt = AdamW(model.parameters(), lr = lr)
6768

examples/autoencoder_lfq.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,7 +65,8 @@ def train(
6565
entropy_loss_weight = 0.02,
6666
diversity_gamma = 1.,
6767
spherical = True,
68-
batch_size = 256
68+
batch_size = 256,
69+
orthogonal_rotation = False
6970
):
7071
torch.random.manual_seed(seed)
7172
device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -74,7 +75,8 @@ def train(
7475
codebook_size = codebook_size,
7576
entropy_loss_weight = entropy_loss_weight,
7677
diversity_gamma = diversity_gamma,
77-
spherical = spherical
78+
spherical = spherical,
79+
orthogonal_rotation = orthogonal_rotation
7880
).to(device)
7981

8082
opt = AdamW(model.parameters(), lr = lr)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "vector-quantize-pytorch"
3-
version = "1.28.0"
3+
version = "1.28.1"
44
description = "Vector Quantization - Pytorch"
55
authors = [
66
{ name = "Phil Wang", email = "[email protected]" }

vector_quantize_pytorch/finite_scalar_quantization.py

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,8 @@ def __init__(
7676
force_quantization_f32 = True,
7777
preserve_symmetry = False,
7878
noise_dropout = 0.,
79-
bound_hard_clamp = False # for residual fsq, if input is pre-softclamped to the right range
79+
bound_hard_clamp = False, # for residual fsq, if input is pre-softclamped to the right range
80+
orthogonal_rotation = False # increase codebook utilization. ensure levels are symmetric! https://arxiv.org/abs/2307.13304v2
8081
):
8182
super().__init__()
8283

@@ -132,6 +133,17 @@ def __init__(
132133

133134
self.bound_hard_clamp = bound_hard_clamp
134135

136+
self.orthogonal_rotation = orthogonal_rotation
137+
138+
if orthogonal_rotation:
139+
is_symmetric = len(set(levels)) == 1
140+
if not is_symmetric:
141+
print('orthogonal_rotation is not recommended for FSQ with asymmetric levels (i.e. where the number of bins differ across dimensions)')
142+
143+
orthogonal_rot = torch.empty(codebook_dim, codebook_dim)
144+
nn.init.orthogonal_(orthogonal_rot)
145+
self.register_buffer('orthogonal_rot', orthogonal_rot)
146+
135147
def bound(self, z, eps = 1e-3, hard_clamp = False):
136148
""" Bound `z`, an array of shape (..., d). """
137149
maybe_tanh = tanh if not hard_clamp else partial(clamp, min = -1., max = 1.)
@@ -219,6 +231,9 @@ def indices_to_codes(self, indices):
219231

220232
codes = self._indices_to_codes(indices)
221233

234+
if self.orthogonal_rotation:
235+
codes = codes @ self.orthogonal_rot.t()
236+
222237
if self.keep_num_codebooks_dim:
223238
codes = rearrange(codes, '... c d -> ... (c d)')
224239

@@ -253,6 +268,9 @@ def forward(self, z):
253268

254269
z = rearrange(z, 'b n (c d) -> b n c d', c = self.num_codebooks)
255270

271+
if self.orthogonal_rotation:
272+
z = z @ self.orthogonal_rot
273+
256274
# whether to force quantization step to be full precision or not
257275

258276
force_f32 = self.force_quantization_f32
@@ -275,6 +293,9 @@ def forward(self, z):
275293

276294
codes = self.maybe_apply_noise(codes)
277295

296+
if self.orthogonal_rotation:
297+
codes = codes @ self.orthogonal_rot.t()
298+
278299
codes = rearrange(codes, 'b n c d -> b n (c d)')
279300

280301
codes = codes.to(orig_dtype)

vector_quantize_pytorch/lookup_free_quantization.py

Lines changed: 22 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,8 @@ def __init__(
116116
experimental_softplus_entropy_loss = False,
117117
entropy_loss_offset = 5., # how much to shift the loss before softplus
118118
spherical = False, # from https://arxiv.org/abs/2406.07548
119-
force_quantization_f32 = True # will force the quantization step to be full precision
119+
force_quantization_f32 = True, # will force the quantization step to be full precision
120+
orthogonal_rotation = False # increase codebook utilization without aux losses, inspired by https://arxiv.org/abs/2307.13304v2
120121
):
121122
super().__init__()
122123

@@ -165,6 +166,15 @@ def __init__(
165166
self.spherical = spherical
166167
self.maybe_l2norm = (lambda t: l2norm(t) * self.codebook_scale) if spherical else identity
167168

169+
# orthogonal rotation
170+
171+
self.orthogonal_rotation = orthogonal_rotation
172+
173+
if orthogonal_rotation:
174+
orthogonal_rot = torch.empty(codebook_dim, codebook_dim)
175+
nn.init.orthogonal_(orthogonal_rot)
176+
self.register_buffer('orthogonal_rot', orthogonal_rot)
177+
168178
# entropy aux loss related weights
169179

170180
assert 0 < frac_per_sample_entropy <= 1.
@@ -234,6 +244,9 @@ def indices_to_codes(
234244

235245
codes = self.maybe_l2norm(codes)
236246

247+
if self.orthogonal_rotation:
248+
codes = codes @ self.orthogonal_rot.t()
249+
237250
codes = rearrange(codes, '... c d -> ... (c d)')
238251

239252
# whether to project codes out to original dimensions
@@ -287,6 +300,9 @@ def forward(
287300

288301
x = rearrange(x, 'b n (c d) -> b n c d', c = self.num_codebooks)
289302

303+
if self.orthogonal_rotation:
304+
x = x @ self.orthogonal_rot
305+
290306
# maybe l2norm
291307

292308
x = self.maybe_l2norm(x)
@@ -412,6 +428,11 @@ def forward(
412428
if force_f32:
413429
x = x.type(orig_dtype)
414430

431+
# rotate back if needed
432+
433+
if self.orthogonal_rotation:
434+
x = x @ self.orthogonal_rot.t()
435+
415436
# merge back codebook dim
416437

417438
x = rearrange(x, 'b n c d -> b n (c d)')

0 commit comments

Comments (0)