# limitations under the License.

import torch
from absl import flags, logging
from absl.testing import absltest, parameterized

from emerging_optimizers.mixin import WeightDecayMixin


# Command-line knobs for the test run: target device and optional RNG seed.
flags.DEFINE_enum("device", "cpu", ["cpu", "cuda"], "Device to run tests on")
flags.DEFINE_integer("seed", None, "Random seed for reproducible tests")
FLAGS = flags.FLAGS


def setUpModule() -> None:
    """Seed torch's RNGs from --seed (when provided) before any test runs.

    Seeds both the CPU generator and, if CUDA is available, every CUDA
    device, so test runs are reproducible across devices.
    """
    if FLAGS.seed is not None:
        logging.info("Setting random seed to %d", FLAGS.seed)
        torch.manual_seed(FLAGS.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(FLAGS.seed)
class _WeightDecayTestHelper(WeightDecayMixin):
    """Thin wrapper so we can set weight_decay_method and call the mixin."""

    def __init__(self, method: str):
        # The mixin dispatches on this attribute; no other state is needed.
        self.weight_decay_method = method
2741
2842
class WeightDecayMixinTest(parameterized.TestCase):
    """Behavioral tests for each weight-decay method of WeightDecayMixin.

    All tensors are created on the device selected via the --device flag.
    """

    @classmethod
    def setUpClass(cls):
        # Resolve the target device once; every test builds its tensors on it.
        cls.device = FLAGS.device

    @parameterized.parameters("decoupled", "independent", "l2", "palm")
    def test_zero_weight_decay_is_noop(self, method):
        """Neither p nor grad should change when weight_decay is 0."""
        helper = _WeightDecayTestHelper(method)
        p = torch.tensor([1.0, 2.0, 3.0], device=self.device)
        grad = torch.tensor([0.5, -0.5, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        helper._apply_weight_decay_inplace(p, grad, lr=0.1, weight_decay=0.0)

        # NOTE(review): the p assertion was hidden in the diff; reconstructed
        # from the docstring contract — confirm against upstream.
        torch.testing.assert_close(p, p_orig, atol=0, rtol=0)
        torch.testing.assert_close(grad, grad_orig, atol=0, rtol=0)

    @parameterized.parameters(
        {"lr": 0.25, "wd": 0.5},
        {"lr": 0.025, "wd": 1.0},
        {"lr": 1.0, "wd": 0.05},
    )
    def test_decoupled(self, lr, wd):
        """Decoupled: p <- p * (1 - wd * lr), grad untouched."""
        helper = _WeightDecayTestHelper("decoupled")
        p = torch.tensor([4.0, -2.0, 0.0, 8.0], device=self.device)
        grad = torch.tensor([1.0, 2.0, 1.0, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        helper._apply_weight_decay_inplace(p, grad, lr=lr, weight_decay=wd)

        # NOTE(review): expected-value assertion hidden in the diff;
        # reconstructed from the docstring contract — confirm upstream.
        torch.testing.assert_close(p, p_orig * (1 - wd * lr))
        torch.testing.assert_close(grad, grad_orig, atol=0, rtol=0)

    @parameterized.parameters(
        {"lr": 0.25, "wd": 0.5},
        {"lr": 0.025, "wd": 1.0},
        {"lr": 1.0, "wd": 0.05},
    )
    def test_independent(self, lr, wd):
        """Independent: p <- p * (1 - wd), grad untouched, lr irrelevant."""
        helper = _WeightDecayTestHelper("independent")
        p = torch.tensor([4.0, -2.0, 0.0, 7.0], device=self.device)
        grad = torch.tensor([1.0, 1.0, 1.0, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        helper._apply_weight_decay_inplace(p, grad, lr=lr, weight_decay=wd)

        # NOTE(review): assertions hidden in the diff; reconstructed from the
        # docstring contract — confirm upstream.
        torch.testing.assert_close(p, p_orig * (1 - wd))
        torch.testing.assert_close(grad, grad_orig, atol=0, rtol=0)

    def test_independent_ignores_lr(self):
        """Two different lr values must produce identical results for independent decay."""
        wd = 0.3
        p1 = torch.tensor([5.0, -3.0, 1.0], device=self.device)
        p2 = p1.clone()
        grad1 = torch.tensor([1.0, 1.0, 1.0], device=self.device)
        grad2 = grad1.clone()

        _WeightDecayTestHelper("independent")._apply_weight_decay_inplace(p1, grad1, lr=0.001, weight_decay=wd)
        _WeightDecayTestHelper("independent")._apply_weight_decay_inplace(p2, grad2, lr=100.0, weight_decay=wd)

        torch.testing.assert_close(p1, p2, atol=0, rtol=0)

    @parameterized.parameters(
        # NOTE(review): these parameter dicts were unchanged diff context and
        # hidden from view; values taken from the pre-change originals —
        # confirm against upstream.
        {"lr": 0.1, "wd": 0.5},
        {"lr": 0.01, "wd": 1.0},
        {"lr": 1.0, "wd": 0.01},
    )
    def test_l2(self, lr, wd):
        """L2: grad <- grad + p * wd, p untouched."""
        helper = _WeightDecayTestHelper("l2")
        p = torch.tensor([4.0, -2.0, 0.0, 7.0], device=self.device)
        grad = torch.tensor([1.0, 1.0, 1.0, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        helper._apply_weight_decay_inplace(p, grad, lr=lr, weight_decay=wd)

        # NOTE(review): assertions hidden in the diff; reconstructed from the
        # docstring contract — confirm upstream.
        torch.testing.assert_close(p, p_orig, atol=0, rtol=0)
        torch.testing.assert_close(grad, grad_orig + p_orig * wd)

    def test_l2_ignores_lr(self):
        """Two different lr values must produce identical results for L2 decay."""
        wd = 0.3
        p1 = torch.tensor([5.0, -3.0, 1.0], device=self.device)
        p2 = p1.clone()
        grad1 = torch.tensor([1.0, 1.0, 1.0], device=self.device)
        grad2 = grad1.clone()

        _WeightDecayTestHelper("l2")._apply_weight_decay_inplace(p1, grad1, lr=0.001, weight_decay=wd)
        _WeightDecayTestHelper("l2")._apply_weight_decay_inplace(p2, grad2, lr=100.0, weight_decay=wd)

        torch.testing.assert_close(grad1, grad2, atol=0, rtol=0)

    @parameterized.parameters(
        {"lr": 0.25, "wd": 0.5},
        {"lr": 0.025, "wd": 1.0},
        {"lr": 1.0, "wd": 0.05},
    )
    def test_palm(self, lr, wd):
        """PaLM: p <- p * (1 - wd * lr^2), grad untouched."""
        helper = _WeightDecayTestHelper("palm")
        p = torch.tensor([4.0, -2.0, 0.0, 7.0], device=self.device)
        grad = torch.tensor([1.0, 1.0, 1.0, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        helper._apply_weight_decay_inplace(p, grad, lr=lr, weight_decay=wd)

        # NOTE(review): assertions hidden in the diff; reconstructed from the
        # docstring contract — confirm upstream.
        torch.testing.assert_close(p, p_orig * (1 - wd * lr**2))
        torch.testing.assert_close(grad, grad_orig, atol=0, rtol=0)

    def test_default_method_is_l2(self):
        """When weight_decay_method attribute is absent, default to L2."""
        helper = WeightDecayMixin()
        p = torch.tensor([4.0, -2.0, 0.0, 7.0], device=self.device)
        grad = torch.tensor([1.0, 1.0, 1.0, 1.0], device=self.device)
        p_orig, grad_orig = p.clone(), grad.clone()

        wd = 0.5
        helper._apply_weight_decay_inplace(p, grad, lr=0.1, weight_decay=wd)

        # NOTE(review): the apply call's asserts were hidden in the diff;
        # reconstructed assuming L2 semantics per the docstring — confirm
        # upstream.
        torch.testing.assert_close(p, p_orig, atol=0, rtol=0)
        torch.testing.assert_close(grad, grad_orig + p_orig * wd)

    def test_invalid_method_raises(self):
        """An unrecognized weight_decay_method must raise ValueError."""
        helper = _WeightDecayTestHelper("bogus")
        p = torch.tensor([1.0], device=self.device)
        grad = torch.tensor([1.0], device=self.device)
        with self.assertRaises(ValueError):
            helper._apply_weight_decay_inplace(p, grad, lr=0.1, weight_decay=0.1)
162180
0 commit comments