Commit e5b1d3e

Optimize: Guard against unsupported input types

1 parent 1d13f8c commit e5b1d3e
2 files changed: +183 -20 lines changed

pytensor/tensor/optimize.py

Lines changed: 76 additions & 17 deletions
@@ -7,12 +7,13 @@

 import pytensor.scalar as ps
 from pytensor.compile.function import function
-from pytensor.gradient import grad, jacobian
+from pytensor.gradient import grad, grad_not_implemented, jacobian
 from pytensor.graph.basic import Apply, Constant
 from pytensor.graph.fg import FunctionGraph
 from pytensor.graph.op import ComputeMapType, HasInnerGraph, Op, StorageMapType
 from pytensor.graph.replace import graph_replace
 from pytensor.graph.traversal import ancestors, truncated_graph_inputs
+from pytensor.scalar import ScalarType, ScalarVariable
 from pytensor.tensor.basic import (
     atleast_2d,
     concatenate,
@@ -22,6 +23,7 @@
 )
 from pytensor.tensor.math import dot
 from pytensor.tensor.slinalg import solve
+from pytensor.tensor.type import DenseTensorType
 from pytensor.tensor.variable import TensorVariable, Variable


@@ -140,23 +142,19 @@ def _find_optimization_parameters(objective: TensorVariable, x: TensorVariable):


 def _get_parameter_grads_from_vector(
-    grad_wrt_args_vector: Variable,
-    x_star: Variable,
-    args: Sequence[Variable],
+    grad_wrt_args_vector: TensorVariable,
+    x_star: TensorVariable,
+    args: Sequence[TensorVariable | ScalarVariable],
     output_grad: Variable,
 ):
     """
     Given a single concatenated vector of objective function gradients with respect to raveled optimization parameters,
     returns the contribution of each parameter to the total loss function, with the unraveled shape of the parameter.
     """
-    grad_wrt_args_vector = cast(TensorVariable, grad_wrt_args_vector)
-    x_star = cast(TensorVariable, x_star)
-
     cursor = 0
     grad_wrt_args = []

     for arg in args:
-        arg = cast(TensorVariable, arg)
         arg_shape = arg.shape
         arg_size = arg_shape.prod()
         arg_grad = grad_wrt_args_vector[:, cursor : cursor + arg_size].reshape(
@@ -375,14 +373,18 @@ def __init__(
         method: str = "brent",
         optimizer_kwargs: dict | None = None,
     ):
-        if not cast(TensorVariable, x).ndim == 0:
+        if not (isinstance(x, TensorVariable) and x.ndim == 0):
             raise ValueError(
                 "The variable `x` must be a scalar (0-dimensional) tensor for minimize_scalar."
             )
-        if not cast(TensorVariable, objective).ndim == 0:
+        if not (isinstance(objective, TensorVariable) and objective.ndim == 0):
             raise ValueError(
                 "The objective function must be a scalar (0-dimensional) tensor for minimize_scalar."
             )
+        if x not in ancestors([objective]):
+            raise ValueError(
+                "The variable `x` must be an input to the computational graph of the objective function."
+            )
         self.fgraph = FunctionGraph([x, *args], [objective])

         self.method = method
@@ -416,7 +418,19 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.success)

     def L_op(self, inputs, outputs, output_grads):
+        # TODO: Handle disconnected inputs, instead of zeroing them out or failing for unsupported types
         x, *args = inputs
+        if non_supported_types := tuple(
+            inp.type
+            for inp in inputs
+            if not isinstance(inp.type, DenseTensorType | ScalarType)
+        ):
+            # TODO: Support SparseTensorTypes
+            # TODO: Remaining types are likely just disconnected anyway
+            msg = f"Minimize gradient not implemented due to inputs of type {non_supported_types}"
+            return [
+                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
+            ]
         x_star, _ = outputs
         output_grad, _ = output_grads

@@ -468,7 +482,6 @@ def minimize_scalar(
         Symbolic boolean flag indicating whether the minimization routine reported convergence to a minimum
         value, based on the requested convergence criteria.
     """
-
    args = _find_optimization_parameters(objective, x)

    minimize_scalar_op = MinimizeScalarOp(
@@ -499,7 +512,11 @@ def __init__(
         use_vectorized_jac: bool = False,
         optimizer_kwargs: dict | None = None,
     ):
-        if not cast(TensorVariable, objective).ndim == 0:
+        if not (isinstance(x, TensorVariable) and x.ndim in (0, 1)):
+            raise ValueError(
+                "The variable `x` must be a scalar or vector (0-or-1-dimensional) tensor for minimize."
+            )
+        if not (isinstance(objective, TensorVariable) and objective.ndim == 0):
             raise ValueError(
                 "The objective function must be a scalar (0-dimensional) tensor for minimize."
             )
@@ -570,7 +587,19 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.success)

     def L_op(self, inputs, outputs, output_grads):
+        # TODO: Handle disconnected inputs, instead of zeroing them out or failing for unsupported types
         x, *args = inputs
+        if non_supported_types := tuple(
+            inp.type
+            for inp in inputs
+            if not isinstance(inp.type, DenseTensorType | ScalarType)
+        ):
+            # TODO: Support SparseTensorTypes
+            # TODO: Remaining types are likely just disconnected anyway
+            msg = f"MinimizeOp gradient not implemented due to inputs of type {non_supported_types}"
+            return [
+                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
+            ]
         x_star, _success = outputs
         output_grad, _ = output_grads

@@ -672,13 +701,15 @@ def __init__(
         hess: bool = False,
         optimizer_kwargs=None,
     ):
-        if not equation.ndim == 0:
+        if not (isinstance(variables, TensorVariable) and variables.ndim == 0):
+            raise ValueError(
+                "The variable `x` must be a scalar (0-dimensional) tensor for root_scalar."
+            )
+        if not (isinstance(equation, TensorVariable) and equation.ndim == 0):
             raise ValueError(
                 "The equation must be a scalar (0-dimensional) tensor for root_scalar."
             )
-        if not isinstance(variables, Variable) or variables not in ancestors(
-            [equation]
-        ):
+        if variables not in ancestors([equation]):
             raise ValueError(
                 "The variable `variables` must be an input to the computational graph of the equation."
             )
@@ -741,7 +772,19 @@ def perform(self, node, inputs, outputs):
         outputs[1][0] = np.bool_(res.converged)

     def L_op(self, inputs, outputs, output_grads):
+        # TODO: Handle disconnected inputs, instead of zeroing them out or failing for unsupported types
         x, *args = inputs
+        if non_supported_types := tuple(
+            inp.type
+            for inp in inputs
+            if not isinstance(inp.type, DenseTensorType | ScalarType)
+        ):
+            # TODO: Support SparseTensorTypes
+            # TODO: Remaining types are likely just disconnected anyway
+            msg = f"RootScalarOp gradient not implemented due to inputs of type {non_supported_types}"
+            return [
+                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
+            ]
         x_star, _ = outputs
         output_grad, _ = output_grads

@@ -833,7 +876,11 @@ def __init__(
         optimizer_kwargs: dict | None = None,
         use_vectorized_jac: bool = False,
     ):
-        if cast(TensorVariable, variables).ndim != cast(TensorVariable, equations).ndim:
+        if not isinstance(variables, TensorVariable):
+            raise ValueError("The variable `variables` must be a tensor for root.")
+        if not isinstance(equations, TensorVariable):
+            raise ValueError("The equations must be a tensor for root.")
+        if variables.ndim != equations.ndim:
             raise ValueError(
                 "The variable `variables` must have the same number of dimensions as the equations."
             )
@@ -922,7 +969,19 @@ def L_op(
         outputs: Sequence[Variable],
         output_grads: Sequence[Variable],
     ) -> list[Variable]:
+        # TODO: Handle disconnected inputs, instead of zeroing them out or failing for unsupported types
         x, *args = inputs
+        if non_supported_types := tuple(
+            inp.type
+            for inp in inputs
+            if not isinstance(inp.type, DenseTensorType | ScalarType)
+        ):
+            # TODO: Support SparseTensorTypes
+            # TODO: Remaining types are likely just disconnected anyway
+            msg = f"RootOp gradient not implemented due to inputs of type {non_supported_types}"
+            return [
+                grad_not_implemented(self, i, inp, msg) for i, inp in enumerate(inputs)
+            ]
         x_star, _ = outputs
         output_grad, _ = output_grads

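The type guard added to the four L_op methods above is repeated verbatim; only the Op name in the error message differs. A possible follow-up, not part of this commit, would be to factor it into a shared module-level helper. A minimal sketch, assuming the imports this diff adds to optimize.py (grad_not_implemented, DenseTensorType, ScalarType) and using the hypothetical name _grads_for_unsupported_inputs:

def _grads_for_unsupported_inputs(op, inputs):
    # Hypothetical helper (not in this commit): return grad_not_implemented
    # placeholders when any input type is neither a dense tensor nor a scalar,
    # mirroring the guard each L_op above repeats.
    non_supported_types = tuple(
        inp.type
        for inp in inputs
        if not isinstance(inp.type, DenseTensorType | ScalarType)
    )
    if not non_supported_types:
        return None
    msg = f"{type(op).__name__} gradient not implemented due to inputs of type {non_supported_types}"
    return [grad_not_implemented(op, i, inp, msg) for i, inp in enumerate(inputs)]

Each L_op could then start with `if (bad_grads := _grads_for_unsupported_inputs(self, inputs)) is not None: return bad_grads`, keeping the TODOs about sparse and disconnected inputs in a single place.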
tests/tensor/test_optimize.py

Lines changed: 107 additions & 3 deletions
@@ -3,9 +3,11 @@

 import pytensor
 import pytensor.tensor as pt
-from pytensor import config, function
-from pytensor.graph import Apply, Op
-from pytensor.tensor import scalar
+from pytensor import Variable, config, function
+from pytensor.gradient import NullTypeGradError, disconnected_type
+from pytensor.graph import Apply, Op, Type
+from pytensor.scalar import float64
+from pytensor.tensor import alloc, scalar, scalar_from_tensor, tensor_from_scalar
 from pytensor.tensor.optimize import minimize, minimize_scalar, root, root_scalar
 from tests import unittest_tools as utt

@@ -248,3 +250,105 @@ def L_op(self, inputs, outputs, out_grads):
     np.testing.assert_allclose(
         opt_x_res, 0, atol=1e-15 if floatX == "float64" else 1e-6
     )
+
+
+@pytest.mark.parametrize("optimize_op", (minimize, minimize_scalar, root, root_scalar))
+def test_minimize_grad_scalar_arg(optimize_op):
+    # Regression test for https://github.com/pymc-devs/pytensor/pull/1744
+    x = scalar("x")
+    theta = float64("theta")
+    obj = tensor_from_scalar((scalar_from_tensor(x) + theta) ** 2)
+    x0, _ = optimize_op(obj, x)
+
+    # Confirm theta is a direct input to the node
+    assert x0.owner.inputs[1] is theta
+
+    grad_wrt_theta = pt.grad(x0, theta)
+    np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: np.e}), -1)
+
+
+@pytest.mark.parametrize("optimize_op", (minimize, minimize_scalar, root, root_scalar))
+def test_minimize_grad_disconnected_numerical_inp(optimize_op):
+    x = scalar("x", dtype="float64")
+    theta = scalar("theta", dtype="int64")
+    obj = alloc(x**2, theta).sum()  # repeat theta times and sum
+    x0, _ = optimize_op(obj, x)
+
+    # Confirm theta is a direct input to the node
+    assert x0.owner.inputs[1] is theta
+
+    # This should technically raise, but does not right now
+    grad_wrt_theta = pt.grad(x0, theta, disconnected_inputs="raise")
+    np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: 5}), 0)
+
+    # This should work even if the previous one raised
+    grad_wrt_theta = pt.grad(x0, theta, disconnected_inputs="ignore")
+    np.testing.assert_allclose(grad_wrt_theta.eval({x: np.pi, theta: 5}), 0)
+
+
+@pytest.mark.parametrize("optimize_op", (minimize, minimize_scalar, root, root_scalar))
+def test_minimize_grad_disconnected_non_numerical_inp(optimize_op):
+    class StrType(Type):
+        def filter(self, x, **kwargs):
+            if isinstance(x, str):
+                return x
+            raise TypeError
+
+    class SmileOrFrown(Op):
+        def make_node(self, x, str_emoji):
+            return Apply(self, [x, str_emoji], [x.type()])
+
+        def perform(self, node, inputs, output_storage):
+            [x, str_emoji] = inputs
+            match str_emoji:
+                case ":)":
+                    out = np.array(x)
+                case ":(":
+                    out = np.array(-x)
+                case _:
+                    raise ValueError("str_emoji must be a smile or a frown")
+            output_storage[0][0] = out
+
+        def connection_pattern(self, node):
+            # Gradient connected only to first input
+            return [[True], [False]]
+
+        def L_op(self, inputs, outputs, output_gradients):
+            [_x, str_emoji] = inputs
+            [g] = output_gradients
+            return [
+                self(g, str_emoji),
+                disconnected_type(),
+            ]
+
+    # We could try to use real types like NoneTypeT or SliceType, but this is more robust to future API changes
+    str_type = StrType()
+    smile_or_frown = SmileOrFrown()
+
+    x = scalar("x", dtype="float64")
+    num_theta = pt.scalar("num_theta", dtype="float64")
+    str_theta = Variable(str_type, None, None, name="str_theta")
+    obj = (smile_or_frown(x, str_theta) + num_theta) ** 2
+    x_star, _ = optimize_op(obj, x)
+
+    # Confirm thetas are direct inputs to the node
+    assert set(x_star.owner.inputs[1:]) == {num_theta, str_theta}
+
+    # Confirm forward pass works, no point in worrying about gradient otherwise
+    np.testing.assert_allclose(
+        x_star.eval({x: np.pi, num_theta: np.e, str_theta: ":)"}),
+        -np.e,
+    )
+    np.testing.assert_allclose(
+        x_star.eval({x: np.pi, num_theta: np.e, str_theta: ":("}),
+        np.e,
+    )
+
+    with pytest.raises(NullTypeGradError):
+        pt.grad(x_star, str_theta, disconnected_inputs="raise")
+
+    # This could be supported, but it is not right now.
+    with pytest.raises(NullTypeGradError):
+        _grad_wrt_num_theta = pt.grad(x_star, num_theta, disconnected_inputs="raise")
+    # np.testing.assert_allclose(grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":)"}), -1)
+    # np.testing.assert_allclose(grad_wrt_num_theta.eval({x: np.pi, num_theta: np.e, str_theta: ":("}), 1)

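Beyond these regression tests, the ordinary path is untouched by the new guard: when every input to the op is a dense tensor or scalar variable, L_op falls through the guard and computes the gradient as usual, so pt.grad works through the optimizer output. A minimal usage sketch, not part of the commit, assuming only the public minimize_scalar API exercised in the tests above:

import numpy as np
import pytensor.tensor as pt
from pytensor.tensor.optimize import minimize_scalar

x = pt.scalar("x")
theta = pt.scalar("theta")

# x* = argmin_x (x - theta)**2 = theta, so dx*/dtheta = 1
x_star, success = minimize_scalar((x - theta) ** 2, x)
grad_wrt_theta = pt.grad(x_star, theta)
np.testing.assert_allclose(grad_wrt_theta.eval({x: 0.0, theta: 3.0}), 1.0)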