2 changes: 1 addition & 1 deletion README.md
@@ -74,7 +74,7 @@ How much money will we have after 20 years?
... returns = returns * interest + saved_per_year
>>> samples = returns.sample(999, random_state=42)
>>> samples.mean(), samples.std()
(np.float64(76583.58738496085), np.float64(33483.2245611436))
(np.float64(76630.89701703968), np.float64(34507.63482771612))

```
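For orientation, here is a rough plain-NumPy sketch of the compound-savings simulation that this doctest comes from. The distributions chosen below for `interest` and `saved_per_year` are illustrative assumptions, not the README's actual definitions:

```python
# Hypothetical plain-NumPy version of the README's savings example.
# The interest and savings distributions are assumed for illustration.
import numpy as np

rng = np.random.default_rng(42)
n = 999
returns = np.zeros(n)
for _ in range(20):  # simulate 20 years of saving
    interest = rng.normal(loc=1.04, scale=0.01, size=n)  # assumed
    saved_per_year = rng.uniform(2_000, 3_000, size=n)   # assumed
    returns = returns * interest + saved_per_year        # same recurrence as above

print(returns.mean(), returns.std())
```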

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "probabilit"
version = "0.3.0"
version = "0.4.0"
description = "A Python package for Monte Carlo sampling."
requires-python = ">= 3.10"
readme = {file = "README.md", content-type = "text/markdown"}
187 changes: 113 additions & 74 deletions src/probabilit/modeling.py
Expand Up @@ -19,7 +19,7 @@
>>> price = box_volume * price_per_sqm
>>> samples = price.sample(999, random_state=rng)
>>> float(np.mean(samples))
20.00139737515...
20.00876430957...

Distributions are built on top of scipy, so "norm" refers to the name of the
normal distribution as given in `scipy.stats`, and the arguments to the
@@ -84,13 +84,13 @@
Sampling any node is done by calling the `.sample()` method:

>>> expression.sample(5, random_state=rng)
array([ 2.70764145, 36.58578812, 7.07064239, 1.84433247, 3.90951632])
array([ 5.04060084, 5.19163254, 13.09590433, 20.23600678, 4.24147296])

Sampling the expression has the side effect that `.samples_` is populated on
*every* ancestor node in the expression, for instance:

>>> a.samples_
array([4.51589278, 4.37788659, 5.25960812, 5.80609507, 4.33770499])
array([3.9254551 , 6.0788841 , 4.68923246, 3.48160505, 4.26149282])

Here is an even more complex expression, showcasing some mathematical functions:

@@ -99,7 +99,8 @@
>>> c = Distribution("norm", loc=0, scale=3)
>>> expression = a*a - Add(a, b, c) + Abs(b)**Abs(c) + Exp(1 / Abs(c))
>>> expression.sample(5, random_state=rng)
array([ 4.70542018, 14.43250192, 6.74494838, -0.14020459, -3.27334554])
array([ -4.9730559 , 103.74625257, 3.45199868, 14.50692883,
31.07178136])

Nodes are hashable and can be used in sets, so __hash__ and __eq__ must both
be defined. Therefore we cannot use `==` for modeling; equality in that context
@@ -202,7 +203,7 @@
>>> hypercube = LatinHypercube(d=d, rng=rng, optimization="random-cd")
>>> hypercube_samples = hypercube.random(5) # Draw 5 samples
>>> expression.sample_from_quantiles(hypercube_samples)
array([ 7.80785741, 3.72416016, 3.77849849, -3.83561905, 38.02479019])
array([ 8.2746582 , 1.47340322, 1.71799415, 6.21188211, 41.63293414])
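As a minimal standalone sketch of what `sample_from_quantiles` does with those hypercube samples: each column of low-discrepancy quantiles is pushed through one distribution's inverse CDF (`ppf`). The two distributions below are made-up stand-ins, and depending on the SciPy version the seeding argument is `seed` or `rng`:

```python
# Sketch: Latin hypercube quantiles mapped through inverse CDFs.
# The distribution parameters below are illustrative assumptions.
import scipy.stats
from scipy.stats.qmc import LatinHypercube

d = 2  # one column of quantiles per distribution
hypercube = LatinHypercube(d=d, seed=42, optimization="random-cd")
quantiles = hypercube.random(5)  # shape (5, 2), entries in (0, 1)

a_samples = scipy.stats.norm.ppf(quantiles[:, 0], loc=5, scale=1)
b_samples = scipy.stats.expon.ppf(quantiles[:, 1], scale=10)
```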


Garbage collection
@@ -344,13 +345,6 @@ def python_to_prob(argument):
#
# Some further terminology:
# * The _ancestors_ of node "-" are {"+", "2", "mu", "normal"}
# * A node is said to be an _initial sampling node_ iff
# (1) The node is a Distribution
# (2) None of its ancestors are Distributions
# For instance, in the graph above, the node "mu" is an initial sampling node.
# Initial sampling nodes are the nodes that we can impose correlations on.
# We cannot impose correlations on "normal" above, since its correlation
# is determined by the graph structure.
# * result.sample() samples the expression "mu + normal - 2" by propagating
# through the graph, parents first. More specifically, each category of node
# (Constant, Transform and Distribution) has its own internal _sample method
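A minimal sketch of that parents-first traversal, using networkx on the graph above (the node behaviors here are simplified stand-ins for the package's Constant, Distribution and Transform classes):

```python
# Sketch: propagate samples through a DAG in topological order.
import networkx as nx
import numpy as np

rng = np.random.default_rng(0)
G = nx.DiGraph()
G.add_edges_from(
    [("mu", "normal"), ("mu", "+"), ("normal", "+"), ("+", "-"), ("2", "-")]
)

samples = {}
for node in nx.topological_sort(G):  # parents always come before children
    if node == "mu":
        samples[node] = rng.uniform(3, 5, size=5)  # an independent variable
    elif node == "2":
        samples[node] = np.full(5, 2.0)            # a constant
    elif node == "normal":
        samples[node] = rng.normal(loc=samples["mu"], scale=1.0)
    elif node == "+":
        samples[node] = samples["mu"] + samples["normal"]
    elif node == "-":
        samples[node] = samples["+"] - samples["2"]  # the result node
```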
@@ -582,10 +576,9 @@ def sample_from_quantiles(
}
if isinstance(correlator, str):
correlator = correlator.lower().strip()
if correlator not in CORRELATOR_MAP.keys():
raise ValueError(
f"`{correlator=}` must be in {set(CORRELATOR_MAP.keys())}"
)
valid_corrs = set(CORRELATOR_MAP.keys())
if correlator not in valid_corrs:
raise ValueError(f"`{correlator=}` not in {valid_corrs}")
correlator = CORRELATOR_MAP[correlator] # Map to instance

# Prepare columns of quantiles, one column for each Distribution
Expand All @@ -599,25 +592,62 @@ def sample_from_quantiles(
# Set up garbage collection
gc = GarbageCollector(strategy=gc_strategy).set_sink(self)

# Start with initial sampling nodes, which contain independent variables
initial_sampling_nodes = set(
node for node in self.nodes() if node._is_initial_sampling_node()
)
# Ensure consistent ordering for reproducible results
initial_sampling_nodes = sorted(initial_sampling_nodes, key=lambda n: n._id)
# Keep track of all nodes that are sampled and later garbage-collected.
# If we do not keep track of these, they will be sampled twice.
# We skip sampling a node if either (1) samples_ is set or (2)
# the node has previously been sampled and garbage-collected.
garbage_collected = set()

def topo_sample(G, gc, garbage_collected):
"""Sample nodes in a graph G in topological order.

Both the arguments `gc` (garbage collector) and `garbage_collected`
will be updated in place (mutated).
"""

for node in nx.topological_sort(G):
# Skip if samples already exist or the node was sampled previously
if hasattr(node, "samples_") or node in garbage_collected:
continue
elif isinstance(node, Constant):
node.samples_ = node._sample(size=size) # Draw constants
elif isinstance(node, AbstractDistribution):
node.samples_ = node._sample(q=next(columns)) # Sample distr
elif isinstance(node, Transform):
node.samples_ = node._sample() # Propagate through transform
else:
raise TypeError(
"Node must be Constant, AbstractDistribution or Transform."
)

is_numeric = (node.samples_ is not None) and np.issubdtype(
node.samples_.dtype, np.number
)
if is_numeric and not np.all(np.isfinite(node.samples_)):
msg = f"Sampling gave non-finite values: {node}\n{node.samples_}"
raise ValueError(msg)

# Tell the garbage collector that we sampled this node.
# If the reference counter reaches zero (a parent has no unsampled
# children), then the `.samples_` attribute of the parent might
# be deleted (if the garbage collection strategy allows it).
garbage_collected.update(gc.decrement_and_delete(node))
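As a hypothetical aside on the garbage-collection bookkeeping above: the idea is plain reference counting. The real `GarbageCollector` lives elsewhere in probabilit, so the structure below is an assumption for illustration only:

```python
# Sketch of reference-counting GC: a parent's samples can be freed once
# all of its children have been sampled. Names here are hypothetical.
from collections import defaultdict

class RefCountGC:
    def __init__(self, children):  # children: node -> list of child nodes
        self.pending = {node: len(kids) for node, kids in children.items()}
        self.parents = defaultdict(list)
        for node, kids in children.items():
            for kid in kids:
                self.parents[kid].append(node)

    def decrement_and_delete(self, node):
        """Called after `node` is sampled; returns the parents whose
        samples are no longer needed and may now be deleted."""
        freed = []
        for parent in self.parents[node]:
            self.pending[parent] -= 1
            if self.pending[parent] == 0:  # no unsampled children remain
                freed.append(parent)
        return freed

gc_sketch = RefCountGC({"a": ["b", "c"], "b": [], "c": []})
gc_sketch.decrement_and_delete("b")  # -> [], "a" still has unsampled child "c"
gc_sketch.decrement_and_delete("c")  # -> ["a"], samples of "a" may be freed
```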

# If there are no correlations to induce, then we can simply go through
# the graph in topological order and sample it.
# If there are correlations, then we must first sample up until and
# including nodes that are to be correlated, then correlate them, then
# sample the remaining graph. For instance, consider the graph:
# A ----> B ---> [C] ---> D
# |
# v
# E ---> [F] ---> G ----> result
# If nodes C and F are to be correlated, then we sample A -> B -> C,
# followed by E -> F. Once we have samples on nodes C and F, we can
# correlate them by permuting the order of the samples (Iman-Conover).
# Finally we keep sampling D -> G -> result.

# Loop over all initial sampling nodes (ISN) and sample them
G = self.to_graph()
for node in initial_sampling_nodes:
# Sample all ancestors of ISNs
ancestors = G.subgraph(nx.ancestors(G, node))
for ancestor in nx.topological_sort(ancestors):
assert isinstance(ancestor, (Constant, Transform))
ancestor.samples_ = ancestor._sample(size=size)

# Sample the ISN
assert isinstance(node, AbstractDistribution)
node.samples_ = node._sample(q=next(columns))

# Go through all ancestor nodes and create a list [(var, corr), ...]
# that contains all correlations we must induce
Expand All @@ -626,11 +656,35 @@ def sample_from_quantiles(
if hasattr(node, "_correlations"):
correlations.extend(node._correlations)

# Check that each variable to correlate is an initial sampling node
for variables, _ in correlations:
for variable in variables:
if variable not in initial_sampling_nodes:
raise ValueError(f"Cannot correlate variable: {variable}")
variable_sets = [set(variables) for (variables, _) in correlations]
# Collect every variable that appears in some correlation
corr_variables = list(functools.reduce(set.union, variable_sets, set()))
# Ensure consistent ordering for reproducible results
corr_variables = sorted(corr_variables, key=lambda n: n._id)

# Check that the set of variables that the user wants to correlate
# is allowed. The condition is that the ancestor sets (each variable
# together with its ancestors) must be pairwise disjoint. For instance,
# in the graph A -> B we cannot correlate A and B. In the graph
# A <- B -> C we cannot correlate A and C. In general we can only
# correlate nodes whose correlation is not potentially determined
# already by the graph structure.
seen = set()
for variable in corr_variables:
var_plus_ancestors = set(variable.nodes())

# If the variable, or any ancestor, has already been seen
if seen.intersection(var_plus_ancestors):
msg = f"Cannot correlate node: {variable}\n"
msg += "This variable is an ancestor of more than one variables\n"
msg += "that you wish to correlate. But this relationship can\n"
msg += "potentially already induce a correlation.\n"
msg += (
"For instance, in the graph A -> B you cannot correlate A and B.\n"
)
msg += "In the graph A <- B -> C you cannot correlate A and C."
raise ValueError(msg)
else:
seen.update(var_plus_ancestors)

# Check that no correlation has been specified twice
variable_sets = [set(variables) for (variables, _) in correlations]
@@ -639,11 +693,18 @@
if len(common) > 1:
raise ValueError(f"Correlations specified more than once: {common}")

# Map all variables to integers to associate them with a column
all_variables = list(functools.reduce(set.union, variable_sets, set()))
# Ensure consistent ordering for reproducible results
all_variables = sorted(all_variables, key=lambda n: n._id)
var_to_int = {v: i for (i, v) in enumerate(all_variables)}
# Sample up until nodes that we induce correlations on. In this graph:
# A ----> B ---> [C] ---> D
# |
# v
# E ---> [F] ---> G ----> result
# this means sampling up until and including C and F.
for variable in corr_variables:
ancestors = G.subgraph(nx.ancestors(G, variable).union({variable}))
topo_sample(ancestors, gc=gc, garbage_collected=garbage_collected)

# Map each correlated variable to a column index
var_to_int = {v: i for (i, v) in enumerate(corr_variables)}
correlations = [
(tuple(var_to_int[var] for var in variables), corrmat)
for (variables, corrmat) in correlations
@@ -659,40 +720,18 @@
correlator = correlator.set_target(correlation_matrix)

# Concatenate samples, correlate them (permute rows within each column), then re-assign
samples_input = np.vstack([var.samples_ for var in all_variables]).T
samples_input = np.vstack([var.samples_ for var in corr_variables]).T
samples_output = correlator(samples_input)
for var, sample in zip(all_variables, samples_output.T):
for var, sample in zip(corr_variables, samples_output.T):
var.samples_ = np.copy(sample)

# Iterate sampling though the graph
for node in nx.topological_sort(G):
if hasattr(node, "samples_"): # Skip if samples already exists
pass
elif isinstance(node, Constant):
node.samples_ = node._sample(size=size) # Draw constants
elif isinstance(node, AbstractDistribution):
node.samples_ = node._sample(q=next(columns)) # Sample distr
elif isinstance(node, Transform):
node.samples_ = node._sample() # Propagate through transform
else:
raise TypeError(
"Node must be Constant, AbstractDistribution or Transform."
)

is_numeric = (node.samples_ is not None) and np.issubdtype(
node.samples_.dtype, np.number
)
if is_numeric and not np.all(np.isfinite(node.samples_)):
raise ValueError(
f"Sampling this node gave non-finite values: {node}\n{node.samples_}"
)

# Tell the garbage collector that we sampled this node.
# If the reference counter reaches zero (a parent has no unsampled
# children), then the `.samples_` attribute of the parent might
# be deleted (only if the garbage collection strategy allows it).
gc.decrement_and_delete(node)

# Sample all the way to the end. In this graph:
# A ----> B ---> [C] ---> D
# |
# v
# E ---> [F] ---> G ----> result
# this would mean sampling D, then G, then result.
topo_sample(self.to_graph(), gc=gc, garbage_collected=garbage_collected)
return self.samples_
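The correlator applied above reorders existing samples instead of redrawing them. Here is a standalone sketch of the Iman-Conover idea, rank-reordering each column to match correlated scores so that a target rank correlation is induced without changing any marginal distribution. This is the generic textbook construction, not probabilit's own implementation:

```python
# Sketch of Iman-Conover rank reordering (generic construction).
import numpy as np
import scipy.stats

def iman_conover(X, target_corr, rng):
    n, d = X.shape
    # Correlated "scores": i.i.d. normal draws times a Cholesky factor
    scores = rng.standard_normal((n, d)) @ np.linalg.cholesky(target_corr).T
    result = np.empty_like(X)
    for j in range(d):
        # Rank of each score decides which order statistic it receives
        order = scipy.stats.rankdata(scores[:, j]).astype(int) - 1
        result[:, j] = np.sort(X[:, j])[order]  # permute, marginals unchanged
    return result

rng = np.random.default_rng(0)
X = np.column_stack([rng.exponential(size=1000), rng.uniform(size=1000)])
Xc = iman_conover(X, np.array([[1.0, 0.7], [0.7, 1.0]]), rng)
# Spearman correlation of Xc is now close to 0.7
print(scipy.stats.spearmanr(Xc).statistic)
```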

def _is_initial_sampling_node(self):