// Module for training and using a score-based diffusion model
// Added by Yongyi Wu
// Mar. 2026

#pragma once

#include <vector>
#include <algorithm>
#include <cmath>
#include <string>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cctype>
#include <cassert>

#include "CLHEP/Random/RandomEngine.h"
#include "CLHEP/Random/RandFlat.h"
#include "CLHEP/Random/RandGaussQ.h"

#include "messagefacility/MessageLogger/MessageLogger.h"
#include "cetlib_except/exception.h"
namespace mu2e {
  struct DiffusionTrainingSample {
    std::vector<double> x;    // transformed state vector to diffuse (size = dim)
    std::vector<double> cond; // optional conditioning vector (size = conditionDim)
  };

  class ScoreBasedDiffusionModel {
  public:
    // Enumeration for optimizer selection
    enum class OptimizerType {
      SGD,  // Stochastic Gradient Descent
      ADAM  // Adam optimizer
    };

    // Enumeration for noise schedule selection
    enum class NoiseScheduleType {
      LINEAR, // Linear noise schedule
      COSINE  // Cosine noise schedule
    };

    // Constructor: Initialize the diffusion model with CLHEP random distributions.
    //
    // Parameters:
    //   randFlat     - Reference to CLHEP RandFlat for uniform sampling (externally managed)
    //   randGaussQ   - Reference to CLHEP RandGaussQ for Gaussian noise (externally managed)
    //   dim          - Dimensionality of the state space
    //   conditionDim - Dimensionality of the optional conditioning vector (pass 0 for an unconditional model)
    //   hidden       - Size of the hidden layers in the neural network
    //   layers       - Number of layers in the network
    //   optimizerType - Type of optimizer to use (SGD or ADAM, default: ADAM)
    //   adamBeta1    - Adam optimizer beta1 parameter (default: 0.9)
    //   adamBeta2    - Adam optimizer beta2 parameter (default: 0.999)
    //   adamEps      - Adam optimizer epsilon parameter (default: 1e-8)
    //   scheduleType - Type of noise schedule (LINEAR or COSINE, default: COSINE)
    //   betaMin      - Minimum noise schedule parameter (for the LINEAR schedule, default: 1e-4)
    //   betaMax      - Maximum noise schedule parameter (for the LINEAR schedule, default: 0.02)
    //   cosineOffset - Offset parameter (for the COSINE schedule, default: 0.008)
    //   batchSize    - Batch size for training (default: 32)
    //   gradientClipThreshold - Threshold for gradient clipping (default: 1.0)
    //   learningRate - Learning rate for training (default: 1e-3)
    //   diffusionSteps - Number of steps in the diffusion process (default: 200)
    ScoreBasedDiffusionModel(
      // Network architecture parameters
      CLHEP::RandFlat& randFlat,
      CLHEP::RandGaussQ& randGaussQ,
      int dim,
      int conditionDim,
      int hidden,
      int layers,
      // Optimizer configuration
      OptimizerType optimizerType = OptimizerType::ADAM,
      double adamBeta1 = 0.9,
      double adamBeta2 = 0.999,
      double adamEps = 1e-8,
      // Noise schedule configuration
      NoiseScheduleType scheduleType = NoiseScheduleType::COSINE,
      double betaMin = 1e-4,
      double betaMax = 0.02,
      double cosineOffset = 0.008,
      // Training configuration
      int batchSize = 32,
      double gradientClipThreshold = 1.0,
      double learningRate = 1e-3,
      // Diffusion process configuration
      int diffusionSteps = 200
    );
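
    // Example usage (an illustrative sketch only: engine choice, seeding, and
    // lifetime management are framework-specific, and HepJamesRandom is just one
    // possible CLHEP engine):
    //   CLHEP::HepJamesRandom engine(12345);
    //   CLHEP::RandFlat flat(engine);
    //   CLHEP::RandGaussQ gauss(engine);
    //   mu2e::ScoreBasedDiffusionModel model(flat, gauss,
    //       /*dim=*/3, /*conditionDim=*/0, /*hidden=*/64, /*layers=*/4);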

    // Train the score network on a set of samples.
    // Uses random sampling and noise injection via the external engine.
    // Note that training must be run on the full data set: training on multiple
    // small subsets of the data and then averaging or aggregating the resulting
    // model parameters is not supported, since neural networks are nonlinear in
    // their parameters and averaged weights do not correspond to an averaged model.
    //
    // Parameters:
    //   data   - Training samples (transformed state vectors)
    //   epochs - Number of training epochs to perform
    void train(
      const std::vector<DiffusionTrainingSample>& data,
      int epochs
    );
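
    // Example usage (a minimal sketch; `transformedStates` is a hypothetical
    // container of pre-transformed state vectors, continuing the constructor
    // example above):
    //   std::vector<mu2e::DiffusionTrainingSample> data;
    //   for (auto const& state : transformedStates) {
    //     data.push_back({state, {}}); // unconditional model: empty cond vector
    //   }
    //   model.train(data, /*epochs=*/100);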

    // Generate a new sample from the diffusion model via the reverse process.
    // Uses the external random engine for noise generation during sampling.
    //
    // Parameters:
    //   condition      - Optional conditioning vector (must match conditionDim_ when conditioning is enabled)
    //   useHeun        - If true, use Heun's method (2nd order, default); if false, use Euler's method (1st order)
    //   diffusionSteps - Number of diffusion steps for sampling (default: -1, which uses the model's configured diffusionSteps_)
    //
    // Returns: A generated sample vector of dimension dim_
    std::vector<double> generateSample(
      const std::vector<double>& condition = {},
      bool useHeun = true,
      int diffusionSteps = -1
    );
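
    // Example usage (sketch, continuing the example above):
    //   auto sample = model.generateSample();              // Heun, configured step count
    //   auto coarse = model.generateSample({}, false, 50); // Euler with 50 steps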

    // Save the model parameters, with annotations, to a CSV file for later use.
    //
    // Parameters:
    //   filename - Path to the CSV file where model parameters will be saved (default: "DiffusionModel.csv")
    void saveModel(const std::string& filename = "DiffusionModel.csv");

    // Load model parameters from a file to restore a previously trained model.
    // Note that because the Adam optimizer state is not saved, training cannot be
    // resumed where it left off from a loaded model; a loaded model can only be
    // used for sampling.
    //
    // Parameters:
    //   randFlat, randGaussQ - CLHEP random distribution wrappers to bind to the restored model (externally managed)
    //   filename             - Path to the file from which model parameters will be loaded
    static ScoreBasedDiffusionModel loadModel(
      CLHEP::RandFlat& randFlat,
      CLHEP::RandGaussQ& randGaussQ,
      const std::string& filename
    );
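
    // Example round trip (sketch; per the note above, the restored model can be
    // used for sampling only):
    //   model.saveModel("DiffusionModel.csv");
    //   auto restored = mu2e::ScoreBasedDiffusionModel::loadModel(flat, gauss, "DiffusionModel.csv");
    //   auto generated = restored.generateSample();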

  private:

    // ----- network -----

    // Represents a single fully-connected layer with weights and biases.
    struct Layer {
      std::vector<std::vector<double>> W; // Weight matrix [output_size][input_size]
      std::vector<double> b;              // Bias vector [output_size]

      // Storage for gradients during back-propagation
      std::vector<std::vector<double>> gradW; // Gradient of the loss w.r.t. weights
      std::vector<double> gradb;              // Gradient of the loss w.r.t. biases

      // Adam optimizer state buffers
      std::vector<std::vector<double>> mW; // First moment estimates for weights, m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
      std::vector<std::vector<double>> vW; // Second moment estimates for weights, v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2

      std::vector<double> mb; // First moment estimates for biases
      std::vector<double> vb; // Second moment estimates for biases
    };

    std::vector<Layer> network_; // Network layers in forward order

    // Storage for activations and pre-activations during the forward pass (used in back-propagation).
    // During the forward pass, preactivations_[l] = W[l] * activations_[l-1] + b[l],
    // and activations_[l] = activationFunction(preactivations_[l]).
    // These are needed to compute gradients during the backward pass.
    std::vector<std::vector<double>> activations_;
    std::vector<std::vector<double>> preactivations_;

    // Forward pass through the network, evaluating the score function.
    // The input contains the state vector, the optional condition vector, and the
    // time embedding, so its dimension is dim_ + conditionDim_ + 1.
    //
    // Parameters:
    //   x - Input vector of dimension dim_ + conditionDim_ + 1
    //
    // Returns: Predicted score vector of dimension dim_
    std::vector<double> forward(
      const std::vector<double>& x
    );
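
    // For instance, with dim_ = 3 and conditionDim_ = 2, the input would be
    // assembled (assuming the layout described above: state first, condition
    // second, time embedding last) as
    //   { x[0], x[1], x[2], cond[0], cond[1], t }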

    // Backward pass for gradient computation.
    // Computes gradients w.r.t. the network parameters via the chain rule and
    // stores them in gradW and gradb for each layer.
    //
    // Parameters:
    //   gradOutput - Gradient of the loss w.r.t. the network output
    void backward(
      const std::vector<double>& gradOutput
    );

    // Update network weights using the computed gradients (stochastic gradient
    // descent, SGD). Applied after the backward pass. Alternatively, the Adam
    // step in adamUpdate() can be used for better convergence.
    //
    // Parameters:
    //   lr - Learning rate for the gradient descent step
    void updateWeights(double lr);

    // Update network weights using the Adam optimization algorithm.
    // Implements the Adam update rule using the stored gradients and moment estimates.
    //
    // Parameters:
    //   lr - Learning rate for the Adam update
    void adamUpdate(double lr);
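
    // For reference, the standard bias-corrected Adam rule for a parameter w with
    // gradient g (assuming the conventional formulation, with t = adamStep_ and
    // the moment buffers m, v defined in Layer above):
    //   m <- adamBeta1_ * m + (1 - adamBeta1_) * g
    //   v <- adamBeta2_ * v + (1 - adamBeta2_) * g^2
    //   mhat = m / (1 - adamBeta1_^t),  vhat = v / (1 - adamBeta2_^t)
    //   w <- w - lr * mhat / (sqrt(vhat) + adamEps_)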

    // ----- diffusion -----

    // Noise schedule parameter beta(t) over diffusion time [0,1].
    // For the LINEAR schedule this interpolates between betaMin_ and betaMax_;
    // for the COSINE schedule it is derived from the cosine alpha-bar curve
    // (see the formulas after this declaration).
    //
    // Parameters:
    //   t - Diffusion time parameter in [0,1]
    //
    // Returns: Beta value for the given time
    double beta(double t) const;
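
    // For reference, the two schedules in closed form (the linear form restates
    // the parameterization documented with betaMin_/betaMax_ below; the cosine
    // form assumes the standard Nichol & Dhariwal parameterization):
    //   LINEAR: beta(t) = betaMin_ + t * (betaMax_ - betaMin_)
    //   COSINE: alphaBar(t) = cos^2( ((t + s)/(1 + s)) * pi/2 ) / cos^2( (s/(1 + s)) * pi/2 ),
    //           with s = cosineOffset_ and beta(t) obtained from the decay rate of alphaBar(t).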

    // Standard deviation of the noise at diffusion time t.
    // Related to the noise schedule via sigma(t) = sqrt(1 - exp(-integral_0^t beta(s) ds)).
    //
    // Parameters:
    //   t - Diffusion time parameter in [0,1]
    //
    // Returns: Noise standard deviation at time t
    double sigma(double t) const;
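
    // For the LINEAR schedule the integral has a closed form, so (under the
    // definition above) sigma reduces to:
    //   integral_0^t beta(s) ds = betaMin_ * t + 0.5 * (betaMax_ - betaMin_) * t^2
    //   sigma(t) = sqrt(1 - exp(-(betaMin_ * t + 0.5 * (betaMax_ - betaMin_) * t^2)))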

    // Add Gaussian noise to a state vector at diffusion time t.
    // Uses the external engine (randGaussQ_) for reproducible noise generation.
    // The noise sample is stored in eps for later use in training.
    //
    // Parameters:
    //   x   - Original state vector
    //   t   - Diffusion time parameter in [0,1]
    //   eps - Output: Gaussian noise vector used for the perturbation (size = dim_)
    //
    // Returns: Noisy state = x + sigma(t) * eps
    std::vector<double> addNoise(
      const std::vector<double>& x,
      double t,
      std::vector<double>& eps
    );

    // ----- loss -----

    // Compute the mean squared error between the predicted score and the target score.
    // Used during training to measure model performance.
    //
    // Parameters:
    //   score  - Model's predicted score vector
    //   target - Ground-truth target score vector
    //
    // Returns: MSE loss value (scalar)
    double computeLoss(
      const std::vector<double>& score,
      const std::vector<double>& target
    ) const;
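
    // Concretely (assuming the conventional mean over components rather than a sum):
    //   L = (1/dim_) * sum_i (score[i] - target[i])^2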

    // Clip gradients to prevent exploding gradients during training.
    //
    // Parameters:
    //   maxNorm - Maximum allowed norm for the gradients. If the total norm exceeds
    //             this threshold, gradients are scaled down to have norm equal to maxNorm.
    void clipGradients(double maxNorm);
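
    // The rule applied here is global-norm clipping over all gradW/gradb entries g:
    //   norm = sqrt( sum_g g^2 )
    //   if norm > maxNorm, scale every gradient by maxNorm / norm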

    // ----- internal vars -----

    // The underlying random engine is NOT owned by this class; it is injected
    // externally by the framework. The members below are CLHEP distribution
    // wrappers that wrap that external engine and provide specific probability
    // distributions:
    //   - RandFlat:   uniform distribution on [0,1)
    //   - RandGaussQ: Gaussian (normal) distribution with mean=0, sigma=1 (or custom)
    // Both references are bound in the constructor to externally managed wrappers.
    CLHEP::RandFlat& randFlat_;     // Used for uniform sampling (e.g., batch selection)
    CLHEP::RandGaussQ& randGaussQ_; // Used for Gaussian noise in the diffusion process

    // Model hyperparameters
    int dim_;          // Dimensionality of the state space
    int conditionDim_; // Dimensionality of the optional conditioning vector
    int hidden_;       // Hidden layer size
    int layers_;       // Number of network layers

    // Optimizer configuration
    OptimizerType optimizerType_; // Type of optimizer to use (SGD or ADAM)

    // Adam optimizer parameters
    double adamBeta1_; // First moment exponential decay rate (default: 0.9)
    double adamBeta2_; // Second moment exponential decay rate (default: 0.999)
    double adamEps_;   // Small constant for numerical stability (default: 1e-8)

    // Noise schedule configuration
    NoiseScheduleType noiseScheduleType_; // Type of noise schedule (LINEAR or COSINE)

    // Linear noise schedule parameters (beta(t) = betaMin + t * (betaMax - betaMin)).
    // The defaults betaMin = 1e-4 and betaMax = 0.02 are typical values used in
    // diffusion models, but can be tuned for specific applications.
    double betaMin_; // Beta value at t=0
    double betaMax_; // Beta value at t=1

    // Cosine noise schedule parameter (offset to avoid the singularity at t=0 in the cosine schedule, default: 0.008)
    double cosineOffset_;

    // Training configuration
    int batchSize_;                // Batch size for vectorized training (default: 32)
    double gradientClipThreshold_; // Gradient clipping threshold (default: 1.0)
    double learningRate_;          // Learning rate for training (default: 1e-3)

    // Diffusion process discretization
    int diffusionSteps_; // Number of time steps used to generate a sample (default: 200)

    // Training state
    double runningLoss_;     // Accumulated loss for monitoring during training
    int adamStep_;           // Step counter for Adam (used to compute bias-corrected moment estimates)
    int trainingSampleSize_; // Total number of training samples

    // Container for tracking training loss over epochs
    std::vector<double> epochLosses_;

  };
} // namespace mu2e