Merged
16 changes: 16 additions & 0 deletions .claude/settings.json
@@ -0,0 +1,16 @@
{
"hooks": {
"PostToolUse": [
{
"matcher": "Write|Edit",
"hooks": [
{
"type": "command",
"command": "jq -r '.tool_input.file_path' | { read -r f; echo \"$f\" | grep -q '\\.py$' && ruff check --fix \"$f\" && ruff format \"$f\"; } 2>/dev/null || true",
"statusMessage": "Running ruff fix + format..."
}
]
}
]
}
}
23 changes: 13 additions & 10 deletions docs/index.md
@@ -9,7 +9,7 @@
<a href="https://www.flaticon.com/free-icons/runner" title="runner icons">Runner icons created by Leremy - Flaticon</a>
</span>

Transform any Python function into a portable, trackable pipeline in seconds.
Transform Python functions into portable, trackable pipelines.

<hr style="border:2px dotted orange">

@@ -47,7 +47,10 @@ def analyze_sales():
from runnable import PythonJob

def main():
job = PythonJob(function=analyze_sales)
job = PythonJob(
function=analyze_sales,
# returns=["total_revenue", "best_product"] # Optional: track return values
)
job.execute()
return job # REQUIRED: Always return the job object

@@ -63,7 +66,7 @@ You just made your first function runnable and got:
- ✅ **Reproducible runs**: full execution history and metadata
- ✅ **Environment portability**: runs the same on laptop, containers, Kubernetes

**Your code now runs anywhere without changes!**
**Your code runs consistently across different environments!**

---

@@ -81,18 +84,18 @@ def forecast_growth(revenue, growth_rate):
from runnable import PythonJob

def main():
job = PythonJob(function=forecast_growth)
job = PythonJob(
function=forecast_growth,
# returns=["forecast"] # Optional: track return values
)
job.execute()
return job # REQUIRED: Always return the job object

if __name__ == "__main__":
main()

# Run different scenarios anywhere:
# Local: RUNNABLE_PRM_revenue=100000 RUNNABLE_PRM_growth_rate=0.05 python forecast.py
# Container: same command, same results
# Kubernetes: same command, same results

# RUNNABLE_PRM_revenue=100000 RUNNABLE_PRM_growth_rate=0.05 python forecast.py
# ✨ Every run tracked with parameters - reproducible everywhere
```
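
To make the env-var mechanism concrete, here is a minimal sketch of how `RUNNABLE_PRM_`-prefixed variables could be collected into a parameter dict. This is illustrative only: the helper name `collect_env_params` is made up for this example, and runnable's real implementation also handles type coercion and other parameter sources.

```python
import os

def collect_env_params(environ=None):
    """Gather RUNNABLE_PRM_* variables into a plain dict (illustrative sketch)."""
    environ = dict(os.environ if environ is None else environ)
    prefix = "RUNNABLE_PRM_"
    # Strip the prefix; everything else in the environment is ignored
    return {
        key[len(prefix):]: value
        for key, value in environ.items()
        if key.startswith(prefix)
    }

params = collect_env_params({
    "RUNNABLE_PRM_revenue": "100000",
    "RUNNABLE_PRM_growth_rate": "0.05",
    "PATH": "/usr/bin",  # unrelated variables are filtered out
})
print(params)  # {'revenue': '100000', 'growth_rate': '0.05'}
```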

@@ -130,8 +133,8 @@ from runnable import Pipeline, PythonTask

def main():
pipeline = Pipeline(steps=[
PythonTask(function=load_customer_data, returns=["customer_data"]),
PythonTask(function=analyze_segments, returns=["analysis"])
PythonTask(name="load_data", function=load_customer_data, returns=["customer_data"]),
PythonTask(name="analyze", function=analyze_segments, returns=["analysis"])
])
pipeline.execute()
return pipeline # REQUIRED: Always return the pipeline object
20 changes: 14 additions & 6 deletions docs/tutorial/02-making-it-reproducible.md
@@ -7,13 +7,21 @@ Now let's solve the first major problem: lack of execution tracking. We'll trans
Instead of calling our function directly, we'll wrap it with Runnable's `PythonJob`:

```python title="examples/tutorials/getting-started/02_making_it_reproducible.py"
from runnable import PythonJob
from runnable import Catalog, PythonJob
from functions import train_ml_model_basic

def main():
# Define a Catalog to specify what files to save from the run
catalog = Catalog(put=["model.pkl", "results.json"])

# Same function, now wrapped as a Job
job = PythonJob(function=train_ml_model_basic)
job = PythonJob(
function=train_ml_model_basic,
returns=["results"],
catalog=catalog,
)
job.execute()

return job

if __name__ == "__main__":
@@ -42,22 +50,22 @@ ls .run_log_store/
Each run directory contains:

- **Execution metadata**: when it ran, how long it took
- **Environment info**: Python version, package versions
- **Results**: function return values
- **Status**: success/failure with any error details

### ♻️ **Result Preservation**

Unlike the basic version that overwrote `model.pkl` and `results.json`, each Runnable execution gets its own directory. Your results are never lost.
Unlike the basic version that overwrote `model.pkl` and `results.json`, each Runnable execution gets its own directory.
Cataloged files are stored under `.catalog/`, in a directory named with the same `run_id`.

### 🔍 **Full Reproducibility**

Each run captures everything needed to reproduce it:

- Exact timestamp
- Code version (if using git)
- Environment details
- Input parameters (we'll add those next!)
- Output from function calls.

### 🎯 **Zero Code Changes**

@@ -73,7 +81,7 @@ uv run examples/tutorials/getting-started/02_making_it_reproducible.py
uv run examples/tutorials/getting-started/02_making_it_reproducible.py
```

Each run creates a separate log entry in `.run_log_store/`. You now have a complete history of all your experiments!
Each run creates a separate entry in `.run_log_store/` and `.catalog/`. You now have a complete history of all your experiments!

## Compare: Before vs After

15 changes: 8 additions & 7 deletions docs/tutorial/03-adding-flexibility.md
@@ -81,19 +81,20 @@ Run different experiments:

```bash
# Basic experiment
uv run examples/tutorials/getting-started/03_adding_flexibility.py --parameters-file experiment_configs/basic.yaml
RUNNABLE_PARAMETERS_FILE="experiment_configs/basic.yaml" uv run examples/tutorials/getting-started/03_adding_flexibility.py

# Large forest experiment
uv run examples/tutorials/getting-started/03_adding_flexibility.py --parameters-file experiment_configs/large_forest.yaml
RUNNABLE_PARAMETERS_FILE="experiment_configs/large_forest.yaml" uv run examples/tutorials/getting-started/03_adding_flexibility.py
```

## Parameter Precedence

Runnable handles parameter conflicts intelligently:

1. **Environment variables** (highest priority): `RUNNABLE_PRM_n_estimators=300`
2. **Command line config**: `--parameters-file config.yaml`
3. **Function defaults** (lowest priority): What you defined in the function signature
1. **Individual overrides** (highest priority): `RUNNABLE_PRM_n_estimators=300`
2. **Environment file**: `RUNNABLE_PARAMETERS_FILE="config.yaml"`
3. **Code-specified**: `job.execute(parameters_file="config.yaml")`
4. **Function defaults** (lowest priority): What you defined in the function signature

This means you can have a base configuration file but override specific values with environment variables.
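The precedence order above can be sketched as a layered merge, where later layers override earlier ones. The helper name `resolve_parameters` is invented for this illustration and is not runnable's actual implementation:

```python
def resolve_parameters(function_defaults, code_specified, env_file, env_overrides):
    """Merge parameter layers; later layers win (illustrative sketch)."""
    merged = {}
    for layer in (function_defaults, code_specified, env_file, env_overrides):
        merged.update(layer)
    return merged

final = resolve_parameters(
    function_defaults={"n_estimators": 100, "max_depth": 5},
    code_specified={"n_estimators": 150},          # job.execute(parameters_file=...)
    env_file={"n_estimators": 200, "max_depth": 10},  # RUNNABLE_PARAMETERS_FILE
    env_overrides={"n_estimators": 300},           # RUNNABLE_PRM_n_estimators=300
)
print(final)  # {'n_estimators': 300, 'max_depth': 10}
```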

@@ -110,7 +111,7 @@ This means you can have a base configuration file but override specific values w
Every run gets logged with the exact parameters used:

```bash
ls .runnable/run-log-store/
ls .run_log_store/
# Each timestamped directory contains the parameters for that run
```

@@ -138,7 +139,7 @@ uv run 03_adding_flexibility.py
RUNNABLE_PRM_n_estimators=200 uv run 03_adding_flexibility.py

# Experiment 3: From config file
uv run 03_adding_flexibility.py --parameters-file experiment_configs/large_forest.yaml
RUNNABLE_PARAMETERS_FILE="experiment_configs/large_forest.yaml" uv run 03_adding_flexibility.py

# Check the logs - each run preserved with its parameters
ls .run_log_store/
63 changes: 53 additions & 10 deletions docs/tutorial/04-connecting-workflow.md
@@ -65,16 +65,16 @@ def main():
uv run examples/tutorials/getting-started/04_connecting_workflow.py
```

## How Data Flows Automatically
## How Data Flow Works Through Run Log

Notice something magical: we didn't write any glue code! Runnable automatically connects the steps:
Runnable connects pipeline steps through **run log parameter management**:

1. **`load_data()`** returns a DataFrame
2. **`preprocess_data(df)`** - gets the DataFrame automatically (parameter name matches!)
3. **`train_model(preprocessed_data)`** - gets preprocessing results automatically
4. **`evaluate_model(model_data, preprocessed_data)`** - gets both model and data automatically
1. **`returns=[pickled("df")]`** → Run log stores parameter "df" (binary data in catalog)
2. **`preprocess_data(df, ...)`** → Run log provides "df" parameter (fetches from catalog if pickled)
3. **`train_model(preprocessed_data)`** → Run log provides "preprocessed_data" parameter
4. **`evaluate_model(model_data, preprocessed_data)`** → Run log provides both parameters

**The secret:** Parameter names in your functions determine data flow. If `train_model()` expects a parameter called `preprocessed_data`, and a previous step returns something called `preprocessed_data`, they get connected automatically.
**The pattern:** Function parameter names must match the names in previous tasks' `returns` declarations because that's how the run log maps parameters.
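Name-based wiring can be sketched with Python's `inspect` module. This is a simplified illustration of the idea, not runnable's actual code; the real run log also tracks types and fetches pickled artifacts from the catalog:

```python
import inspect

def wire_step(func, run_log_params):
    """Bind run-log parameters to a function by matching names (illustrative)."""
    wanted = inspect.signature(func).parameters
    # Only parameters whose names appear in the run log get passed through
    kwargs = {name: run_log_params[name] for name in wanted if name in run_log_params}
    return func(**kwargs)

def preprocess_data(df, test_size=0.2):
    return f"preprocessed {df} with test_size={test_size}"

result = wire_step(preprocess_data, {"df": "raw-data", "unrelated": 42})
print(result)  # preprocessed raw-data with test_size=0.2
```

Because binding is purely by name, a mismatch (e.g. the run log holding `dataframe` while the function expects `df`) simply leaves the parameter unfilled, which is why the naming rule below matters.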

## What You Get with Pipelines

@@ -94,8 +94,11 @@ evaluate: ✅ Completed in 0.2s
Each step's output is saved. You can inspect intermediate results without rerunning expensive steps:

```bash
# Check what the preprocessing step produced
ls .runnable/
# Check run log parameter tracking
ls .run_log_store/

# Check pickled data storage
ls .catalog/
```

### 🛠️ **Better Debugging**
@@ -106,6 +109,46 @@ If training fails, you don't lose your preprocessing work. You can debug just th

See timing and resource usage for each step, helping identify bottlenecks.

## 🔗 Understanding Parameter Naming

For data flow to work correctly, follow this naming pattern:

```python
# Step 1: Function returns something, run log tracks as "df"
PythonTask(function=load_data, returns=[pickled("df")])

# Step 2: Function parameter "df" matches run log parameter "df"
def preprocess_data(df, test_size=0.2):  # Gets "df" from run log
    preprocessed_data = ...
    return preprocessed_data

# Step 3: Save as "preprocessed_data" in run log
PythonTask(function=preprocess_data, returns=[pickled("preprocessed_data")])

# Step 4: Parameter names match run log parameter names
def train_model(preprocessed_data, n_estimators=100):  # Gets from run log
    ...

def evaluate_model(model_data, preprocessed_data):  # Gets both from run log
    ...
```

**Key Rule:** Parameter names in later functions must exactly match the names in earlier `returns` declarations.

## 🚨 Common Parameter Issues

**Problem**: Parameter name doesn't match returns name
```python
# Won't work - name mismatch!
PythonTask(function=load_data, returns=[pickled("dataframe")])
def preprocess_data(df, test_size=0.2):  # Run log has "dataframe", expects "df"
    ...
```

**Solution**: Make parameter names match returns names
```python
# Works - run log has "df", function expects "df"
PythonTask(function=load_data, returns=[pickled("df")])
def preprocess_data(df, test_size=0.2):  # Gets "df" from run log
    ...
```

**Debug Tip**: Check run log files in `.run_log_store/` to see actual parameter names stored.

## Advanced: Parameters in Pipelines

You can still use parameters, but now at the step level:
@@ -140,7 +183,7 @@ Parameters get passed to the appropriate functions based on their parameter name
- ✅ Intermediate results preserved
- ✅ Resume from failed steps
- ✅ Better debugging and development
- ✅ Automatic data flow between steps
- ✅ Parameter-based data flow between steps

## Your Functions Didn't Change

21 changes: 14 additions & 7 deletions docs/tutorial/05-handling-datasets.md
@@ -20,11 +20,11 @@ PythonTask(
- Can't easily inspect intermediate results
- Memory pressure on your system

## The Solution: Catalog for File Storage
## The Solution: Advanced Catalog Usage

Instead of passing data through memory, save it to files and let Runnable manage them:
In Chapter 2, we used `Catalog(put=["model.pkl", "results.json"])` to save final results. Now let's expand this for large datasets: instead of passing data through memory, we'll use the catalog's `get=[]` and `put=[]` capabilities to manage intermediate files between pipeline steps:

```python title="examples/tutorials/getting-started/05_handling_datasets.py" hl_lines="11 19-20"
```python title="examples/tutorials/getting-started/05_handling_datasets.py"
from runnable import Pipeline, PythonTask, Catalog, pickled

def load_data_to_file(data_path="data.csv"):
@@ -48,7 +48,14 @@ PythonTask(
uv run examples/tutorials/getting-started/05_handling_datasets.py
```

## How Catalog Works
## How Catalog Works for Large Datasets

**Progression from Chapter 2:**

- **Chapter 2**: `Catalog(put=["model.pkl"])` - Save final results only
- **Chapter 5**: `Catalog(get=["data.csv"], put=["processed.csv"])` - Manage intermediate datasets between steps

The key difference: **`get=[]`** parameter lets tasks retrieve files created by previous tasks.
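
The put/get behavior can be sketched as file copies between a working directory and a shared catalog directory. The helper names `catalog_put` and `catalog_get` are hypothetical and greatly simplified; runnable's real catalog also versions files by `run_id`:

```python
import shutil
from pathlib import Path

def catalog_put(work_dir, catalog_dir, names):
    """Copy files a task produced into the catalog (illustrative sketch)."""
    Path(catalog_dir).mkdir(parents=True, exist_ok=True)
    for name in names:
        shutil.copy(Path(work_dir) / name, Path(catalog_dir) / name)

def catalog_get(catalog_dir, work_dir, names):
    """Copy previously cataloged files into the next task's working dir."""
    for name in names:
        shutil.copy(Path(catalog_dir) / name, Path(work_dir) / name)
```

Step 1 effectively does a `put`, and step 2 a `get` of the same file name, so the file appears where the second task's code expects it.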

### Step 1: Create and Store Files

@@ -102,7 +109,7 @@ PythonTask(
Here's the full pipeline using file storage for large data:

```python title="examples/tutorials/getting-started/05_handling_datasets.py"
--8<-- "examples/tutorials/getting-started/05_handling_datasets.py:62:119"
--8<-- "examples/tutorials/getting-started/05_handling_datasets.py:86:125"
```

## What You Get with File-Based Storage
@@ -121,7 +128,7 @@ X_train = pd.read_csv("X_train.csv")  # Maybe 50GB

Runnable handles file locations transparently:

- **`put=["file.parquet"]`** - Stores file safely in `.runnable/` catalog
- **`put=["file.parquet"]`** - Stores file safely in `.catalog/`
- **`get=["file.parquet"]`** - Makes file available in your working directory
- Files appear exactly where your code expects them

@@ -131,7 +138,7 @@ All intermediate files are preserved:

```bash
# Check what preprocessing produced
ls .runnable/catalog/
ls .catalog/
# X_train.csv X_test.csv y_train.csv y_test.csv
```
