Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ We try to use plural to adhere to REST best practices.

### **Scalability for AI Projects**

- AI models often require **domain-specific logic**. Keeping `schemas.py`, `context.py`, and `models/` in the same module makes it easier to extend functionality.
- AI models often require **domain-specific logic**. Keeping `schemas.py`, `context.py`, and `tasks/` in the same module makes it easier to extend functionality.
- If a new AI domain (`audio`, `3D`, etc.) is introduced, the structure remains consistent — just duplicate the existing pattern.

```
Expand Down Expand Up @@ -122,7 +122,8 @@ This design choice ensures:
Developers who want to extend or modify available models can do so by editing the typed definitions directly in code:

- `api/images/schemas.py`
- `workers/images/tasks.py` or `workers/images/models/`
- `workers/images/tasks.py`
- `workers/images/local/...`

Each new model entry should include:

Expand Down Expand Up @@ -161,10 +162,11 @@ ImageContext / VideoContext
Pipeline Function (pure function)
├─ Calls one of:
├─ Calls:
- text_to_image_call(context)
- image_to_image_call(context)
- inpainting_call(context)
- ...
├─ Internally selects the exact model(s) / transformer variants:
- Flux: Krea / Kontext / Fill
- WAN, VEO variants based on context
Expand Down
25 changes: 20 additions & 5 deletions api/images/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
ModelName: TypeAlias = Literal[
"sd-xl",
"flux-1",
"flux-2",
"qwen-image",
"depth-anything-2",
"segment-anything-2",
"sam-2",
"sam-3",
"real-esrgan-x4",
"gpt-image-1",
"runway-gen4-image",
Expand Down Expand Up @@ -56,6 +58,13 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
references=True,
description="FLUX dev model (Krea tuned). Uses Kontext for img2img, Fill for inpainting.",
),
"flux-2": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"text-to-image", "image-to-image", "inpainting"},
references=True,
description="FLUX 2.0 dev model with edit capabilities.",
),
"qwen-image": ImagesModelInfo(
provider="local",
external=False,
Expand All @@ -69,11 +78,17 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
supported_modes={"image-to-image"},
description="Depth estimation pipeline.",
),
"segment-anything-2": ImagesModelInfo(
"sam-2": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"image-to-image"},
description="Meta's SAM 2 Segmentation pipeline.",
),
"sam-3": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"image-to-image"},
description="Segmentation pipeline.",
description="Meta's SAM 3 Segmentation pipeline.",
),
"gpt-image-1": ImagesModelInfo(
provider="openai",
Expand Down Expand Up @@ -221,8 +236,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"images.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
78 changes: 26 additions & 52 deletions api/texts/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ def queue(self) -> str:


class SystemPrompt(str, Enum):
NONE = "NONE"
BASE = "BASE"
IMAGE_OPTIMIZER = "IMAGE_OPTIMIZER"
VIDEO_OPTIMIZER = "VIDEO_OPTIMIZER"
VIDEO_TRANSITION = "VIDEO_TRANSITION"


SYSTEM_PROMPT_TEXT = {
SystemPrompt.NONE: "",
SystemPrompt.BASE: (
"You are a helpful AI assistant specialized in visual effects, filmmaking, and creative workflows. "
"You excel at analyzing images and videos, describing visual content in detail, and providing expert feedback. "
Expand All @@ -48,77 +50,48 @@ class SystemPrompt(str, Enum):
"Do not ask for clarification—provide the best possible response based on the given input."
),
SystemPrompt.IMAGE_OPTIMIZER: (
"You are an expert AI image prompt optimizer. Create detailed, vivid descriptions optimized for models like Flux, Stable Diffusion, or Midjourney. "
"You are an expert AI image prompt optimizer. Write concise, vivid descriptions for models like Flux, Stable Diffusion, or Midjourney."
"\n"
"Write flowing sentences in this sequence: "
"Primary subject and focal point, secondary elements and spatial relationships, "
"setting and background, artistic style and medium, lighting and color palette, "
"composition and camera perspective. "
"Describe in this order: primary subject and focal point, secondary elements and spatial relationships, setting and background, artistic style and medium, lighting and color palette, composition and camera perspective."
"\n"
"Default to photorealism unless the user explicitly requests a different style (painting, illustration, cartoon, etc.). "
"When photorealistic, emphasize: realistic lighting, accurate materials and textures, natural physics, "
"believable proportions, and lifelike details. "
"Default to photorealism unless a different style is requested. For photorealism, focus on realistic lighting, accurate materials, natural physics, believable proportions, and lifelike details."
"\n"
"If a reference image is provided: "
"- For style transfer or img2img: describe desired changes while noting what to preserve\n"
"- For inspiration: use it to inform mood, composition, or style without literal replication\n"
"If a reference image is provided:\n"
"- For style transfer or img2img: describe desired changes and what to preserve\n"
"- For inspiration: use it to inform mood, composition, or style, not literal replication\n"
"\n"
"Include naturally: technical terms (bokeh, depth of field, golden hour, subsurface scattering), "
"style markers (photograph, DSLR photo, cinematic, or oil painting, digital art when non-photorealistic), "
"and quality tags (highly detailed, sharp focus, 8K, masterpiece, photorealistic). "
"Use technical terms, style markers, and quality tags only when relevant; do not force them into every prompt."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
SystemPrompt.VIDEO_OPTIMIZER: (
"You are an expert AI video prompt optimizer. Create cinematic descriptions optimized for models like Runway, Pika, or Kling. "
"You are an expert AI video prompt optimizer. Write concise, cinematic descriptions for models like Runway, Pika, or Kling."
"\n"
"Write flowing sentences in this sequence: "
"Core action and scene progression, camera movement and framing, "
"environment and spatial relationships, lighting and atmosphere, "
"subject details and interactions, temporal elements and pacing. "
"Describe in this order: core action and scene progression, camera movement and framing, environment, lighting, subject details, and pacing."
"\n"
"Default to photorealism and realistic physics unless the user explicitly requests a different style (animation, stylized, surreal, etc.). "
"When realistic, emphasize: natural motion dynamics, believable physics (gravity, inertia, momentum), "
"realistic lighting changes, authentic material behavior, and lifelike interactions. "
"Default to photorealism and realistic physics unless a different style is requested. When realistic, focus on natural motion, believable physics, realistic lighting, and lifelike interactions."
"\n"
"If reference images are provided: "
"- Single image: treat as a starting frame and describe how motion/life emerges from it\n"
"- Multiple images: use only the first as inspiration for the opening frame; focus on motion and progression\n"
"- No images: rely purely on the text prompt\n"
"If reference images are provided:\n"
"- Single image: treat as the starting frame and describe how motion emerges\n"
"- Multiple images: use only the first for the opening frame; focus on motion and progression\n"
"- No images: rely on the text prompt\n"
"\n"
"Include naturally: camera terms (dolly zoom, tracking shot, handheld), "
"timing words (gradual, sudden, continuous, smooth), "
"physics terms (momentum, weight, natural movement), "
"lighting terms (golden hour, volumetric, high contrast), "
"and quality markers (4K, cinematic, professional, photorealistic). "
"Use technical terms (camera, timing, physics, lighting, quality) only when relevant; do not force them into every prompt."
"\n"
"Emphasize motion, continuity, and temporal flow. Be specific about how things move and change realistically. "
"Emphasize motion and temporal flow. Be specific about realistic changes."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
SystemPrompt.VIDEO_TRANSITION: (
"You are an expert AI video prompt optimizer for keyframe-to-keyframe video generation. "
"Two reference images define start and end states—describe the coherent journey between them. "
"Two reference images define start and end states—describe the journey between them."
"\n"
"Write flowing sentences that: "
"Establish the starting state, describe the transformation and transition, "
"detail camera movement through the sequence, "
"specify how environment and lighting evolve, "
"and describe the arrival at the final state. "
"Establish the starting state, describe the transformation and transition, camera movement, environment and lighting changes, and the arrival at the final state."
"\n"
"Default to photorealism and realistic physics unless the context suggests otherwise. "
"Emphasize natural, believable transitions with realistic motion dynamics and physics. "
"Default to photorealism and realistic physics unless context suggests otherwise. Emphasize natural, believable transitions and realistic motion."
"\n"
"Focus on smooth, coherent transitions: "
"- What changes gradually vs. suddenly?\n"
"- How does the camera guide the viewer through the transition?\n"
"- What are the key visual milestones?\n"
"- What stays consistent as an anchor?\n"
"- How do physics and momentum carry through the transition?\n"
"\n"
"Include temporal markers (beginning, midway, approaching the end) and "
"cinematic terms (match cut, morph, cross-dissolve, continuous motion) naturally. "
"Add quality markers (seamless transition, smooth motion, 4K, cinematic, photorealistic). "
"Use temporal markers and cinematic terms (e.g., match cut, morph, cross-dissolve, continuous motion) only if relevant; do not force them into every prompt. "
"Quality markers (e.g., seamless transition, smooth motion, 4K, cinematic, photorealistic) are examples—use only when appropriate."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
Expand Down Expand Up @@ -170,6 +143,7 @@ class TextRequest(BaseModel):
system_prompt: SystemPrompt = Field(
description=(
"System prompt type. Options:\n"
"NONE: Will use the model's default behavior.\n\n"
"BASE: " + SYSTEM_PROMPT_TEXT[SystemPrompt.BASE] + "\n\n"
"IMAGE_OPTIMIZER: " + SYSTEM_PROMPT_TEXT[SystemPrompt.IMAGE_OPTIMIZER] + "\n\n"
"VIDEO_OPTIMIZER: " + SYSTEM_PROMPT_TEXT[SystemPrompt.VIDEO_OPTIMIZER] + "\n\n"
Expand All @@ -189,8 +163,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"texts.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
12 changes: 5 additions & 7 deletions api/videos/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:

# Unified metadata (local + external)
MODEL_META: Dict[ModelName, VideosModelInfo] = {
# Local
"ltx-video": VideosModelInfo(
provider="local",
external=False,
Expand All @@ -52,9 +51,8 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
supported_modes={"text-to-video", "image-to-video", "first-last-image", "video-to-video"},
description="Wan 2.2, quality open-source video generation model. Will fall back to Wan VACE 2.1 for video-to-video.",
),
# External
"runway-gen-4": VideosModelInfo(
provider="runway",
provider="replicate",
external=True,
supported_modes={"image-to-video", "video-to-video"},
description="Runway Gen-4 family. Uses standard Gen-4 for image-to-video and Aleph variant for video-to-video.",
Expand All @@ -66,10 +64,10 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
description="Matches animation from a reference video to a character reference image.",
),
"runway-upscale": VideosModelInfo(
provider="runway",
provider="replicate",
external=True,
supported_modes={"video-to-video"},
description="Video upscaling model.",
description="Runway's video upscaling model.",
),
"bytedance-seedance-1": VideosModelInfo(
provider="replicate",
Expand Down Expand Up @@ -180,8 +178,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"videos.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion clients/nuke/gizmos/dd_image.gizmo
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Group {
addUserKnob {1 node_type +HIDDEN}
node_type dd_image
addUserKnob {26 ""}
addUserKnob {4 model l "Model" M {sd-xl flux-1 qwen-image depth-anything-2 segment-anything-2 real-esrgan-x4 gpt-image-1 runway-gen4-image flux-1-pro topazlabs-upscale google-gemini-2 google-gemini-3 bytedance-seedream-4 } t "Select the image generation model."}
addUserKnob {4 model l "Model" M {sd-xl flux-1 qwen-image depth-anything-2 sam-2 sam-3 real-esrgan-x4 gpt-image-1 runway-gen4-image flux-1-pro topazlabs-upscale google-gemini-2 google-gemini-3 bytedance-seedream-4 } t "Select the image generation model."}
addUserKnob {22 generate l "Generate" -STARTLINE T "import dd_image\nnode = nuke.thisNode()\ndd_image.process_image(node)" t "Start image generation with current settings."}
addUserKnob {1 task_id l "Task ID" -STARTLINE t "ID of the current image generation task."}
addUserKnob {22 get_task l "Get Task" -STARTLINE T "import dd_image\nnode = nuke.thisNode()\ndd_image.get_image(node)" t "Fetch the latest image for this task."}
Expand Down
2 changes: 1 addition & 1 deletion clients/openapi.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions workers/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ def _read_bool(env_name: str, default: bool = False) -> bool:
# When true, pipelines/models should prefer CPU offload where supported to reduce GPU memory
IMAGE_CPU_OFFLOAD = _read_bool("IMAGE_CPU_OFFLOAD", default=False)
VIDEO_CPU_OFFLOAD = _read_bool("VIDEO_CPU_OFFLOAD", default=False)

ONE_MB_IN_BYTES = 1 * 1024 * 1024
34 changes: 24 additions & 10 deletions workers/common/pipeline_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import time
from collections import OrderedDict
from functools import lru_cache, wraps
from typing import Literal, Union
from functools import wraps
from typing import Literal, Optional, Union
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'Optional' is not used.

Suggested change
from typing import Literal, Optional, Union
from typing import Literal, Union

Copilot uses AI. Check for mistakes.

import torch
from accelerate.hooks import CpuOffload
Expand All @@ -25,6 +25,7 @@

@time_info_decorator
def patched_pre_forward(self, module, *args, **kwargs):
"""Patched pre_forward to log timing for offloading."""
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in docstring: "timiing" should be "timing".

Suggested change
"""Patched pre_forward to log timiing for offloading."""
"""Patched pre_forward to log timing for offloading."""

Copilot uses AI. Check for mistakes.
return _original_pre_forward(self, module, *args, **kwargs)


Expand Down Expand Up @@ -126,8 +127,7 @@ def wrapper(*args, **kwargs):
return wrapper


@time_info_decorator
def optimize_pipeline(pipe, disable_safety_checker=True, offload=True, vae_tiling=True):
def optimize_pipeline(pipe, offload=True, vae_tiling=True):
# Override the safety checker
def dummy_safety_checker(images, **kwargs):
return images, [False] * len(images)
Expand All @@ -140,11 +140,15 @@ def dummy_safety_checker(images, **kwargs):
if vae_tiling:
try:
pipe.vae.enable_tiling() # Enable VAE tiling to improve memory efficiency
pipe.vae.enable_slicing()
except:
pass # VAE tiling is not available for all models

if disable_safety_checker:
try:
pipe.vae.enable_slicing() # Enable VAE slicing to reduce memory usage
except:
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Except block directly handles BaseException.

Suggested change
except:
except Exception:

Copilot uses AI. Check for mistakes.
pass # VAE slicing is not available for all models

if hasattr(pipe, "disable_safety_checker"):
pipe.safety_checker = dummy_safety_checker

return pipe
Expand Down Expand Up @@ -180,7 +184,12 @@ def get_quant_dir(model_id: str, subfolder: str, load_in_4bit: bool) -> str:

@time_info_decorator
def get_quantized_model(
model_id, subfolder, model_class, target_precision: Literal[4, 8, 16] = 8, torch_dtype=torch.float16
model_id,
subfolder,
model_class,
target_precision: Literal[4, 8, 16] = 8,
torch_dtype=torch.float16,
offload: bool = False,
):
"""
Load a quantized model component if available locally; otherwise, load original,
Expand All @@ -197,9 +206,14 @@ def get_quantized_model(
model instance
"""

# if we will be offloading, load to CPU
args = {}
if offload:
args["device_map"] = "cpu"

if target_precision == 16:
logger.warning(f"Quantization disabled for {model_id} subfolder {subfolder}")
return model_class.from_pretrained(model_id, subfolder=subfolder, torch_dtype=torch_dtype)
logger.debug(f"Quantization disabled for {model_id} subfolder {subfolder}")
return model_class.from_pretrained(model_id, subfolder=subfolder, torch_dtype=torch_dtype, **args)

load_in_4bit = target_precision == 4
quant_dir = get_quant_dir(model_id, subfolder, load_in_4bit=load_in_4bit)
Expand All @@ -224,7 +238,7 @@ def get_quantized_model(
try:
logger.info(f"Loading quantized model from {quant_dir}")
model = model_class.from_pretrained(
quant_dir, torch_dtype=torch_dtype, local_files_only=True, use_safetensors=use_safetensors
quant_dir, torch_dtype=torch_dtype, local_files_only=True, use_safetensors=use_safetensors, **args
)
except Exception as e:
logger.error(f"Failed to load quantized model from {quant_dir}: {e}")
Expand Down
4 changes: 2 additions & 2 deletions workers/common/replicate_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from replicate.helpers import FileOutput


def replicate_run(model_path: str, payload: dict) -> Any:
def replicate_run(model_path: str, payload: dict[str, Any]) -> Any:
try:
output = replicate.run(model_path, input=payload)
except Exception as e:
raise RuntimeError(f"Error calling Replicate API: {e}")
raise RuntimeError(f"Error calling Replicate API {model_path}: {e}")

return output

Expand Down
Loading