Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ We try to use plural to adhere to REST best practices.

### **Scalability for AI Projects**

- AI models often require **domain-specific logic**. Keeping `schemas.py`, `context.py`, and `models/` in the same module makes it easier to extend functionality.
- AI models often require **domain-specific logic**. Keeping `schemas.py`, `context.py`, and `tasks/` in the same module makes it easier to extend functionality.
- If a new AI domain (`audio`, `3D`, etc.) is introduced, the structure remains consistent — just duplicate the existing pattern.

```
Expand Down Expand Up @@ -122,7 +122,8 @@ This design choice ensures:
Developers who want to extend or modify available models can do so by editing the typed definitions directly in code:

- `api/images/schemas.py`
- `workers/images/tasks.py` or `workers/images/models/`
- `workers/images/tasks.py`
- `workers/images/local/...`

Each new model entry should include:

Expand Down Expand Up @@ -161,10 +162,11 @@ ImageContext / VideoContext
Pipeline Function (pure function)
├─ Calls one of:
├─ Calls:
- text_to_image_call(context)
- image_to_image_call(context)
- inpainting_call(context)
- ...
├─ Internally selects the exact model(s) / transformer variants:
- Flux: Krea / Kontext / Fill
- WAN, VEO variants based on context
Expand Down
25 changes: 20 additions & 5 deletions api/images/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,11 @@
ModelName: TypeAlias = Literal[
"sd-xl",
"flux-1",
"flux-2",
"qwen-image",
"depth-anything-2",
"segment-anything-2",
"sam-2",
"sam-3",
"real-esrgan-x4",
"gpt-image-1",
"runway-gen4-image",
Expand Down Expand Up @@ -56,6 +58,13 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
references=True,
description="FLUX dev model (Krea tuned). Uses Kontext for img2img, Fill for inpainting.",
),
"flux-2": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"text-to-image", "image-to-image", "inpainting"},
references=True,
description="FLUX 2.0 dev model with edit capabilities.",
),
"qwen-image": ImagesModelInfo(
provider="local",
external=False,
Expand All @@ -69,11 +78,17 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
supported_modes={"image-to-image"},
description="Depth estimation pipeline.",
),
"segment-anything-2": ImagesModelInfo(
"sam-2": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"image-to-image"},
description="Meta's SAM 2 Segmentation pipeline.",
),
"sam-3": ImagesModelInfo(
provider="local",
external=False,
supported_modes={"image-to-image"},
description="Segmentation pipeline.",
description="Meta's SAM 3 Segmentation pipeline.",
),
"gpt-image-1": ImagesModelInfo(
provider="openai",
Expand Down Expand Up @@ -221,8 +236,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"images.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
78 changes: 26 additions & 52 deletions api/texts/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,15 @@ def queue(self) -> str:


class SystemPrompt(str, Enum):
NONE = "NONE"
BASE = "BASE"
IMAGE_OPTIMIZER = "IMAGE_OPTIMIZER"
VIDEO_OPTIMIZER = "VIDEO_OPTIMIZER"
VIDEO_TRANSITION = "VIDEO_TRANSITION"


SYSTEM_PROMPT_TEXT = {
SystemPrompt.NONE: "",
SystemPrompt.BASE: (
"You are a helpful AI assistant specialized in visual effects, filmmaking, and creative workflows. "
"You excel at analyzing images and videos, describing visual content in detail, and providing expert feedback. "
Expand All @@ -48,77 +50,48 @@ class SystemPrompt(str, Enum):
"Do not ask for clarification—provide the best possible response based on the given input."
),
SystemPrompt.IMAGE_OPTIMIZER: (
"You are an expert AI image prompt optimizer. Create detailed, vivid descriptions optimized for models like Flux, Stable Diffusion, or Midjourney. "
"You are an expert AI image prompt optimizer. Write concise, vivid descriptions for models like Flux, Stable Diffusion, or Midjourney."
"\n"
"Write flowing sentences in this sequence: "
"Primary subject and focal point, secondary elements and spatial relationships, "
"setting and background, artistic style and medium, lighting and color palette, "
"composition and camera perspective. "
"Describe in this order: primary subject and focal point, secondary elements and spatial relationships, setting and background, artistic style and medium, lighting and color palette, composition and camera perspective."
"\n"
"Default to photorealism unless the user explicitly requests a different style (painting, illustration, cartoon, etc.). "
"When photorealistic, emphasize: realistic lighting, accurate materials and textures, natural physics, "
"believable proportions, and lifelike details. "
"Default to photorealism unless a different style is requested. For photorealism, focus on realistic lighting, accurate materials, natural physics, believable proportions, and lifelike details."
"\n"
"If a reference image is provided: "
"- For style transfer or img2img: describe desired changes while noting what to preserve\n"
"- For inspiration: use it to inform mood, composition, or style without literal replication\n"
"If a reference image is provided:\n"
"- For style transfer or img2img: describe desired changes and what to preserve\n"
"- For inspiration: use it to inform mood, composition, or style, not literal replication\n"
"\n"
"Include naturally: technical terms (bokeh, depth of field, golden hour, subsurface scattering), "
"style markers (photograph, DSLR photo, cinematic, or oil painting, digital art when non-photorealistic), "
"and quality tags (highly detailed, sharp focus, 8K, masterpiece, photorealistic). "
"Use technical terms, style markers, and quality tags only when relevant; do not force them into every prompt."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
SystemPrompt.VIDEO_OPTIMIZER: (
"You are an expert AI video prompt optimizer. Create cinematic descriptions optimized for models like Runway, Pika, or Kling. "
"You are an expert AI video prompt optimizer. Write concise, cinematic descriptions for models like Runway, Pika, or Kling."
"\n"
"Write flowing sentences in this sequence: "
"Core action and scene progression, camera movement and framing, "
"environment and spatial relationships, lighting and atmosphere, "
"subject details and interactions, temporal elements and pacing. "
"Describe in this order: core action and scene progression, camera movement and framing, environment, lighting, subject details, and pacing."
"\n"
"Default to photorealism and realistic physics unless the user explicitly requests a different style (animation, stylized, surreal, etc.). "
"When realistic, emphasize: natural motion dynamics, believable physics (gravity, inertia, momentum), "
"realistic lighting changes, authentic material behavior, and lifelike interactions. "
"Default to photorealism and realistic physics unless a different style is requested. When realistic, focus on natural motion, believable physics, realistic lighting, and lifelike interactions."
"\n"
"If reference images are provided: "
"- Single image: treat as a starting frame and describe how motion/life emerges from it\n"
"- Multiple images: use only the first as inspiration for the opening frame; focus on motion and progression\n"
"- No images: rely purely on the text prompt\n"
"If reference images are provided:\n"
"- Single image: treat as the starting frame and describe how motion emerges\n"
"- Multiple images: use only the first for the opening frame; focus on motion and progression\n"
"- No images: rely on the text prompt\n"
"\n"
"Include naturally: camera terms (dolly zoom, tracking shot, handheld), "
"timing words (gradual, sudden, continuous, smooth), "
"physics terms (momentum, weight, natural movement), "
"lighting terms (golden hour, volumetric, high contrast), "
"and quality markers (4K, cinematic, professional, photorealistic). "
"Use technical terms (camera, timing, physics, lighting, quality) only when relevant; do not force them into every prompt."
"\n"
"Emphasize motion, continuity, and temporal flow. Be specific about how things move and change realistically. "
"Emphasize motion and temporal flow. Be specific about realistic changes."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
SystemPrompt.VIDEO_TRANSITION: (
"You are an expert AI video prompt optimizer for keyframe-to-keyframe video generation. "
"Two reference images define start and end states—describe the coherent journey between them. "
"Two reference images define start and end states—describe the journey between them."
"\n"
"Write flowing sentences that: "
"Establish the starting state, describe the transformation and transition, "
"detail camera movement through the sequence, "
"specify how environment and lighting evolve, "
"and describe the arrival at the final state. "
"Establish the starting state, describe the transformation and transition, camera movement, environment and lighting changes, and the arrival at the final state."
"\n"
"Default to photorealism and realistic physics unless the context suggests otherwise. "
"Emphasize natural, believable transitions with realistic motion dynamics and physics. "
"Default to photorealism and realistic physics unless context suggests otherwise. Emphasize natural, believable transitions and realistic motion."
"\n"
"Focus on smooth, coherent transitions: "
"- What changes gradually vs. suddenly?\n"
"- How does the camera guide the viewer through the transition?\n"
"- What are the key visual milestones?\n"
"- What stays consistent as an anchor?\n"
"- How do physics and momentum carry through the transition?\n"
"\n"
"Include temporal markers (beginning, midway, approaching the end) and "
"cinematic terms (match cut, morph, cross-dissolve, continuous motion) naturally. "
"Add quality markers (seamless transition, smooth motion, 4K, cinematic, photorealistic). "
"Use temporal markers and cinematic terms (e.g., match cut, morph, cross-dissolve, continuous motion) only if relevant; do not force them into every prompt. "
"Quality markers (e.g., seamless transition, smooth motion, 4K, cinematic, photorealistic) are examples—use only when appropriate."
"\n"
"Output only the optimized prompt. No preamble, labels, or explanations."
),
Expand Down Expand Up @@ -170,6 +143,7 @@ class TextRequest(BaseModel):
system_prompt: SystemPrompt = Field(
description=(
"System prompt type. Options:\n"
"NONE: Will use the model's default behavior.\n\n"
"BASE: " + SYSTEM_PROMPT_TEXT[SystemPrompt.BASE] + "\n\n"
"IMAGE_OPTIMIZER: " + SYSTEM_PROMPT_TEXT[SystemPrompt.IMAGE_OPTIMIZER] + "\n\n"
"VIDEO_OPTIMIZER: " + SYSTEM_PROMPT_TEXT[SystemPrompt.VIDEO_OPTIMIZER] + "\n\n"
Expand All @@ -189,8 +163,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"texts.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
12 changes: 5 additions & 7 deletions api/videos/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:

# Unified metadata (local + external)
MODEL_META: Dict[ModelName, VideosModelInfo] = {
# Local
"ltx-video": VideosModelInfo(
provider="local",
external=False,
Expand All @@ -52,9 +51,8 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
supported_modes={"text-to-video", "image-to-video", "first-last-image", "video-to-video"},
description="Wan 2.2, quality open-source video generation model. Will fall back to Wan VACE 2.1 for video-to-video.",
),
# External
"runway-gen-4": VideosModelInfo(
provider="runway",
provider="replicate",
external=True,
supported_modes={"image-to-video", "video-to-video"},
description="Runway Gen-4 family. Uses standard Gen-4 for image-to-video and Aleph variant for video-to-video.",
Expand All @@ -66,10 +64,10 @@ def supports_inferred_mode(self, mode: InferredMode) -> bool:
description="Matches animation from a reference video to a character reference image.",
),
"runway-upscale": VideosModelInfo(
provider="runway",
provider="replicate",
external=True,
supported_modes={"video-to-video"},
description="Video upscaling model.",
description="Runway's video upscaling model.",
),
"bytedance-seedance-1": VideosModelInfo(
provider="replicate",
Expand Down Expand Up @@ -180,8 +178,8 @@ def external_model(self) -> bool:
return self.meta.external

@property
def task_name(self) -> ModelName:
return self.model
def task_name(self) -> str:
return f"videos.{self.model}"

@property
def task_queue(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion clients/nuke/gizmos/dd_image.gizmo
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Group {
addUserKnob {1 node_type +HIDDEN}
node_type dd_image
addUserKnob {26 ""}
addUserKnob {4 model l "Model" M {sd-xl flux-1 qwen-image depth-anything-2 segment-anything-2 real-esrgan-x4 gpt-image-1 runway-gen4-image flux-1-pro topazlabs-upscale google-gemini-2 google-gemini-3 bytedance-seedream-4 } t "Select the image generation model."}
addUserKnob {4 model l "Model" M {sd-xl flux-1 qwen-image depth-anything-2 sam-2 sam-3 real-esrgan-x4 gpt-image-1 runway-gen4-image flux-1-pro topazlabs-upscale google-gemini-2 google-gemini-3 bytedance-seedream-4 } t "Select the image generation model."}
addUserKnob {22 generate l "Generate" -STARTLINE T "import dd_image\nnode = nuke.thisNode()\ndd_image.process_image(node)" t "Start image generation with current settings."}
addUserKnob {1 task_id l "Task ID" -STARTLINE t "ID of the current image generation task."}
addUserKnob {22 get_task l "Get Task" -STARTLINE T "import dd_image\nnode = nuke.thisNode()\ndd_image.get_image(node)" t "Fetch the latest image for this task."}
Expand Down
2 changes: 1 addition & 1 deletion clients/openapi.json

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions workers/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@ def _read_bool(env_name: str, default: bool = False) -> bool:
# When true, pipelines/models should prefer CPU offload where supported to reduce GPU memory
IMAGE_CPU_OFFLOAD = _read_bool("IMAGE_CPU_OFFLOAD", default=False)
VIDEO_CPU_OFFLOAD = _read_bool("VIDEO_CPU_OFFLOAD", default=False)

ONE_MB_IN_BYTES = 1 * 1024 * 1024
34 changes: 24 additions & 10 deletions workers/common/pipeline_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import os
import time
from collections import OrderedDict
from functools import lru_cache, wraps
from typing import Literal, Union
from functools import wraps
from typing import Literal, Optional, Union
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'Optional' is not used.

Suggested change
from typing import Literal, Optional, Union
from typing import Literal, Union

Copilot uses AI. Check for mistakes.

import torch
from accelerate.hooks import CpuOffload
Expand All @@ -25,6 +25,7 @@

@time_info_decorator
def patched_pre_forward(self, module, *args, **kwargs):
"""Patched pre_forward to log timing for offloading."""
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo in docstring: "timiing" should be "timing".

Suggested change
"""Patched pre_forward to log timiing for offloading."""
"""Patched pre_forward to log timing for offloading."""

Copilot uses AI. Check for mistakes.
return _original_pre_forward(self, module, *args, **kwargs)


Expand Down Expand Up @@ -126,8 +127,7 @@ def wrapper(*args, **kwargs):
return wrapper


@time_info_decorator
def optimize_pipeline(pipe, disable_safety_checker=True, offload=True, vae_tiling=True):
def optimize_pipeline(pipe, offload=True, vae_tiling=True):
# Override the safety checker
def dummy_safety_checker(images, **kwargs):
return images, [False] * len(images)
Expand All @@ -140,11 +140,15 @@ def dummy_safety_checker(images, **kwargs):
if vae_tiling:
try:
pipe.vae.enable_tiling() # Enable VAE tiling to improve memory efficiency
pipe.vae.enable_slicing()
except:
pass # VAE tiling is not available for all models

if disable_safety_checker:
try:
pipe.vae.enable_slicing() # Enable VAE slicing to reduce memory usage
except:
Copy link

Copilot AI Nov 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Except block directly handles BaseException.

Suggested change
except:
except Exception:

Copilot uses AI. Check for mistakes.
pass # VAE slicing is not available for all models

if hasattr(pipe, "disable_safety_checker"):
pipe.safety_checker = dummy_safety_checker

return pipe
Expand Down Expand Up @@ -180,7 +184,12 @@ def get_quant_dir(model_id: str, subfolder: str, load_in_4bit: bool) -> str:

@time_info_decorator
def get_quantized_model(
model_id, subfolder, model_class, target_precision: Literal[4, 8, 16] = 8, torch_dtype=torch.float16
model_id,
subfolder,
model_class,
target_precision: Literal[4, 8, 16] = 8,
torch_dtype=torch.float16,
offload: bool = False,
):
"""
Load a quantized model component if available locally; otherwise, load original,
Expand All @@ -197,9 +206,14 @@ def get_quantized_model(
model instance
"""

# if we will be offloading, load to CPU
args = {}
if offload:
args["device_map"] = "cpu"

if target_precision == 16:
logger.warning(f"Quantization disabled for {model_id} subfolder {subfolder}")
return model_class.from_pretrained(model_id, subfolder=subfolder, torch_dtype=torch_dtype)
logger.debug(f"Quantization disabled for {model_id} subfolder {subfolder}")
return model_class.from_pretrained(model_id, subfolder=subfolder, torch_dtype=torch_dtype, **args)

load_in_4bit = target_precision == 4
quant_dir = get_quant_dir(model_id, subfolder, load_in_4bit=load_in_4bit)
Expand All @@ -224,7 +238,7 @@ def get_quantized_model(
try:
logger.info(f"Loading quantized model from {quant_dir}")
model = model_class.from_pretrained(
quant_dir, torch_dtype=torch_dtype, local_files_only=True, use_safetensors=use_safetensors
quant_dir, torch_dtype=torch_dtype, local_files_only=True, use_safetensors=use_safetensors, **args
)
except Exception as e:
logger.error(f"Failed to load quantized model from {quant_dir}: {e}")
Expand Down
4 changes: 2 additions & 2 deletions workers/common/replicate_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from replicate.helpers import FileOutput


def replicate_run(model_path: str, payload: dict) -> Any:
def replicate_run(model_path: str, payload: dict[str, Any]) -> Any:
try:
output = replicate.run(model_path, input=payload)
except Exception as e:
raise RuntimeError(f"Error calling Replicate API: {e}")
raise RuntimeError(f"Error calling Replicate API {model_path}: {e}")

return output

Expand Down
Loading