Merge pull request #183 from togethercomputer/nikita/vlm_support

nikita-smetanin · web-flow · commit d0adb578614a · 2025-12-23T17:07:09.000Z
Support VLM finetuning
diff --git a/openapi.yaml b/openapi.yaml
@@ -1684,6 +1684,8 @@ paths:
                   oneOf:
                     - $ref: '#/components/schemas/FullTrainingType'
                     - $ref: '#/components/schemas/LoRATrainingType'
+                multimodal_params:
+                  $ref: '#/components/schemas/MultimodalParams'
                 from_checkpoint:
                   type: string
                   description: The checkpoint identifier to continue training from a previous fine-tuning job. Format is `{$JOB_ID}` or `{$OUTPUT_MODEL_NAME}` or `{$JOB_ID}:{$STEP}` or `{$OUTPUT_MODEL_NAME}:{$STEP}`. The step value is optional; without it, the final checkpoint will be used.
@@ -1823,6 +1825,8 @@ paths:
                   oneOf:
                     - $ref: '#/components/schemas/FullTrainingType'
                     - $ref: '#/components/schemas/LoRATrainingType'
+                multimodal_params:
+                  $ref: '#/components/schemas/MultimodalParams'
                 from_checkpoint:
                   type: string
                   description: The checkpoint identifier to continue training from a previous fine-tuning job. Format is `{$JOB_ID}` or `{$OUTPUT_MODEL_NAME}` or `{$JOB_ID}:{$STEP}` or `{$OUTPUT_MODEL_NAME}:{$STEP}`. The step value is optional; without it, the final checkpoint will be used.
@@ -7436,6 +7440,8 @@ components:
           oneOf:
             - $ref: '#/components/schemas/FullTrainingType'
             - $ref: '#/components/schemas/LoRATrainingType'
+        multimodal_params:
+          $ref: '#/components/schemas/MultimodalParams'
         status:
           $ref: '#/components/schemas/FinetuneJobStatus'
         job_id:
@@ -7832,6 +7838,13 @@ components:
       required:
         - method
 
+    MultimodalParams:  # Training options for multimodal models; referenced by the `multimodal_params` properties in this spec.
+      type: object
+      properties:  # No `required` list is declared, so `train_vision` may be omitted.
+        train_vision:
+          type: boolean
+          description: Whether to train the vision encoder of the model. Only available for multimodal models.
+
     LRScheduler:
       type: object
       properties: