2 changes: 1 addition & 1 deletion docs/cli_options.md
Review comment from the PR author (Contributor):

There is a pending discussion arguing in favor of always using server token counts.

Creating this thread as a hold for the PR.

@@ -164,7 +164,7 @@ Use the legacy 'max_tokens' field instead of 'max_completion_tokens' in request

#### `--use-server-token-count`

Use server-reported token counts from API usage fields instead of client-side tokenization. When enabled, tokenizers are still loaded (needed for dataset generation) but tokenizer.encode() is not called for computing metrics. Token count fields will be None if the server does not provide usage information. For OpenAI-compatible streaming endpoints (chat/completions), stream_options.include_usage is automatically configured when this flag is enabled.
Use server-reported token counts from API usage fields instead of client-side tokenization. When enabled, tokenizers are still loaded (needed for dataset generation) but tokenizer.encode() is not called for computing metrics. Token count fields will be None if the server does not provide usage information. For OpenAI-compatible streaming endpoints (chat/completions), stream_options.include_usage is automatically configured when this flag is enabled. NOTE: When non-text input modalities (e.g., images) are detected, input token counting is automatically supplemented with client-side text tokenization to derive image token counts. The server-reported usage.prompt_tokens provides the total input token count (ISL), the client tokenizer computes the text-only count, and the image token count is derived by subtraction. Output and reasoning token counts continue to use server-reported values.
<br>_Flag (no value required)_
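
For reference, a rough sketch of an OpenAI-compatible streaming request with usage reporting enabled, as this flag configures automatically. The model name and message are placeholders, and the dict mirrors the public chat/completions API rather than AIPerf internals:

```python
# Illustrative request body only (placeholder model and message).
payload = {
    "model": "example-model",
    "messages": [{"role": "user", "content": "Describe this image."}],
    "stream": True,
    # With include_usage enabled, the final stream chunk carries a usage object
    # (prompt_tokens, completion_tokens, total_tokens) that the client can read.
    "stream_options": {"include_usage": True},
}
```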

#### `--connection-reuse-strategy` `<str>`
118 changes: 116 additions & 2 deletions docs/metrics_reference.md
@@ -34,6 +34,12 @@ This document provides a comprehensive reference of all metrics available in AIP
- [Image Metrics](#image-metrics)
- [Image Throughput](#image-throughput)
- [Image Latency](#image-latency)
- [Image Input Token Count](#image-input-token-count)
- [Text Input Token Count](#text-input-token-count)
- [Tokens Per Image](#tokens-per-image)
- [Image Token Ratio](#image-token-ratio)
- [Total Image Input Tokens](#total-image-input-tokens)
- [Image Input Token Throughput](#image-input-token-throughput)
- [Reasoning Metrics](#reasoning-metrics)
- [Reasoning Token Count](#reasoning-token-count)
- [Total Reasoning Tokens](#total-reasoning-tokens)
@@ -341,6 +347,20 @@ output_sequence_length = (output_token_count or 0) + (reasoning_token_count or 0

**Type:** [Record Metric](#record-metrics)

The number of input/prompt tokens for a single request. This represents the size of the input sent to the model.
The total number of input/prompt tokens for a single request, across all modalities. This represents the size of the input sent to the model.

**Formula:**
```python
# Text-only requests:
input_sequence_length = len(tokenizer.encode(prompt, add_special_tokens=False))

# Requests with non-text input modalities (e.g., images):
input_sequence_length = response.usage.prompt_tokens # server-reported total (text + image)
```

**Notes:**
- Tokenization uses `add_special_tokens=False` to count only content tokens, excluding special tokens added by the tokenizer.
- For text-only requests, tokenization uses `add_special_tokens=False` to count only content tokens.
- When non-text input modalities (e.g., images) are detected, ISL reflects the **total** prompt token count from the server's `usage.prompt_tokens`, regardless of `--use-server-token-count`. The text-only and image-only breakdowns are available via the `text_input_token_count` and `image_input_token_count` metrics.
- Useful for understanding the relationship between input size and latency/throughput.

---
@@ -438,6 +449,7 @@ total_token_throughput = (total_isl + total_osl) / benchmark_duration_seconds
**Notes:**
- Measures the combined input and output token processing rate.
- Includes reasoning tokens in the output count (via total_osl).
- When non-text input modalities (e.g., images) are present, `total_isl` includes image tokens, so Total Token Throughput reflects all modalities (text + image input, plus output).
- Useful for understanding total system token processing capacity.

---
@@ -479,6 +491,108 @@ image_latency = request_latency_ms / num_images

---

### Image Input Token Count

**Type:** [Record Metric](#record-metrics)

The number of image input tokens for a single request, derived by subtracting the client-computed text token count from the server-reported total prompt tokens.

**Formula:**
```python
image_input_token_count = usage.prompt_tokens - client_text_tokens
```

**Notes:**
- When non-text input modalities (e.g., images) are detected, input token counting automatically uses a hybrid approach (client-side text tokenization + server `usage.prompt_tokens`) regardless of `--use-server-token-count`. Output and reasoning token counts are unaffected.
- Requires the server to report `usage.prompt_tokens`.
- Clamped to 0 (with warning) if the client tokenizer overcounts text tokens relative to the server.
- Only populated when images are present; None otherwise.
- Exported to JSON/CSV only (not shown in console).
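
A minimal sketch of the derivation and clamping behavior described above (hypothetical helper name, not AIPerf's actual implementation):

```python
import logging

logger = logging.getLogger(__name__)


def derive_image_input_tokens(server_prompt_tokens: int, client_text_tokens: int) -> int:
    """Image tokens = server-reported total prompt tokens minus client-side text tokens."""
    image_tokens = server_prompt_tokens - client_text_tokens
    if image_tokens < 0:
        # Client tokenizer overcounted text relative to the server: clamp to 0 and warn.
        logger.warning(
            "Client text tokens (%d) exceed server prompt_tokens (%d); clamping image tokens to 0",
            client_text_tokens,
            server_prompt_tokens,
        )
        image_tokens = 0
    return image_tokens
```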

---

### Text Input Token Count

**Type:** [Record Metric](#record-metrics)

The number of text-only input tokens for a single request, computed by the client-side tokenizer. Only populated when non-text input modalities (e.g., images) are present in the request.

**Formula:**
```python
text_input_token_count = len(tokenizer.encode(text_prompt))
```

**Notes:**
- Exported to JSON/CSV only (not shown in console).

---

### Tokens Per Image

**Type:** [Record Metric](#record-metrics)

Image tokens divided by the number of images in a single request. When a request contains multiple images, the total image token count is a single aggregate (server total minus client text tokens), so this metric divides that aggregate by the number of images to approximate the per-image cost.

**Formula:**
```python
tokens_per_image = image_input_token_count / num_images
```

**Notes:**
- Useful for understanding how image resolution or content affects token consumption.
- Exported to JSON/CSV only (not shown in console).
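
For intuition, a worked example with made-up numbers:

```python
# Made-up numbers: a request carrying 2 images, where the server reported
# usage.prompt_tokens = 1500 and the client counted 300 text tokens.
image_input_token_count = 1500 - 300             # 1200 image tokens (aggregate over both images)
tokens_per_image = image_input_token_count / 2   # 600.0 tokens per image
```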

---

### Image Token Ratio

**Type:** [Record Metric](#record-metrics)

The ratio of image tokens to total input tokens for a single request. A value of 0.9 means 90% of the input tokens are from images.

**Formula:**
```python
image_token_ratio = image_input_token_count / input_sequence_length
```

**Notes:**
- Exported to JSON/CSV only (not shown in console).
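
Continuing the made-up numbers from the example above:

```python
# 1200 image tokens out of a 1500-token total input.
image_token_ratio = 1200 / 1500   # 0.8 -> 80% of the input tokens come from images
```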

---

### Total Image Input Tokens

**Type:** [Derived Metric](#derived-metrics)

The sum of all image input tokens across all requests in the benchmark.

**Formula:**
```python
total_image_input_tokens = sum(r.image_input_token_count for r in records if r.valid)
```

**Notes:**
- Exported to JSON/CSV only (not shown in console).

---

### Image Input Token Throughput

**Type:** [Derived Metric](#derived-metrics)

Image input tokens processed per second across the benchmark.

**Formula:**
```python
image_input_token_throughput = total_image_input_tokens / benchmark_duration_seconds
```

**Notes:**
- Measures how fast the server is ingesting image tokens.
- Useful for comparing image processing throughput across model configurations and image resolutions.

---

## Reasoning Metrics

> [!NOTE]
8 changes: 7 additions & 1 deletion src/aiperf/common/config/endpoint_config.py
@@ -223,7 +223,13 @@ def url(self) -> str:
"for computing metrics. Token count fields will be None if the server "
"does not provide usage information. For OpenAI-compatible streaming "
"endpoints (chat/completions), stream_options.include_usage is automatically "
"configured when this flag is enabled."
"configured when this flag is enabled. NOTE: When non-text input modalities "
"(e.g., images) are detected, input token counting is automatically "
"supplemented with client-side text tokenization to derive image token "
"counts. The server-reported usage.prompt_tokens provides the total input "
"token count (ISL), the client tokenizer computes the text-only count, "
"and the image token count is derived by subtraction. Output and reasoning "
"token counts continue to use server-reported values."
),
),
CLIParameter(
31 changes: 29 additions & 2 deletions src/aiperf/common/models/record_models.py
@@ -847,11 +847,38 @@ class ParsedResponse(AIPerfBaseModel):


class TokenCounts(AIPerfBaseModel):
"""Token counts for a record."""
"""Token counts for a record.

When non-text input modalities (e.g., images) are present, ``input`` reflects
the server-reported total prompt tokens (text + image) from
``usage.prompt_tokens``, ``text_input`` holds the client-tokenized text-only
count, and ``image_input`` holds the derived difference.

When no non-text modalities are present, ``text_input`` and ``image_input``
remain None and ``input`` is the client-tokenized text count.

The invariant ``text_input + image_input == input`` holds whenever all three
fields are non-None.
"""

input: int | None = Field(
default=None,
description="The number of tokens in the input. If None, the number of tokens could not be calculated.",
description="The total number of input tokens across all modalities. When non-text input "
"modalities (e.g., images) are present, this is the server-reported total from "
"usage.prompt_tokens. Otherwise, this equals the client-side tokenized text count. "
"If None, the number of tokens could not be calculated.",
)
text_input: int | None = Field(
default=None,
description="The number of text-only input tokens computed by the client-side tokenizer. "
"Only populated when non-text input modalities (e.g., images) are present. "
"If None, either no non-text modalities are present or text token count could not be calculated.",
)
image_input: int | None = Field(
default=None,
description="The number of image input tokens, derived as input minus text_input. "
"Only populated when images are present and both server usage and client tokenization are available. "
"If None, image token count could not be calculated.",
)
output: int | None = Field(
default=None,
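
To make the documented invariant concrete, a minimal illustrative usage sketch with made-up numbers (not part of this diff):

```python
# Made-up numbers: the server reported 1500 prompt tokens and the client
# tokenizer counted 300 text tokens for a request that also carried images.
counts = TokenCounts(input=1500, text_input=300, image_input=1200, output=64)

# Invariant from the docstring: text_input + image_input == input.
assert counts.text_input + counts.image_input == counts.input
```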