Skip to content

Commit 5118f79

Browse files
eric-tramel and claude committed
fix: normalize rollout timestamps before deriving started_at/ended_at
Claude Code, Codex, and ATIF ingesters called min()/max() on raw timestamp strings. That assumes lexicographic order matches chronological order, which breaks for ISO 8601 timestamps with mixed UTC offsets or precisions (e.g. 2025-01-01T00:30:00+01:00 is earlier than 2025-01-01T00:00:00Z but sorts later as a string). Introduce min_max_timestamps() in the shared rollout utils that parses each value as ISO 8601 (naive values treated as UTC, unparseable values skipped) and picks the chronological extremes, returning them in their original string form. Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]> Signed-off-by: Eric W. Tramel <[email protected]>
1 parent cebfb0e commit 5118f79

5 files changed

Lines changed: 88 additions & 7 deletions

File tree

packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/atif.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from data_designer.engine.resources.agent_rollout.utils import (
1616
build_message,
1717
coerce_optional_str,
18+
min_max_timestamps,
1819
require_string,
1920
stringify_json_value,
2021
)
@@ -157,6 +158,7 @@ def parse_file(
157158
project_path = coerce_optional_str(agent_extra.get("project_path")) or cwd
158159
git_branch = coerce_optional_str(agent_extra.get("git_branch"))
159160

161+
started_at, ended_at = min_max_timestamps(timestamps)
160162
return [
161163
NormalizedAgentRolloutRecord(
162164
trace_id=session_id,
@@ -168,8 +170,8 @@ def parse_file(
168170
cwd=cwd,
169171
project_path=project_path,
170172
git_branch=git_branch,
171-
started_at=min(timestamps) if timestamps else None,
172-
ended_at=max(timestamps) if timestamps else None,
173+
started_at=started_at,
174+
ended_at=ended_at,
173175
messages=messages,
174176
source_meta=source_meta,
175177
)

packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/claude_code.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
build_message,
1717
coerce_optional_str,
1818
load_jsonl_rows,
19+
min_max_timestamps,
1920
require_string,
2021
stringify_json_value,
2122
stringify_text_value,
@@ -86,6 +87,7 @@ def parse_file(
8687
elif record_type == "user":
8788
messages.extend(normalize_claude_user_messages(raw_record))
8889

90+
started_at, ended_at = min_max_timestamps(timestamps)
8991
session_key = session_id or file_path.stem
9092
index_entry = session_index.get(session_key, {})
9193
project_path = coerce_optional_str(index_entry.get("projectPath")) or cwd
@@ -112,8 +114,8 @@ def parse_file(
112114
cwd=cwd,
113115
project_path=project_path,
114116
git_branch=git_branch,
115-
started_at=min(timestamps) if timestamps else None,
116-
ended_at=max(timestamps) if timestamps else None,
117+
started_at=started_at,
118+
ended_at=ended_at,
117119
messages=messages,
118120
source_meta=source_meta,
119121
)

packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/codex.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
build_message,
1515
coerce_optional_str,
1616
load_jsonl_rows,
17+
min_max_timestamps,
1718
require_string,
1819
stringify_json_value,
1920
stringify_text_value,
@@ -143,6 +144,7 @@ def parse_file(
143144
if pending_reasoning:
144145
source_meta["unattached_reasoning"] = list(pending_reasoning)
145146

147+
earliest, latest = min_max_timestamps(timestamps)
146148
return [
147149
NormalizedAgentRolloutRecord(
148150
trace_id=session_id,
@@ -154,9 +156,8 @@ def parse_file(
154156
cwd=coerce_optional_str(session_meta.get("cwd")),
155157
project_path=coerce_optional_str(session_meta.get("cwd")),
156158
git_branch=coerce_optional_str(session_meta.get("git_branch")),
157-
started_at=coerce_optional_str(session_meta.get("timestamp"))
158-
or (min(timestamps) if timestamps else None),
159-
ended_at=max(timestamps) if timestamps else None,
159+
started_at=coerce_optional_str(session_meta.get("timestamp")) or earliest,
160+
ended_at=latest,
160161
messages=messages,
161162
source_meta=source_meta,
162163
)

packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import json
77
from collections.abc import Iterator
8+
from datetime import datetime, timezone
89
from pathlib import Path
910
from typing import Any, Literal
1011

@@ -110,3 +111,39 @@ def stringify_text_value(value: Any) -> str:
110111
if isinstance(value, str):
111112
return value
112113
return str(value)
114+
115+
116+
def min_max_timestamps(timestamps: list[str]) -> tuple[str | None, str | None]:
    """Pick the earliest and latest entries of ``timestamps`` by actual instant.

    Each entry is run through ``parse_iso8601`` and the resulting datetimes are
    compared, so ordering follows chronology rather than string order (for
    example ``2025-01-01T00:30:00+01:00`` precedes ``2025-01-01T00:00:00Z``).
    Entries for which ``parse_iso8601`` returns ``None`` are ignored.

    Returns:
        ``(earliest, latest)`` as the original, unmodified strings, or
        ``(None, None)`` when no entry could be parsed. When several entries
        denote the same instant, the one that appears first wins.
    """
    first: tuple[datetime, str] | None = None
    last: tuple[datetime, str] | None = None
    for raw in timestamps:
        moment = parse_iso8601(raw)
        if moment is None:
            continue
        # Strict comparisons keep the first-seen entry on ties.
        if first is None or moment < first[0]:
            first = (moment, raw)
        if last is None or moment > last[0]:
            last = (moment, raw)
    if first is None or last is None:
        return None, None
    return first[1], last[1]
135+
136+
137+
def parse_iso8601(value: str) -> datetime | None:
    """Parse an ISO 8601 timestamp, treating naive values as UTC.

    Surrounding whitespace is ignored. A trailing UTC designator (``Z`` or
    ``z``) is rewritten to ``+00:00`` before parsing, because
    ``datetime.fromisoformat`` only understands that suffix natively on
    Python 3.11+. Only the final character is rewritten; a ``Z`` anywhere else
    in the string is passed to the parser untouched.

    Args:
        value: Candidate timestamp. Expected to be a string; any other type is
            treated the same as an unparseable string.

    Returns:
        A timezone-aware ``datetime`` (naive inputs get ``timezone.utc``
        attached), or ``None`` when the value cannot be parsed so callers can
        silently skip malformed entries.
    """
    # Rollout data comes from loosely-typed JSON; a null or numeric timestamp
    # should be skipped like any other malformed entry instead of raising.
    if not isinstance(value, str):
        return None
    text = value.strip()
    if text[-1:] in ("Z", "z"):
        text = f"{text[:-1]}+00:00"
    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from __future__ import annotations
5+
6+
import pytest
7+
8+
from data_designer.engine.resources.agent_rollout.utils import min_max_timestamps
9+
10+
11+
@pytest.mark.parametrize(
    ("timestamps", "expected"),
    [
        # No input at all.
        ([], (None, None)),
        # String order and chronological order disagree across UTC offsets.
        (
            ["2025-01-01T00:30:00+01:00", "2025-01-01T00:00:00Z"],
            ("2025-01-01T00:30:00+01:00", "2025-01-01T00:00:00Z"),
        ),
        # Fractional seconds mixed with whole seconds.
        (
            ["2025-01-01T00:00:00.500Z", "2025-01-01T00:00:00Z"],
            ("2025-01-01T00:00:00Z", "2025-01-01T00:00:00.500Z"),
        ),
        # A naive value must be comparable with an aware one.
        (
            ["2025-01-01T00:00:00", "2025-01-02T00:00:00Z"],
            ("2025-01-01T00:00:00", "2025-01-02T00:00:00Z"),
        ),
        # Garbage entries are dropped; the lone valid entry is both extremes.
        (
            ["not-a-timestamp", "2025-01-01T00:00:00Z"],
            ("2025-01-01T00:00:00Z", "2025-01-01T00:00:00Z"),
        ),
        # Nothing parseable at all.
        (["not-a-timestamp"], (None, None)),
    ],
    ids=[
        "empty",
        "mixed-offset-lex-disagrees-with-chrono",
        "mixed-precision",
        "naive-treated-as-utc-and-compared-against-aware",
        "unparseable-values-skipped",
        "only-unparseable",
    ],
)
def test_min_max_timestamps(timestamps: list[str], expected: tuple[str | None, str | None]) -> None:
    """Check that ``min_max_timestamps`` returns the expected (earliest, latest) pair."""
    outcome = min_max_timestamps(timestamps)
    assert outcome == expected

0 commit comments

Comments
 (0)