Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,53 @@ internal class KronosTimeProviderTest {
assertThat(result).isCloseTo(now, Offset.offset(TEST_OFFSET))
}

// region Reproduction tests for RUMS-5093: Incorrect Timing and Ordering of Traces

@Test
fun `REPRO RUMS-5093 - negative offset is returned when device clock is ahead of server`() {
// Given
// Simulate device clock being 5 seconds AHEAD of server: Kronos returns server time
// that is 5 seconds BEHIND the current device time.
// This is the root cause of RUMS-5093: when the device clock is ahead, serverOffset
// is a large negative value. When applied at serialization time in
// CoreTracerSpanToSpanEventMapper.map(), it shifts the span start to BEFORE the actual request.
val deviceClockAheadMs = 5_000L // device is 5 seconds ahead
val fakeServerTimeMs = System.currentTimeMillis() - deviceClockAheadMs
whenever(mockClock.getCurrentTimeMs()) doReturn fakeServerTimeMs

// When
val offsetMs = testedTimeProvider.getServerOffsetMillis()

// Then: offset is negative (server is behind device).
// This negative offset, when applied at serialization time to a span's startTime,
// shifts the absolute start timestamp EARLIER than the actual span start.
// FAILS: we assert that the provider guards against negative offsets being applied
// to serialized span timestamps (e.g., by clamping to 0 or flagging for caller).
// Currently, getServerOffsetMillis() returns -5000L with NO guard or warning.
assertThat(offsetMs)
.describedAs(
"RUMS-5093 root cause: device is 5s ahead, so serverOffset is ~-5000ms. " +
"Confirmed: offset=%d ms",
offsetMs
)
.isLessThan(0L)

// The SDK should protect callers from applying large negative offsets to span timestamps.
// FAILS: there is no guard — getServerOffsetMillis() returns the raw negative delta.
assertThat(offsetMs)
.describedAs(
"RUMS-5093: getServerOffsetMillis() must not return a value that would shift " +
"a span start timestamp earlier than the actual request time. " +
"A large negative offset of %d ms indicates device clock is ahead of NTP server. " +
"The SDK has no protection: span serialization blindly applies this offset, " +
"producing incorrect absolute start timestamps.",
offsetMs
)
.isGreaterThanOrEqualTo(0L)
}

// endregion

companion object {
const val TEST_OFFSET = 10L
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

package com.datadog.trace.core.propagation

import com.datadog.trace.api.DDSpanId
import com.datadog.trace.api.DDTraceId
import com.datadog.trace.bootstrap.instrumentation.api.AgentPropagation
import com.datadog.trace.core.DDSpanContext
Expand Down Expand Up @@ -107,6 +108,46 @@ internal class DatadogHttpCodecTest {
}
}

// region Reproduction tests for RUMS-5093: Incorrect Timing and Ordering of Traces

@Test
fun `REPRO RUMS-5093 - M inject spanId as x-datadog-parent-id W inject() { propagation correct }`(forge: Forge) {
// Given
// Verify that the x-datadog-parent-id header is set to the android.request span's OWN spanId.
// This confirms the trace tree IDs (parent-child relationships) are structurally correct.
// The visual waterfall misrendering in the Datadog UI (RUMS-5093) is caused by incorrect
// absolute start timestamps from NTP clock drift, NOT by wrong parent_id propagation.
val fakeSpanId = forge.aLong(min = 1L)
whenever(mockContext.spanId).thenReturn(fakeSpanId)
whenever(mockContext.tags).thenReturn(emptyMap())

// When
testedInjector.inject(mockContext, mockCarrier, mockSetter)

// Then: x-datadog-parent-id must equal the android.request span's own spanId.
// This test PASSES confirming that the context propagation header is correct.
// The RUMS-5093 bug (broken trace ordering in the UI) is NOT caused by wrong parent_id.
// Root cause is the incorrect absolute start timestamp applied in CoreTracerSpanToSpanEventMapper.
argumentCaptor<String> {
verify(mockSetter).set(
eq(mockCarrier),
eq(DatadogHttpCodec.SPAN_ID_KEY),
capture()
)
val expectedParentId = DDSpanId.toString(fakeSpanId)
assertThat(firstValue)
.describedAs(
"RUMS-5093 context propagation: x-datadog-parent-id must equal the span's own spanId. " +
"Expected: $expectedParentId but was: $firstValue. " +
"If this fails, the broken trace ordering has TWO root causes; " +
"if it passes, the ordering bug is solely due to incorrect NTP-adjusted timestamps."
)
.isEqualTo(expectedParentId)
}
}

// endregion

companion object {
@Suppress("unused")
@JvmStatic
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
package com.datadog.android.trace.internal.domain.event

import com.datadog.android.api.context.DatadogContext
import com.datadog.android.api.context.TimeInfo
import com.datadog.android.log.LogAttributes
import com.datadog.android.trace.assertj.SpanEventAssert.Companion.assertThat
import com.datadog.android.utils.forge.Configurator
Expand All @@ -28,6 +29,7 @@ import org.mockito.junit.jupiter.MockitoExtension
import org.mockito.junit.jupiter.MockitoSettings
import org.mockito.kotlin.whenever
import org.mockito.quality.Strictness
import org.assertj.core.api.Assertions.assertThat as assertThatJ

@Extensions(
ExtendWith(MockitoExtension::class),
Expand All @@ -53,9 +55,7 @@ internal class CoreTracerSpanToSpanEventMapperTest {
}

@Test
fun `M map a DdSpan to a SpanEvent W map()`(
@Forgery fakeSpan: DDSpan
) {
fun `M map a DdSpan to a SpanEvent W map()`(@Forgery fakeSpan: DDSpan) {
// Given
val expectedMeta = fakeSpan.baggage + fakeSpan.tags.map {
it.key to it.value.toString()
Expand Down Expand Up @@ -160,9 +160,7 @@ internal class CoreTracerSpanToSpanEventMapperTest {
}

@Test
fun `M mark the SpanEvent as top span W map() { parentId is 0 }`(
@Forgery fakeSpan: DDSpan
) {
fun `M mark the SpanEvent as top span W map() { parentId is 0 }`(@Forgery fakeSpan: DDSpan) {
// Given
whenever(fakeSpan.parentId).thenReturn(0L)

Expand All @@ -188,14 +186,106 @@ internal class CoreTracerSpanToSpanEventMapperTest {
assertThat(event).isNotTopSpan()
}

// region Reproduction tests for RUMS-5093: Incorrect Timing and Ordering of Traces

@Test
fun `REPRO RUMS-5093 - M not shift span start earlier W map() { device clock ahead of server by 5 seconds }`(
@Forgery fakeSpan: DDSpan
) {
// Given
// Simulate device clock being 5 seconds AHEAD of server: serverOffset is negative.
// When serialization applies this negative offset, the span start is shifted to BEFORE
// the actual request started, making the android.request span appear shorter than backend
// child spans whose clocks are server-authoritative.
val deviceClockAheadOffsetNs = -5_000_000_000L // device is 5s ahead of server
val fakeSpanStartTime = 1_700_000_000_000_000_000L // arbitrary span start in nanos
val fakeSpanDuration = 2_000_000_000L // 2 second span duration
whenever(fakeSpan.startTime).thenReturn(fakeSpanStartTime)
whenever(fakeSpan.durationNano).thenReturn(fakeSpanDuration)

val timeInfo = TimeInfo(
deviceTimeNs = fakeSpanStartTime,
serverTimeNs = fakeSpanStartTime + deviceClockAheadOffsetNs,
serverTimeOffsetNs = deviceClockAheadOffsetNs,
serverTimeOffsetMs = -5_000L
)
val contextWithNegativeOffset = fakeDatadogContext.copy(time = timeInfo)

// When
val event = testedMapper.map(contextWithNegativeOffset, fakeSpan)

// Then: the serialized span start should NOT be before the actual span start on device.
// A negative serverOffset shifts start earlier, causing the span to appear to start
// before the actual HTTP request was initiated. The fix should capture the NTP offset
// at span creation time (not serialization time) or clamp negative offsets.
// FAILS because: event.start = fakeSpanStartTime + (-5s) = fakeSpanStartTime - 5s
// which is EARLIER than fakeSpanStartTime, making the android.request span
// appear to have negligible duration relative to server-authoritative backend spans.
assertThatJ(event.start)
.describedAs(
"RUMS-5093: Span start must not be shifted earlier than the actual span start " +
"when device clock is ahead of server. " +
"Current bug: start=${event.start} is before spanStartTime=$fakeSpanStartTime " +
"because serverOffset=$deviceClockAheadOffsetNs is applied at serialization time."
)
.isGreaterThanOrEqualTo(fakeSpanStartTime)
}

@Test
fun `REPRO RUMS-5093 - M preserve span duration W map() { device clock ahead of server by 5 seconds }`(
@Forgery fakeSpan: DDSpan
) {
// Given
// Even with negative server offset, the duration field (derived from monotonic nanoTime)
// should remain correct. This test documents that the duration is correct (computed from
// monotonic clock) while the start timestamp is wrong (shifted by serialization-time NTP offset).
val deviceClockAheadOffsetNs = -5_000_000_000L
val fakeSpanStartTime = 1_700_000_000_000_000_000L
val fakeSpanDuration = 2_000_000_000L // 2 seconds duration (monotonic, correct)
whenever(fakeSpan.startTime).thenReturn(fakeSpanStartTime)
whenever(fakeSpan.durationNano).thenReturn(fakeSpanDuration)

val timeInfo = TimeInfo(
deviceTimeNs = fakeSpanStartTime,
serverTimeNs = fakeSpanStartTime + deviceClockAheadOffsetNs,
serverTimeOffsetNs = deviceClockAheadOffsetNs,
serverTimeOffsetMs = -5_000L
)
val contextWithNegativeOffset = fakeDatadogContext.copy(time = timeInfo)

// When
val event = testedMapper.map(contextWithNegativeOffset, fakeSpan)

// Then: duration is correct (monotonic), but start is wrong.
// The backend child span starting at server-time T0+1s will have start > event.start,
// making the android.request span appear to have negligible duration vs. its children.
// This assertion documents the duration is preserved even though start is incorrectly shifted.
assertThatJ(event.duration)
.describedAs("Duration must equal the original span duration (monotonic, unaffected by NTP offset)")
.isEqualTo(fakeSpanDuration)

// This assertion shows the bug: start is offset to before actual span start.
// Together with correct duration, the span appears correct internally but wrong in the timeline.
val incorrectStart = fakeSpanStartTime + deviceClockAheadOffsetNs
assertThatJ(event.start)
.describedAs(
"RUMS-5093: BUG DOCUMENTED - start=${ event.start } is incorrectly shifted " +
"to $incorrectStart (5s before actual span start). " +
"A backend child span starting at server-time T0+1s has start > event.start, " +
"making the android.request span appear to have negligible duration."
)
// FAILS: we assert start should equal span start time (no NTP shift), but it's shifted
.isEqualTo(fakeSpanStartTime)
}

// endregion

// region Internal

private fun DDSpan.expectedMetrics(): Map<String, Number> {
return tags.filterValues { it is Number }.mapValues { it.value as Number }.toMutableMap().apply {
this[DDSpanContext.PRIORITY_SAMPLING_KEY] = spanSamplingPriority
}
private fun DDSpan.expectedMetrics(): Map<String, Number> = tags.filterValues {
it is Number
}.mapValues { it.value as Number }.toMutableMap().apply {
this[DDSpanContext.PRIORITY_SAMPLING_KEY] = spanSamplingPriority
}

// endregion
Expand Down
Loading