Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/8514.enhance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `service.instance.id` and `service.instance.name` OpenTelemetry resource attributes to enable per-instance log filtering in Loki/Grafana. The `service.instance.id` attribute is set to the service's unique UUID, which is generated afresh on each restart, so that individual process executions can be distinguished; this follows the OpenTelemetry semantic conventions for tracing process lifecycles.
3 changes: 2 additions & 1 deletion src/ai/backend/agent/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,11 +1515,12 @@ async def service_discovery_ctx(
if local_config.otel.enabled:
meta = sd_loop.metadata
otel_spec = OpenTelemetrySpec(
service_id=meta.id,
service_name=meta.service_group,
service_version=meta.version,
log_level=local_config.otel.log_level,
endpoint=local_config.otel.endpoint,
service_instance_id=meta.id,
service_instance_name=meta.display_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
try:
Expand Down
3 changes: 2 additions & 1 deletion src/ai/backend/appproxy/coordinator/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,11 +760,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
if root_ctx.local_config.otel.enabled:
meta = sd_loop.metadata
otel_spec = OpenTelemetrySpec(
service_id=meta.id,
service_name=meta.service_group,
service_version=meta.version,
log_level=root_ctx.local_config.otel.log_level,
endpoint=root_ctx.local_config.otel.endpoint,
service_instance_id=meta.id,
service_instance_name=meta.display_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
try:
Expand Down
3 changes: 2 additions & 1 deletion src/ai/backend/appproxy/worker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,11 +554,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
if root_ctx.local_config.otel.enabled:
meta = sd_loop.metadata
otel_spec = OpenTelemetrySpec(
service_id=meta.id,
service_name=meta.service_group,
service_version=meta.version,
log_level=root_ctx.local_config.otel.log_level,
endpoint=root_ctx.local_config.otel.endpoint,
service_instance_id=meta.id,
service_instance_name=meta.display_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
try:
Expand Down
11 changes: 7 additions & 4 deletions src/ai/backend/logging/otel.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,21 @@

@dataclass
class OpenTelemetrySpec:
service_id: uuid.UUID
service_name: str
service_version: str
log_level: str
endpoint: str
service_instance_id: uuid.UUID
service_instance_name: str

def to_resource(self) -> Resource:
return Resource.create({
attributes = {
"service.name": self.service_name,
"service.id": str(self.service_id),
"service.version": self.service_version,
})
"service.instance.id": str(self.service_instance_id),
"service.instance.name": self.service_instance_name,
}
return Resource.create(attributes)


def apply_otel_loggers(loggers: Iterable[logging.Logger], spec: OpenTelemetrySpec) -> None:
Expand Down
3 changes: 2 additions & 1 deletion src/ai/backend/manager/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,11 +817,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]:
if root_ctx.config_provider.config.otel.enabled:
meta = root_ctx.sd_loop.metadata
otel_spec = OpenTelemetrySpec(
service_id=meta.id,
service_name=meta.service_group,
service_version=meta.version,
log_level=root_ctx.config_provider.config.otel.log_level,
endpoint=root_ctx.config_provider.config.otel.endpoint,
service_instance_id=meta.id,
service_instance_name=meta.display_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
try:
Expand Down
3 changes: 2 additions & 1 deletion src/ai/backend/storage/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,11 +537,12 @@ async def service_discovery_ctx(
if local_config.otel.enabled:
meta = sd_loop.metadata
otel_spec = OpenTelemetrySpec(
service_id=meta.id,
service_name=meta.service_group,
service_version=meta.version,
log_level=local_config.otel.log_level,
endpoint=local_config.otel.endpoint,
service_instance_id=meta.id,
service_instance_name=meta.display_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
try:
Expand Down
6 changes: 4 additions & 2 deletions src/ai/backend/web/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys
import time
import traceback
import uuid
from collections.abc import (
AsyncGenerator,
AsyncIterator,
Expand All @@ -25,7 +26,6 @@
from pathlib import Path
from pprint import pprint
from typing import Any, cast
from uuid import uuid4

import aiohttp
import aiohttp_cors
Expand Down Expand Up @@ -909,12 +909,14 @@ async def on_prepare(_request: web.Request, response: web.StreamResponse) -> Non
@asynccontextmanager
async def service_discovery_ctx(config: WebServerUnifiedConfig) -> AsyncGenerator[None]:
if config.otel.enabled:
instance_name = f"webserver-{socket.gethostname()}"
otel_spec = OpenTelemetrySpec(
service_id=uuid4(),
service_name="webserver",
service_version=__version__,
log_level=config.otel.log_level,
endpoint=config.otel.endpoint,
service_instance_id=uuid.uuid4(),
service_instance_name=instance_name,
)
BraceStyleAdapter.apply_otel(otel_spec)
yield
Expand Down
Loading