diff --git a/changes/8514.enhance.md b/changes/8514.enhance.md new file mode 100644 index 00000000000..0d53d136f70 --- /dev/null +++ b/changes/8514.enhance.md @@ -0,0 +1 @@ +Add `service.instance.id` and `service.instance.name` OpenTelemetry resource attributes to enable per-instance log filtering in Loki/Grafana. The `service.instance.id` uses the service's unique UUID (generated fresh on each restart) to distinguish individual process executions, following OpenTelemetry semantic conventions for tracing process lifecycles. diff --git a/src/ai/backend/agent/server.py b/src/ai/backend/agent/server.py index 5e51312fee7..86ec46bfc41 100644 --- a/src/ai/backend/agent/server.py +++ b/src/ai/backend/agent/server.py @@ -1515,11 +1515,12 @@ async def service_discovery_ctx( if local_config.otel.enabled: meta = sd_loop.metadata otel_spec = OpenTelemetrySpec( - service_id=meta.id, service_name=meta.service_group, service_version=meta.version, log_level=local_config.otel.log_level, endpoint=local_config.otel.endpoint, + service_instance_id=meta.id, + service_instance_name=meta.display_name, ) BraceStyleAdapter.apply_otel(otel_spec) try: diff --git a/src/ai/backend/appproxy/coordinator/server.py b/src/ai/backend/appproxy/coordinator/server.py index 8891bc81366..6dd3c894f99 100644 --- a/src/ai/backend/appproxy/coordinator/server.py +++ b/src/ai/backend/appproxy/coordinator/server.py @@ -760,11 +760,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]: if root_ctx.local_config.otel.enabled: meta = sd_loop.metadata otel_spec = OpenTelemetrySpec( - service_id=meta.id, service_name=meta.service_group, service_version=meta.version, log_level=root_ctx.local_config.otel.log_level, endpoint=root_ctx.local_config.otel.endpoint, + service_instance_id=meta.id, + service_instance_name=meta.display_name, ) BraceStyleAdapter.apply_otel(otel_spec) try: diff --git a/src/ai/backend/appproxy/worker/server.py b/src/ai/backend/appproxy/worker/server.py index cbc9af50b83..fa1577783d5 100644 --- a/src/ai/backend/appproxy/worker/server.py +++ b/src/ai/backend/appproxy/worker/server.py @@ -554,11 +554,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]: if root_ctx.local_config.otel.enabled: meta = sd_loop.metadata otel_spec = OpenTelemetrySpec( - service_id=meta.id, service_name=meta.service_group, service_version=meta.version, log_level=root_ctx.local_config.otel.log_level, endpoint=root_ctx.local_config.otel.endpoint, + service_instance_id=meta.id, + service_instance_name=meta.display_name, ) BraceStyleAdapter.apply_otel(otel_spec) try: diff --git a/src/ai/backend/logging/otel.py b/src/ai/backend/logging/otel.py index 6bff4da739b..ad6912b2541 100644 --- a/src/ai/backend/logging/otel.py +++ b/src/ai/backend/logging/otel.py @@ -18,18 +18,21 @@ @dataclass class OpenTelemetrySpec: - service_id: uuid.UUID service_name: str service_version: str log_level: str endpoint: str + service_instance_id: uuid.UUID + service_instance_name: str def to_resource(self) -> Resource: - return Resource.create({ + attributes = { "service.name": self.service_name, - "service.id": str(self.service_id), "service.version": self.service_version, - }) + "service.instance.id": str(self.service_instance_id), + "service.instance.name": self.service_instance_name, + } + return Resource.create(attributes) def apply_otel_loggers(loggers: Iterable[logging.Logger], spec: OpenTelemetrySpec) -> None: diff --git a/src/ai/backend/manager/server.py b/src/ai/backend/manager/server.py index aab7b692359..5f3f66fdaa3 100644 --- a/src/ai/backend/manager/server.py +++ b/src/ai/backend/manager/server.py @@ -817,11 +817,12 @@ async def service_discovery_ctx(root_ctx: RootContext) -> AsyncIterator[None]: if root_ctx.config_provider.config.otel.enabled: meta = root_ctx.sd_loop.metadata otel_spec = OpenTelemetrySpec( - service_id=meta.id, service_name=meta.service_group, service_version=meta.version, log_level=root_ctx.config_provider.config.otel.log_level, endpoint=root_ctx.config_provider.config.otel.endpoint, + service_instance_id=meta.id, + service_instance_name=meta.display_name, ) BraceStyleAdapter.apply_otel(otel_spec) try: diff --git a/src/ai/backend/storage/server.py b/src/ai/backend/storage/server.py index bf8e8f20992..67c3f9272b1 100644 --- a/src/ai/backend/storage/server.py +++ b/src/ai/backend/storage/server.py @@ -537,11 +537,12 @@ async def service_discovery_ctx( if local_config.otel.enabled: meta = sd_loop.metadata otel_spec = OpenTelemetrySpec( - service_id=meta.id, service_name=meta.service_group, service_version=meta.version, log_level=local_config.otel.log_level, endpoint=local_config.otel.endpoint, + service_instance_id=meta.id, + service_instance_name=meta.display_name, ) BraceStyleAdapter.apply_otel(otel_spec) try: diff --git a/src/ai/backend/web/server.py b/src/ai/backend/web/server.py index cf99f3b0c85..38600844354 100644 --- a/src/ai/backend/web/server.py +++ b/src/ai/backend/web/server.py @@ -11,6 +11,7 @@ import sys import time import traceback +import uuid from collections.abc import ( AsyncGenerator, AsyncIterator, @@ -25,7 +26,6 @@ from pathlib import Path from pprint import pprint from typing import Any, cast -from uuid import uuid4 import aiohttp import aiohttp_cors @@ -909,12 +909,14 @@ async def on_prepare(_request: web.Request, response: web.StreamResponse) -> Non @asynccontextmanager async def service_discovery_ctx(config: WebServerUnifiedConfig) -> AsyncGenerator[None]: if config.otel.enabled: + instance_name = f"webserver-{socket.gethostname()}" otel_spec = OpenTelemetrySpec( - service_id=uuid4(), service_name="webserver", service_version=__version__, log_level=config.otel.log_level, endpoint=config.otel.endpoint, + service_instance_id=uuid.uuid4(), + service_instance_name=instance_name, ) BraceStyleAdapter.apply_otel(otel_spec) yield