Skip to content

Commit 93ce90a

Browse files
authored
feat: add /metrics/:region endpoint (#1656)
1 parent 4588af5 commit 93ce90a

File tree

4 files changed

+85
-8
lines changed

4 files changed

+85
-8
lines changed

lib/realtime_web/controllers/metrics_controller.ex

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,36 @@ defmodule RealtimeWeb.MetricsController do
66

77
# We give more memory and time to collect metrics from all nodes as this is a lot of work
88
def index(conn, _) do
9-
{time, metrics} = :timer.tc(&cluster_metrics/0, :millisecond)
9+
{time, metrics} = :timer.tc(fn -> metrics([Node.self() | Node.list()]) end, :millisecond)
1010
Logger.info("Collected cluster metrics in #{time} milliseconds")
1111

1212
conn
1313
|> put_resp_content_type("text/plain")
1414
|> send_resp(200, metrics)
1515
end
1616

17-
defp cluster_metrics() do
17+
def region(conn, %{"region" => region}) do
18+
nodes = Realtime.Nodes.region_nodes(region)
19+
{time, metrics} = :timer.tc(fn -> metrics(nodes) end, :millisecond)
20+
Logger.info("Collected metrics for region #{region} in #{time} milliseconds")
21+
22+
conn
23+
|> put_resp_content_type("text/plain")
24+
|> send_resp(200, metrics)
25+
end
26+
27+
defp metrics(nodes) do
1828
bump_max_heap_size()
1929
timeout = Application.fetch_env!(:realtime, :metrics_rpc_timeout)
2030

21-
Node.list()
31+
nodes
2232
|> Task.async_stream(
2333
fn node ->
2434
{node, GenRpc.call(node, __MODULE__, :get_metrics, [], timeout: timeout)}
2535
end,
2636
timeout: :infinity
2737
)
28-
|> Enum.reduce([PromEx.get_metrics()], fn {_, {node, response}}, acc ->
38+
|> Enum.reduce([], fn {_, {node, response}}, acc ->
2939
case response do
3040
{:error, :rpc_error, reason} ->
3141
Logger.error("Cannot fetch metrics from the node #{inspect(node)} because #{inspect(reason)}")

lib/realtime_web/router.ex

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ defmodule RealtimeWeb.Router do
7676
pipe_through(:metrics)
7777

7878
get("/", MetricsController, :index)
79+
get("/:region", MetricsController, :region)
7980
end
8081

8182
scope "/api" do

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ defmodule Realtime.MixProject do
44
def project do
55
[
66
app: :realtime,
7-
version: "2.67.8",
7+
version: "2.68.0",
88
elixir: "~> 1.18",
99
elixirc_paths: elixirc_paths(Mix.env()),
1010
start_permanent: Mix.env() == :prod,

test/realtime_web/controllers/metrics_controller_test.exs

Lines changed: 69 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@ defmodule RealtimeWeb.MetricsControllerTest do
22
# Usage of Clustered
33
# Also changing Application env
44
use RealtimeWeb.ConnCase, async: false
5+
alias Realtime.GenRpc
56

67
import ExUnit.CaptureLog
8+
use Mimic
79

810
setup_all do
911
metrics_tags = %{
@@ -45,9 +47,13 @@ defmodule RealtimeWeb.MetricsControllerTest do
4547
end
4648

4749
test "returns 200 and log on timeout", %{conn: conn} do
48-
current_value = Application.get_env(:realtime, :metrics_rpc_timeout)
49-
on_exit(fn -> Application.put_env(:realtime, :metrics_rpc_timeout, current_value) end)
50-
Application.put_env(:realtime, :metrics_rpc_timeout, 0)
50+
Mimic.stub(GenRpc, :call, fn node, mod, func, args, opts ->
51+
if node != node() do
52+
{:error, :rpc_error, :timeout}
53+
else
54+
call_original(GenRpc, :call, [node, mod, func, args, opts])
55+
end
56+
end)
5157

5258
log =
5359
capture_log(fn ->
@@ -84,4 +90,64 @@ defmodule RealtimeWeb.MetricsControllerTest do
8490
|> response(403)
8591
end
8692
end
93+
94+
describe "GET /metrics/:region" do
95+
setup %{conn: conn} do
96+
# The metrics pipeline requires authentication
97+
jwt_secret = Application.fetch_env!(:realtime, :metrics_jwt_secret)
98+
token = generate_jwt_token(jwt_secret, %{})
99+
authenticated_conn = put_req_header(conn, "authorization", "Bearer #{token}")
100+
101+
{:ok, conn: authenticated_conn}
102+
end
103+
104+
test "returns 200", %{conn: conn} do
105+
assert response =
106+
conn
107+
|> get(~p"/metrics/ap-southeast-2")
108+
|> text_response(200)
109+
110+
# Check prometheus like metrics
111+
assert response =~
112+
"# HELP beam_system_schedulers_online_info The number of scheduler threads that are online."
113+
114+
assert response =~ "region=\"ap-southeast-2\""
115+
refute response =~ "region=\"us-east-1\""
116+
end
117+
118+
test "returns 200 and log on timeout", %{conn: conn} do
119+
Mimic.stub(GenRpc, :call, fn _node, _mod, _func, _args, _opts ->
120+
{:error, :rpc_error, :timeout}
121+
end)
122+
123+
log =
124+
capture_log(fn ->
125+
assert response =
126+
conn
127+
|> get(~p"/metrics/ap-southeast-2")
128+
|> text_response(200)
129+
130+
assert response == ""
131+
end)
132+
133+
assert log =~ "Cannot fetch metrics from the node"
134+
end
135+
136+
test "returns 403 when authorization header is missing", %{conn: conn} do
137+
assert conn
138+
|> delete_req_header("authorization")
139+
|> get(~p"/metrics/ap-southeast-2")
140+
|> response(403)
141+
end
142+
143+
test "returns 403 when authorization header is wrong", %{conn: conn} do
144+
token = generate_jwt_token("bad_secret", %{})
145+
146+
assert _ =
147+
conn
148+
|> put_req_header("authorization", "Bearer #{token}")
149+
|> get(~p"/metrics/ap-southeast-2")
150+
|> response(403)
151+
end
152+
end
87153
end

0 commit comments

Comments
 (0)