Skip to content

Commit 4166ff5

Browse files
jayasifacebook-github-bot
authored andcommitted
Prometheus support for telemetry (#1401)
Summary: Added a config option to allow sending metrics to Prometheus instead of Scuba. Reviewed By: pablorfb-meta Differential Revision: D83029998
1 parent 6e87ff0 commit 4166ff5

File tree

7 files changed

+811
-12
lines changed

7 files changed

+811
-12
lines changed

hyperactor_telemetry/Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# @generated by autocargo from //monarch/hyperactor_telemetry:hyperactor_telemetry
1+
# @generated by autocargo from //monarch/hyperactor_telemetry:[hyperactor_telemetry,prometheus_example]
22

33
[package]
44
name = "hyperactor_telemetry"
@@ -10,14 +10,20 @@ license = "BSD-3-Clause"
1010
[lib]
1111
edition = "2024"
1212

13+
[[bin]]
14+
name = "prometheus_example"
15+
path = "examples/prometheus_example.rs"
16+
1317
[dependencies]
1418
anyhow = "1.0.98"
1519
chrono = { version = "0.4.41", features = ["clock", "serde", "std"], default-features = false }
1620
dashmap = { version = "5.5.3", features = ["rayon", "serde"] }
1721
fbinit = { version = "0.2.0", git = "https://github.com/facebookexperimental/rust-shed.git", branch = "main" }
1822
hdrhistogram = "7.5"
23+
hyperactor = { version = "0.0.0", path = "../hyperactor" }
1924
lazy_static = "1.5"
2025
opentelemetry = "0.29"
26+
opentelemetry-otlp = { version = "0.29", features = ["http-proto", "logs", "metrics", "reqwest-blocking-client", "trace"], default-features = false }
2127
opentelemetry_sdk = { version = "0.29.0", features = ["rt-tokio"] }
2228
rand = { version = "0.8", features = ["small_rng"] }
2329
rusqlite = { version = "0.36.0", features = ["backup", "blob", "bundled", "column_decltype", "functions", "limits", "modern_sqlite", "serde_json"] }
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
//! Example showing how to use the Prometheus OTLP backend for telemetry.
10+
//!
11+
//! This example demonstrates the modern approach to Prometheus integration using
12+
//! OpenTelemetry's OTLP HTTP protocol to send metrics directly to Prometheus.
13+
//!
14+
//! ## Setup
15+
//!
16+
//! 1. Start Prometheus with OTLP receiver:
17+
//! ```bash
18+
//! prometheus --web.enable-otlp-receiver
19+
//! ```
20+
//!
21+
//! 2. Run this example:
22+
//! ```bash
23+
//! cd hyperactor_telemetry
24+
//! HYPERACTOR_OTEL_BACKEND=prometheus \
25+
//! OTEL_SERVICE_NAME=prometheus-example \
26+
//! OTEL_RESOURCE_ATTRIBUTES=environment=demo,version=1.0.0 \
27+
//! cargo run --example prometheus_example
28+
//! ```
29+
//!
30+
//! ## Query Examples
31+
//!
32+
//! After running, you can query Prometheus:
33+
//! - Rate of requests: `rate(http_requests_total[2m])`
34+
//! - With resource attributes: `rate(http_requests_total[2m]) * on (job, instance) group_left (environment) target_info`
35+
//! - P95 latency: `histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[2m]))`
36+
37+
use std::time::Duration;
38+
39+
use hyperactor::clock::Clock;
40+
use hyperactor::clock::RealClock;
41+
use hyperactor_telemetry::declare_static_counter;
42+
use hyperactor_telemetry::declare_static_gauge;
43+
use hyperactor_telemetry::declare_static_histogram;
44+
use hyperactor_telemetry::initialize_logging_for_test;
45+
use hyperactor_telemetry::kv_pairs;
46+
47+
// Declare some example metrics
48+
declare_static_counter!(REQUESTS_TOTAL, "http_requests_total");
49+
declare_static_histogram!(REQUEST_DURATION, "http_request_duration_seconds");
50+
declare_static_gauge!(ACTIVE_CONNECTIONS, "active_connections");
51+
52+
#[tokio::main]
53+
async fn main() -> Result<(), Box<dyn std::error::Error>> {
54+
// Configure environment if not already set
55+
// Safety: Setting environment variables at program startup before any threads are created
56+
unsafe {
57+
if std::env::var("HYPERACTOR_OTEL_BACKEND").is_err() {
58+
std::env::set_var("HYPERACTOR_OTEL_BACKEND", "prometheus");
59+
}
60+
61+
// Set default OpenTelemetry configuration for OTLP mode
62+
if std::env::var("OTEL_SERVICE_NAME").is_err() {
63+
std::env::set_var("OTEL_SERVICE_NAME", "prometheus-example");
64+
}
65+
if std::env::var("OTEL_RESOURCE_ATTRIBUTES").is_err() {
66+
std::env::set_var("OTEL_RESOURCE_ATTRIBUTES", "environment=demo,version=1.0.0");
67+
}
68+
if std::env::var("OTEL_METRIC_EXPORT_INTERVAL").is_err() {
69+
std::env::set_var("OTEL_METRIC_EXPORT_INTERVAL", "5000"); // 5 seconds for demo
70+
}
71+
}
72+
73+
// Initialize telemetry
74+
initialize_logging_for_test();
75+
76+
let endpoint = std::env::var("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
77+
.unwrap_or_else(|_| "http://localhost:9090/api/v1/otlp/v1/metrics".to_string());
78+
79+
println!("🚀 Starting Prometheus OTLP example...");
80+
println!("📡 Sending metrics directly to Prometheus via OTLP");
81+
println!("🔗 Endpoint: {}", endpoint);
82+
println!("ℹ️ Make sure Prometheus is running with --web.enable-otlp-receiver");
83+
println!("🎯 Query Prometheus at: http://localhost:9090");
84+
85+
println!("✅ OTLP exporter configured - metrics will be sent automatically!");
86+
87+
println!("Generating some sample metrics...");
88+
89+
// Generate some sample metrics using hyperactor telemetry macros
90+
for i in 0..100 {
91+
// Simulate HTTP requests
92+
let status = if i % 10 == 0 { "500" } else { "200" };
93+
let method = if i % 3 == 0 { "POST" } else { "GET" };
94+
95+
REQUESTS_TOTAL.add(
96+
1,
97+
kv_pairs!(
98+
"method" => method,
99+
"status" => status,
100+
"endpoint" => "/api/data"
101+
),
102+
);
103+
104+
// Simulate request durations
105+
let duration = 0.001 + (i as f64) * 0.001; // 1ms to 100ms
106+
REQUEST_DURATION.record(
107+
duration,
108+
kv_pairs!(
109+
"method" => method,
110+
"endpoint" => "/api/data"
111+
),
112+
);
113+
114+
// Simulate active connections
115+
let connections = 10.0 + (i as f64 % 20.0);
116+
ACTIVE_CONNECTIONS.record(
117+
connections,
118+
kv_pairs!(
119+
"server" => "primary"
120+
),
121+
);
122+
123+
// Small delay to spread metrics over time
124+
RealClock.sleep(Duration::from_millis(50)).await;
125+
126+
if i % 20 == 0 {
127+
println!("Generated {} metrics so far...", i + 1);
128+
}
129+
}
130+
131+
println!("✨ Finished generating metrics!");
132+
println!("🎯 Check Prometheus for your metrics:");
133+
println!(" - Prometheus UI: http://localhost:9090");
134+
println!(" - Example query: rate(http_requests_total[2m])");
135+
println!(
136+
" - Resource attributes query: rate(http_requests_total[2m]) * on (job, instance) group_left (environment) target_info"
137+
);
138+
println!("📡 Metrics are being sent to Prometheus via OTLP every 5 seconds");
139+
140+
// Keep generating metrics to show real-time updates
141+
println!("🔄 Continuing to generate metrics every 10 seconds...");
142+
for _ in 0..5 {
143+
// Generate 5 more batches then exit
144+
RealClock.sleep(Duration::from_secs(10)).await;
145+
146+
// Generate a few more metrics
147+
REQUESTS_TOTAL.add(
148+
1,
149+
kv_pairs!("method" => "GET", "status" => "200", "endpoint" => "/health"),
150+
);
151+
REQUEST_DURATION.record(0.001, kv_pairs!("method" => "GET", "endpoint" => "/health"));
152+
ACTIVE_CONNECTIONS.record(15.0, kv_pairs!("server" => "primary"));
153+
154+
println!("📊 Sent batch of metrics to Prometheus");
155+
}
156+
157+
println!("🎉 Example completed successfully!");
158+
Ok(())
159+
}

hyperactor_telemetry/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ pub mod in_memory_reader;
5555
mod meta;
5656
mod otel;
5757
mod pool;
58+
pub mod prometheus;
5859
pub mod recorder;
5960
mod spool;
6061
pub mod sqlite;

hyperactor_telemetry/src/otel.rs

Lines changed: 129 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,142 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
// Environment variable to select the OpenTelemetry backend
10+
const OTEL_BACKEND_ENV: &str = "HYPERACTOR_OTEL_BACKEND";
11+
12+
#[derive(Debug, Clone, PartialEq)]
13+
pub enum Backend {
14+
Scuba,
15+
Prometheus,
16+
None,
17+
}
18+
19+
impl Backend {
20+
fn from_env() -> Self {
21+
match std::env::var(OTEL_BACKEND_ENV).as_deref() {
22+
Ok("prometheus") => Backend::Prometheus,
23+
Ok("scuba") => Backend::Scuba,
24+
Ok("none") | Ok("") => Backend::None,
25+
_ => {
26+
// Default behavior: use scuba if fbcode_build is enabled, otherwise none
27+
#[cfg(fbcode_build)]
28+
return Backend::Scuba;
29+
#[cfg(not(fbcode_build))]
30+
return Backend::None;
31+
}
32+
}
33+
}
34+
}
35+
936
#[allow(dead_code)]
1037
pub fn tracing_layer<
1138
S: tracing::Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
12-
>() -> Option<impl tracing_subscriber::Layer<S>> {
13-
#[cfg(fbcode_build)]
14-
{
15-
Some(crate::meta::tracing_layer())
16-
}
17-
#[cfg(not(fbcode_build))]
18-
{
19-
None::<Box<dyn tracing_subscriber::Layer<S> + Send + Sync>>
39+
>() -> Option<Box<dyn tracing_subscriber::Layer<S> + Send + Sync>> {
40+
match Backend::from_env() {
41+
Backend::Scuba => {
42+
#[cfg(fbcode_build)]
43+
return Some(Box::new(crate::meta::tracing_layer()));
44+
#[cfg(not(fbcode_build))]
45+
None
46+
}
47+
Backend::Prometheus => {
48+
#[cfg(prometheus_build)]
49+
return Some(Box::new(crate::prometheus::tracing_layer()));
50+
#[cfg(not(prometheus_build))]
51+
None
52+
}
53+
Backend::None => None,
2054
}
2155
}
2256

2357
#[allow(dead_code)]
2458
pub fn init_metrics() {
25-
#[cfg(fbcode_build)]
26-
{
27-
opentelemetry::global::set_meter_provider(crate::meta::meter_provider());
59+
match Backend::from_env() {
60+
Backend::Scuba => {
61+
#[cfg(fbcode_build)]
62+
opentelemetry::global::set_meter_provider(crate::meta::meter_provider());
63+
}
64+
Backend::Prometheus => {
65+
if let Err(e) = crate::prometheus::initialize_prometheus_backend() {
66+
tracing::error!("Failed to initialize Prometheus backend: {}", e);
67+
}
68+
}
69+
Backend::None => {
70+
tracing::warn!("Metrics backend is set to None, no metrics will be collected");
71+
// Do nothing for None backend
72+
}
73+
}
74+
}
75+
76+
#[cfg(test)]
77+
mod tests {
78+
use super::*;
79+
80+
#[test]
81+
fn test_backend_from_env_defaults() {
82+
// Test default behavior when environment variable is not set
83+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
84+
unsafe {
85+
std::env::remove_var(OTEL_BACKEND_ENV);
86+
}
87+
88+
let backend = Backend::from_env();
89+
90+
#[cfg(fbcode_build)]
91+
assert_eq!(backend, Backend::Scuba);
92+
93+
#[cfg(not(fbcode_build))]
94+
assert_eq!(backend, Backend::None);
95+
}
96+
97+
#[test]
98+
fn test_backend_from_env_prometheus() {
99+
// Test that prometheus backend is selected when env var is set to "prometheus"
100+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
101+
unsafe {
102+
std::env::set_var(OTEL_BACKEND_ENV, "prometheus");
103+
}
104+
105+
let backend = Backend::from_env();
106+
assert_eq!(backend, Backend::Prometheus);
107+
108+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
109+
unsafe {
110+
std::env::remove_var(OTEL_BACKEND_ENV);
111+
}
112+
}
113+
114+
#[test]
115+
fn test_backend_from_env_scuba() {
116+
// Test that scuba backend is selected when env var is set to "scuba"
117+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
118+
unsafe {
119+
std::env::set_var(OTEL_BACKEND_ENV, "scuba");
120+
}
121+
122+
let backend = Backend::from_env();
123+
assert_eq!(backend, Backend::Scuba);
124+
125+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
126+
unsafe {
127+
std::env::remove_var(OTEL_BACKEND_ENV);
128+
}
129+
}
130+
131+
#[test]
132+
fn test_backend_from_env_empty_string() {
133+
// Test that none backend is selected when env var is set to empty string
134+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
135+
unsafe {
136+
std::env::set_var(OTEL_BACKEND_ENV, "");
137+
}
138+
139+
let backend = Backend::from_env();
140+
assert_eq!(backend, Backend::None);
141+
142+
// SAFETY: Setting environment variables in tests is generally safe as long as tests are run serially.
143+
unsafe {
144+
std::env::remove_var(OTEL_BACKEND_ENV);
145+
}
28146
}
29147
}

0 commit comments

Comments
 (0)