Skip to content

Commit d31c105

Browse files
authored
Custom metric buckets (#844)
1 parent dd37e29 commit d31c105

File tree

17 files changed

+234
-188
lines changed

17 files changed

+234
-188
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@ license-file = "LICENSE.txt"
99
[workspace.dependencies]
1010
derive_builder = "0.20"
1111
derive_more = { version = "1.0", features = ["constructor", "display", "from", "into", "debug"] }
12+
thiserror = "2"
1213
tonic = "0.12"
1314
tonic-build = "0.12"
14-
opentelemetry = { version = "0.24", features = ["metrics"] }
15+
opentelemetry = { version = "0.26", features = ["metrics"] }
1516
prost = "0.13"
1617
prost-types = "0.13"
1718

client/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ opentelemetry = { workspace = true, features = ["metrics"], optional = true }
3030
parking_lot = "0.12"
3131
prost-types = { workspace = true }
3232
slotmap = "1.0"
33-
thiserror = "1.0"
33+
thiserror = { workspace = true }
3434
tokio = "1.1"
3535
tonic = { workspace = true, features = ["tls", "tls-roots"] }
3636
tower = { version = "0.5", features = ["util"] }

client/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ pub use crate::{
1818
proxy::HttpConnectProxyOptions,
1919
retry::{CallType, RetryClient, RETRYABLE_ERROR_CODES},
2020
};
21+
pub use metrics::{LONG_REQUEST_LATENCY_HISTOGRAM_NAME, REQUEST_LATENCY_HISTOGRAM_NAME};
2122
pub use raw::{CloudService, HealthService, OperatorService, TestService, WorkflowService};
2223
pub use temporal_sdk_core_protos::temporal::api::{
2324
enums::v1::ArchivalState,
@@ -42,13 +43,12 @@ use crate::{
4243
use backoff::{exponential, ExponentialBackoff, SystemClock};
4344
use http::{uri::InvalidUri, Uri};
4445
use parking_lot::RwLock;
45-
use std::sync::OnceLock;
4646
use std::{
4747
collections::HashMap,
4848
fmt::{Debug, Formatter},
4949
ops::{Deref, DerefMut},
5050
str::FromStr,
51-
sync::Arc,
51+
sync::{Arc, OnceLock},
5252
time::{Duration, Instant},
5353
};
5454
use temporal_sdk_core_api::telemetry::metrics::TemporalMeter;

client/src/metrics.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ use temporal_sdk_core_api::telemetry::metrics::{
1212
use tonic::{body::BoxBody, transport::Channel, Code};
1313
use tower::Service;
1414

15+
/// The string name (which may be prefixed) for this metric
16+
pub static REQUEST_LATENCY_HISTOGRAM_NAME: &str = "request_latency";
17+
/// The string name (which may be prefixed) for this metric
18+
pub static LONG_REQUEST_LATENCY_HISTOGRAM_NAME: &str = "long_request_latency";
19+
1520
/// Used to track context associated with metrics, and record/update them
1621
// Possible improvement: make generic over some type tag so that methods are only exposed if the
1722
// appropriate k/vs have already been set.
@@ -58,12 +63,12 @@ impl MetricsContext {
5863
unit: "".into(),
5964
}),
6065
svc_request_latency: meter.histogram_duration(MetricParameters {
61-
name: "request_latency".into(),
66+
name: REQUEST_LATENCY_HISTOGRAM_NAME.into(),
6267
unit: "duration".into(),
6368
description: "Histogram of client request latencies".into(),
6469
}),
6570
long_svc_request_latency: meter.histogram_duration(MetricParameters {
66-
name: "long_request_latency".into(),
71+
name: LONG_REQUEST_LATENCY_HISTOGRAM_NAME.into(),
6772
unit: "duration".into(),
6873
description: "Histogram of client long-poll request latencies".into(),
6974
}),

core-api/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ opentelemetry = { workspace = true, optional = true }
2323
prost = { workspace = true }
2424
prost-types = { workspace = true }
2525
serde_json = "1.0"
26-
thiserror = "1.0"
26+
thiserror = { workspace = true }
2727
tonic = { workspace = true }
2828
tracing-core = "0.1"
2929
url = "2.3"

core-api/src/telemetry.rs

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ pub struct OtelCollectorOptions {
6868
/// If set to true, use f64 seconds for durations instead of u64 milliseconds
6969
#[builder(default)]
7070
pub use_seconds_for_durations: bool,
71+
/// Overrides for histogram buckets. Units depend on the value of `use_seconds_for_durations`.
72+
#[builder(default)]
73+
pub histogram_bucket_overrides: HistogramBucketOverrides,
7174
}
7275

7376
/// Options for exporting metrics to Prometheus
@@ -78,15 +81,33 @@ pub struct PrometheusExporterOptions {
7881
#[builder(default)]
7982
pub global_tags: HashMap<String, String>,
8083
/// If set true, all counters will include a "_total" suffix
81-
#[builder(default = "false")]
84+
#[builder(default)]
8285
pub counters_total_suffix: bool,
8386
/// If set true, all histograms will include the unit in their name as a suffix.
8487
/// Ex: "_milliseconds".
85-
#[builder(default = "false")]
88+
#[builder(default)]
8689
pub unit_suffix: bool,
8790
/// If set to true, use f64 seconds for durations instead of u64 milliseconds
8891
#[builder(default)]
8992
pub use_seconds_for_durations: bool,
93+
/// Overrides for histogram buckets. Units depend on the value of `use_seconds_for_durations`.
94+
#[builder(default)]
95+
pub histogram_bucket_overrides: HistogramBucketOverrides,
96+
}
97+
98+
/// Allows overriding the buckets used by histogram metrics
99+
#[derive(Debug, Clone, Default)]
100+
pub struct HistogramBucketOverrides {
101+
/// Overrides where the key is the metric name and the value is the list of bucket boundaries.
102+
/// The metric name will apply regardless of name prefixing, if any. IE: the name acts like
103+
/// `*metric_name`.
104+
///
105+
/// The string names of core's built-in histogram metrics are publicly available on the
106+
/// `core::telemetry` module and the `client` crate.
107+
///
108+
/// See [here](https://docs.rs/opentelemetry_sdk/latest/opentelemetry_sdk/metrics/enum.Aggregation.html#variant.ExplicitBucketHistogram.field.boundaries)
109+
/// for the exact meaning of boundaries.
110+
pub overrides: HashMap<String, Vec<f64>>,
90111
}
91112

92113
/// Control where logs go
@@ -102,7 +123,8 @@ pub enum Logger {
102123
/// An [EnvFilter](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/struct.EnvFilter.html) filter string.
103124
filter: String,
104125
},
105-
// Push logs to Lang. Can used with temporal_sdk_core::telemetry::CoreLogBufferedConsumer to buffer.
126+
/// Push logs to Lang. Can be used with
127+
/// temporal_sdk_core::telemetry::log_export::CoreLogBufferedConsumer to buffer.
106128
Push {
107129
/// An [EnvFilter](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/struct.EnvFilter.html) filter string.
108130
filter: String,

core-api/src/telemetry/metrics.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,32 @@ mod otel_impls {
377377
}
378378
}
379379

380+
impl Gauge for metrics::Gauge<u64> {
381+
fn record(&self, value: u64, attributes: &MetricAttributes) {
382+
if let MetricAttributes::OTel { kvs } = attributes {
383+
self.record(value, kvs);
384+
} else {
385+
debug_assert!(
386+
false,
387+
"Must use OTel attributes with an OTel metric implementation"
388+
);
389+
}
390+
}
391+
}
392+
393+
impl GaugeF64 for metrics::Gauge<f64> {
394+
fn record(&self, value: f64, attributes: &MetricAttributes) {
395+
if let MetricAttributes::OTel { kvs } = attributes {
396+
self.record(value, kvs);
397+
} else {
398+
debug_assert!(
399+
false,
400+
"Must use OTel attributes with an OTel metric implementation"
401+
);
402+
}
403+
}
404+
}
405+
380406
impl Histogram for metrics::Histogram<u64> {
381407
fn record(&self, value: u64, attributes: &MetricAttributes) {
382408
if let MetricAttributes::OTel { kvs } = attributes {

core/Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,9 @@ itertools = "0.13"
4343
lru = "0.12"
4444
mockall = "0.13"
4545
opentelemetry = { workspace = true, features = ["metrics"], optional = true }
46-
opentelemetry_sdk = { version = "0.24", features = ["rt-tokio", "metrics"], optional = true }
47-
opentelemetry-otlp = { version = "0.17", features = ["tokio", "metrics", "tls"], optional = true }
48-
opentelemetry-prometheus = { version = "0.17", optional = true }
46+
opentelemetry_sdk = { version = "0.26", features = ["rt-tokio", "metrics"], optional = true }
47+
opentelemetry-otlp = { version = "0.26", features = ["tokio", "metrics", "tls"], optional = true }
48+
opentelemetry-prometheus = { git = "https://github.com/open-telemetry/opentelemetry-rust.git", rev = "e911383", optional = true }
4949
parking_lot = { version = "0.12", features = ["send_guard"] }
5050
pid = "4.0"
5151
pin-project = "1.0"
@@ -61,7 +61,7 @@ siphasher = "1.0"
6161
slotmap = "1.0"
6262
sysinfo = { version = "0.32", default-features = false, features = ["system"] }
6363
tar = { version = "0.4", optional = true }
64-
thiserror = "1.0"
64+
thiserror = { workspace = true }
6565
tokio = { version = "1.37", features = ["rt", "rt-multi-thread", "parking_lot", "time", "fs", "process"] }
6666
tokio-util = { version = "0.7", features = ["io", "io-util"] }
6767
tokio-stream = "0.1"

core/src/telemetry/metrics.rs

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ impl Instruments {
292292
unit: "".into(),
293293
}),
294294
wf_e2e_latency: meter.histogram_duration(MetricParameters {
295-
name: WF_E2E_LATENCY_NAME.into(),
295+
name: WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME.into(),
296296
unit: "duration".into(),
297297
description: "Histogram of total workflow execution latencies".into(),
298298
}),
@@ -312,17 +312,17 @@ impl Instruments {
312312
unit: "".into(),
313313
}),
314314
wf_task_sched_to_start_latency: meter.histogram_duration(MetricParameters {
315-
name: WF_TASK_SCHED_TO_START_LATENCY_NAME.into(),
315+
name: WORKFLOW_TASK_SCHED_TO_START_LATENCY_HISTOGRAM_NAME.into(),
316316
unit: "duration".into(),
317317
description: "Histogram of workflow task schedule-to-start latencies".into(),
318318
}),
319319
wf_task_replay_latency: meter.histogram_duration(MetricParameters {
320-
name: WF_TASK_REPLAY_LATENCY_NAME.into(),
320+
name: WORKFLOW_TASK_REPLAY_LATENCY_HISTOGRAM_NAME.into(),
321321
unit: "duration".into(),
322322
description: "Histogram of workflow task replay latencies".into(),
323323
}),
324324
wf_task_execution_latency: meter.histogram_duration(MetricParameters {
325-
name: WF_TASK_EXECUTION_LATENCY_NAME.into(),
325+
name: WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME.into(),
326326
unit: "duration".into(),
327327
description: "Histogram of workflow task execution (not replay) latencies".into(),
328328
}),
@@ -342,12 +342,12 @@ impl Instruments {
342342
unit: "".into(),
343343
}),
344344
act_sched_to_start_latency: meter.histogram_duration(MetricParameters {
345-
name: ACT_SCHED_TO_START_LATENCY_NAME.into(),
345+
name: ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME.into(),
346346
unit: "duration".into(),
347347
description: "Histogram of activity schedule-to-start latencies".into(),
348348
}),
349349
act_exec_latency: meter.histogram_duration(MetricParameters {
350-
name: ACT_EXEC_LATENCY_NAME.into(),
350+
name: ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME.into(),
351351
unit: "duration".into(),
352352
description: "Histogram of activity execution latencies".into(),
353353
}),
@@ -496,13 +496,20 @@ pub(crate) fn failure_reason(reason: FailureReason) -> MetricKeyValue {
496496
MetricKeyValue::new(KEY_TASK_FAILURE_TYPE, reason.to_string())
497497
}
498498

499-
pub(super) const WF_E2E_LATENCY_NAME: &str = "workflow_endtoend_latency";
500-
pub(super) const WF_TASK_SCHED_TO_START_LATENCY_NAME: &str =
499+
/// The string name (which may be prefixed) for this metric
500+
pub const WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME: &str = "workflow_endtoend_latency";
501+
/// The string name (which may be prefixed) for this metric
502+
pub const WORKFLOW_TASK_SCHED_TO_START_LATENCY_HISTOGRAM_NAME: &str =
501503
"workflow_task_schedule_to_start_latency";
502-
pub(super) const WF_TASK_REPLAY_LATENCY_NAME: &str = "workflow_task_replay_latency";
503-
pub(super) const WF_TASK_EXECUTION_LATENCY_NAME: &str = "workflow_task_execution_latency";
504-
pub(super) const ACT_SCHED_TO_START_LATENCY_NAME: &str = "activity_schedule_to_start_latency";
505-
pub(super) const ACT_EXEC_LATENCY_NAME: &str = "activity_execution_latency";
504+
/// The string name (which may be prefixed) for this metric
505+
pub const WORKFLOW_TASK_REPLAY_LATENCY_HISTOGRAM_NAME: &str = "workflow_task_replay_latency";
506+
/// The string name (which may be prefixed) for this metric
507+
pub const WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME: &str = "workflow_task_execution_latency";
508+
/// The string name (which may be prefixed) for this metric
509+
pub const ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME: &str =
510+
"activity_schedule_to_start_latency";
511+
/// The string name (which may be prefixed) for this metric
512+
pub const ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME: &str = "activity_execution_latency";
506513
pub(super) const NUM_POLLERS_NAME: &str = "num_pollers";
507514
pub(super) const TASK_SLOTS_AVAILABLE_NAME: &str = "worker_task_slots_available";
508515
pub(super) const TASK_SLOTS_USED_NAME: &str = "worker_task_slots_used";
@@ -533,7 +540,7 @@ macro_rules! define_latency_buckets {
533540

534541
define_latency_buckets!(
535542
(
536-
WF_E2E_LATENCY_NAME,
543+
WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME,
537544
WF_LATENCY_MS_BUCKETS,
538545
WF_LATENCY_S_BUCKETS,
539546
[
@@ -556,19 +563,21 @@ define_latency_buckets!(
556563
]
557564
),
558565
(
559-
WF_TASK_EXECUTION_LATENCY_NAME | WF_TASK_REPLAY_LATENCY_NAME,
566+
WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME
567+
| WORKFLOW_TASK_REPLAY_LATENCY_HISTOGRAM_NAME,
560568
WF_TASK_MS_BUCKETS,
561569
WF_TASK_S_BUCKETS,
562570
[1., 10., 20., 50., 100., 200., 500., 1000.]
563571
),
564572
(
565-
ACT_EXEC_LATENCY_NAME,
573+
ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME,
566574
ACT_EXE_MS_BUCKETS,
567575
ACT_EXE_S_BUCKETS,
568576
[50., 100., 500., 1000., 5000., 10_000., 60_000.]
569577
),
570578
(
571-
WF_TASK_SCHED_TO_START_LATENCY_NAME | ACT_SCHED_TO_START_LATENCY_NAME,
579+
WORKFLOW_TASK_SCHED_TO_START_LATENCY_HISTOGRAM_NAME
580+
| ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME,
572581
TASK_SCHED_TO_START_MS_BUCKETS,
573582
TASK_SCHED_TO_START_S_BUCKETS,
574583
[100., 500., 1000., 5000., 10_000., 100_000., 1_000_000.]

core/src/telemetry/mod.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@ mod otel;
99
mod prometheus_server;
1010

1111
#[cfg(feature = "otel")]
12-
pub use metrics::{default_buckets_for, MetricsCallBuffer};
12+
pub use metrics::{
13+
default_buckets_for, MetricsCallBuffer, ACTIVITY_EXEC_LATENCY_HISTOGRAM_NAME,
14+
ACTIVITY_SCHED_TO_START_LATENCY_HISTOGRAM_NAME, WORKFLOW_E2E_LATENCY_HISTOGRAM_NAME,
15+
WORKFLOW_TASK_EXECUTION_LATENCY_HISTOGRAM_NAME, WORKFLOW_TASK_REPLAY_LATENCY_HISTOGRAM_NAME,
16+
WORKFLOW_TASK_SCHED_TO_START_LATENCY_HISTOGRAM_NAME,
17+
};
1318
#[cfg(feature = "otel")]
1419
pub use otel::{build_otlp_metric_exporter, start_prometheus_metric_exporter};
1520

0 commit comments

Comments
 (0)