@@ -31,69 +31,81 @@ var (
3131 slmRetentionRunsTotal = prometheus .NewDesc (
3232 prometheus .BuildFQName (namespace , "slm_stats" , "retention_runs_total" ),
3333 "Total retention runs" ,
34- nil , nil ,
34+ [] string { "cluster" } , nil ,
3535 )
3636 slmRetentionFailedTotal = prometheus .NewDesc (
3737 prometheus .BuildFQName (namespace , "slm_stats" , "retention_failed_total" ),
3838 "Total failed retention runs" ,
39- nil , nil ,
39+ [] string { "cluster" } , nil ,
4040 )
4141 slmRetentionTimedOutTotal = prometheus .NewDesc (
4242 prometheus .BuildFQName (namespace , "slm_stats" , "retention_timed_out_total" ),
4343 "Total timed out retention runs" ,
44- nil , nil ,
44+ [] string { "cluster" } , nil ,
4545 )
4646 slmRetentionDeletionTimeSeconds = prometheus .NewDesc (
4747 prometheus .BuildFQName (namespace , "slm_stats" , "retention_deletion_time_seconds" ),
4848 "Retention run deletion time" ,
49- nil , nil ,
49+ [] string { "cluster" } , nil ,
5050 )
5151 slmTotalSnapshotsTaken = prometheus .NewDesc (
5252 prometheus .BuildFQName (namespace , "slm_stats" , "total_snapshots_taken_total" ),
5353 "Total snapshots taken" ,
54- nil , nil ,
54+ [] string { "cluster" } , nil ,
5555 )
5656 slmTotalSnapshotsFailed = prometheus .NewDesc (
5757 prometheus .BuildFQName (namespace , "slm_stats" , "total_snapshots_failed_total" ),
5858 "Total snapshots failed" ,
59- nil , nil ,
59+ [] string { "cluster" } , nil ,
6060 )
6161 slmTotalSnapshotsDeleted = prometheus .NewDesc (
6262 prometheus .BuildFQName (namespace , "slm_stats" , "total_snapshots_deleted_total" ),
6363 "Total snapshots deleted" ,
64- nil , nil ,
64+ [] string { "cluster" } , nil ,
6565 )
6666 slmTotalSnapshotsDeleteFailed = prometheus .NewDesc (
6767 prometheus .BuildFQName (namespace , "slm_stats" , "total_snapshot_deletion_failures_total" ),
6868 "Total snapshot deletion failures" ,
69- nil , nil ,
69+ [] string { "cluster" } , nil ,
7070 )
7171
7272 slmOperationMode = prometheus .NewDesc (
7373 prometheus .BuildFQName (namespace , "slm_stats" , "operation_mode" ),
7474 "Operating status of SLM" ,
75- []string {"operation_mode" }, nil ,
75+ []string {"cluster" , " operation_mode" }, nil ,
7676 )
7777
7878 slmSnapshotsTaken = prometheus .NewDesc (
7979 prometheus .BuildFQName (namespace , "slm_stats" , "snapshots_taken_total" ),
8080 "Total snapshots taken" ,
81- []string {"policy" }, nil ,
81+ []string {
82+ "policy" ,
83+ "cluster" ,
84+ }, nil ,
8285 )
8386 slmSnapshotsFailed = prometheus .NewDesc (
8487 prometheus .BuildFQName (namespace , "slm_stats" , "snapshots_failed_total" ),
8588 "Total snapshots failed" ,
86- []string {"policy" }, nil ,
89+ []string {
90+ "policy" ,
91+ "cluster" ,
92+ }, nil ,
8793 )
8894 slmSnapshotsDeleted = prometheus .NewDesc (
8995 prometheus .BuildFQName (namespace , "slm_stats" , "snapshots_deleted_total" ),
9096 "Total snapshots deleted" ,
91- []string {"policy" }, nil ,
97+ []string {
98+ "policy" ,
99+ "cluster" ,
100+ }, nil ,
92101 )
93102 slmSnapshotsDeletionFailure = prometheus .NewDesc (
94103 prometheus .BuildFQName (namespace , "slm_stats" , "snapshot_deletion_failures_total" ),
95104 "Total snapshot deletion failures" ,
96- []string {"policy" }, nil ,
105+ []string {
106+ "policy" ,
107+ "cluster" ,
108+ }, nil ,
97109 )
98110)
99111
@@ -103,18 +115,67 @@ func init() {
103115
104116// SLM information struct
105117type SLM struct {
106- logger * slog.Logger
107- hc * http.Client
108- u * url.URL
118+ logger * slog.Logger
119+ hc * http.Client
120+ u * url.URL
121+ clusterInfoCh chan * clusterinfo.Response
122+ lastClusterInfo * clusterinfo.Response
109123}
110124
111125// NewSLM defines SLM Prometheus metrics
112126func NewSLM (logger * slog.Logger , u * url.URL , hc * http.Client , ci * clusterinfo.Retriever ) (Collector , error ) {
113- return & SLM {
114- logger : logger ,
115- hc : hc ,
116- u : u ,
117- }, nil
127+ slm := & SLM {
128+ logger : logger ,
129+ hc : hc ,
130+ u : u ,
131+ clusterInfoCh : make (chan * clusterinfo.Response ),
132+ lastClusterInfo : & clusterinfo.Response {
133+ ClusterName : "unknown_cluster" ,
134+ },
135+ }
136+
137+ err := ci .RegisterConsumer (slm )
138+ if err != nil {
139+ return slm , err
140+ }
141+
142+ // start go routine to fetch clusterinfo updates and save them to lastClusterinfo
143+ go func () {
144+ logger .Debug ("starting cluster info receive loop" )
145+ for ci := range slm .clusterInfoCh {
146+ if ci != nil {
147+ logger .Debug ("received cluster info update" , "cluster" , ci .ClusterName )
148+ slm .lastClusterInfo = ci
149+ }
150+ }
151+ logger .Debug ("exiting cluster info receive loop" )
152+ }()
153+
154+ return slm , nil
155+ }
156+
157+ func (s * SLM ) Describe (ch chan <- * prometheus.Desc ) {
158+ ch <- slmRetentionRunsTotal
159+ ch <- slmRetentionFailedTotal
160+ ch <- slmRetentionTimedOutTotal
161+ ch <- slmRetentionDeletionTimeSeconds
162+ ch <- slmTotalSnapshotsTaken
163+ ch <- slmTotalSnapshotsFailed
164+ ch <- slmTotalSnapshotsDeleted
165+ ch <- slmTotalSnapshotsDeleteFailed
166+ ch <- slmOperationMode
167+ ch <- slmSnapshotsTaken
168+ ch <- slmSnapshotsFailed
169+ ch <- slmSnapshotsDeleted
170+ ch <- slmSnapshotsDeletionFailure
171+ }
172+
173+ func (s * SLM ) ClusterLabelUpdates () * chan * clusterinfo.Response {
174+ return & s .clusterInfoCh
175+ }
176+
177+ func (s * SLM ) String () string {
178+ return namespace + "slm"
118179}
119180
120181// SLMStatsResponse is a representation of the SLM stats
@@ -181,6 +242,7 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
181242 slmOperationMode ,
182243 prometheus .GaugeValue ,
183244 value ,
245+ s .lastClusterInfo .ClusterName ,
184246 status ,
185247 )
186248 }
@@ -189,43 +251,51 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
189251 slmRetentionRunsTotal ,
190252 prometheus .CounterValue ,
191253 float64 (slmStatsResp .RetentionRuns ),
254+ s .lastClusterInfo .ClusterName ,
192255 )
193256
194257 ch <- prometheus .MustNewConstMetric (
195258 slmRetentionFailedTotal ,
196259 prometheus .CounterValue ,
197260 float64 (slmStatsResp .RetentionFailed ),
261+ s .lastClusterInfo .ClusterName ,
198262 )
199263
200264 ch <- prometheus .MustNewConstMetric (
201265 slmRetentionTimedOutTotal ,
202266 prometheus .CounterValue ,
203267 float64 (slmStatsResp .RetentionTimedOut ),
268+ s .lastClusterInfo .ClusterName ,
204269 )
205270 ch <- prometheus .MustNewConstMetric (
206271 slmRetentionDeletionTimeSeconds ,
207272 prometheus .GaugeValue ,
208273 float64 (slmStatsResp .RetentionDeletionTimeMillis )/ 1000 ,
274+ s .lastClusterInfo .ClusterName ,
209275 )
210276 ch <- prometheus .MustNewConstMetric (
211277 slmTotalSnapshotsTaken ,
212278 prometheus .CounterValue ,
213279 float64 (slmStatsResp .TotalSnapshotsTaken ),
280+ s .lastClusterInfo .ClusterName ,
214281 )
215282 ch <- prometheus .MustNewConstMetric (
216283 slmTotalSnapshotsFailed ,
217284 prometheus .CounterValue ,
218285 float64 (slmStatsResp .TotalSnapshotsFailed ),
286+ s .lastClusterInfo .ClusterName ,
219287 )
220288 ch <- prometheus .MustNewConstMetric (
221289 slmTotalSnapshotsDeleted ,
222290 prometheus .CounterValue ,
223291 float64 (slmStatsResp .TotalSnapshotsDeleted ),
292+ s .lastClusterInfo .ClusterName ,
224293 )
225294 ch <- prometheus .MustNewConstMetric (
226295 slmTotalSnapshotsDeleteFailed ,
227296 prometheus .CounterValue ,
228297 float64 (slmStatsResp .TotalSnapshotDeletionFailures ),
298+ s .lastClusterInfo .ClusterName ,
229299 )
230300
231301 for _ , policy := range slmStatsResp .PolicyStats {
@@ -234,24 +304,28 @@ func (s *SLM) Update(ctx context.Context, ch chan<- prometheus.Metric) error {
234304 prometheus .CounterValue ,
235305 float64 (policy .SnapshotsTaken ),
236306 policy .Policy ,
307+ s .lastClusterInfo .ClusterName ,
237308 )
238309 ch <- prometheus .MustNewConstMetric (
239310 slmSnapshotsFailed ,
240311 prometheus .CounterValue ,
241312 float64 (policy .SnapshotsFailed ),
242313 policy .Policy ,
314+ s .lastClusterInfo .ClusterName ,
243315 )
244316 ch <- prometheus .MustNewConstMetric (
245317 slmSnapshotsDeleted ,
246318 prometheus .CounterValue ,
247319 float64 (policy .SnapshotsDeleted ),
248320 policy .Policy ,
321+ s .lastClusterInfo .ClusterName ,
249322 )
250323 ch <- prometheus .MustNewConstMetric (
251324 slmSnapshotsDeletionFailure ,
252325 prometheus .CounterValue ,
253326 float64 (policy .SnapshotDeletionFailures ),
254327 policy .Policy ,
328+ s .lastClusterInfo .ClusterName ,
255329 )
256330 }
257331
0 commit comments