1212
1313
1414class  MetricsHistogram :
def __init__(self, metrics_reader):
    """Bind the histogram to a metrics reader and reset all plotting state.

    No events are read and nothing is drawn here; call plot() for that.

    @param metrics_reader object that provides
        get_timestamp_of_latest_available_file() and get_events(start, end),
        the two calls this class makes on it
    """
    self.metrics_reader = metrics_reader

    # get timestamp of latest files
    self.last_timestamp = self.metrics_reader.get_timestamp_of_latest_available_file()

    # names of per-core cpu metrics already counted towards self.cores
    self.seen_system_metric_list = set()
    # regular expressions selecting which metrics get plotted
    self.select_metrics = []
    # metric name -> collected values; initialised here so update_data()
    # does not fail with AttributeError when it runs before plot()
    self.system_metrics = {}
    # number of cpu cores contributing to "cpu_total" (float, as in preprocessing)
    self.cores = 0.0
    # metric name -> data source backing that metric's histogram
    self.sources = {}
    # notebook handle returned by show(); set by create_plot()
    self.target = None
25+ 
def plot(self, starttime=0, endtime=None, select_metrics=None):
    """Read the system metric events of a time window and draw their histograms.

    Every call starts from a clean state: previously collected values,
    data sources and the cpu core bookkeeping are discarded first.

    @param starttime is starttime_since_epoch_in_micros. Default value 0, which means start
    @param endtime is endtime_since_epoch_in_micros. Default value None, which means the
        timestamp of the latest file available to the metrics reader
    @param select_metrics is list of regular expressions matched against the metric names.
        Default value None, which means [".*"], i.e. every metric
    """
    if endtime is None:
        endtime = self.metrics_reader.get_timestamp_of_latest_available_file()
    if select_metrics is None:
        select_metrics = [".*"]

    all_events = self.metrics_reader.get_events(starttime, endtime)
    print(
        f"Found {len(all_events)} system metrics events from timestamp_in_us:{starttime} to timestamp_in_us:{endtime}"
    )
    self.last_timestamp = endtime
    # keep a private copy so later changes to the caller's list do not leak in
    self.select_metrics = list(select_metrics)

    # reset the core bookkeeping and the old sources, otherwise a second
    # plot() would find every cpu metric "already seen" and count zero cores
    self.seen_system_metric_list = set()
    self.sources = {}

    # pass a fresh dict explicitly instead of relying on the callee's default
    self.system_metrics = self.preprocess_system_metrics(
        all_events=all_events, system_metrics={}
    )
    self.create_plot(self.system_metrics)
43+ 
def clear(self):
    """Drop all collected metric values, histogram sources and core bookkeeping."""
    self.system_metrics = {}
    self.sources = {}
    # forget the cpu metrics counted so far, so the core count is rebuilt
    # from scratch when metrics are processed again
    self.seen_system_metric_list = set()
3347
def preprocess_system_metrics(self, all_events=None, system_metrics=None):
    """Merge events into the per-metric value lists and derive "cpu_total".

    @param all_events is iterable of events; each event is read through its
        name, dimension and value attributes. Default None, which means no events
    @param system_metrics is dict metric name -> list of values collected so far.
        It is updated in place and returned. Default None, which means a new dict
    @return system_metrics, reduced to the metrics matching self.select_metrics
        plus every key containing "total"
    """
    # never use a shared mutable default: it would accumulate across calls
    if all_events is None:
        all_events = []
    if system_metrics is None:
        system_metrics = {}

    # read all available system metric events and store them in dict
    cpu_seen = False
    for event in all_events:
        system_metrics.setdefault(event.name, []).append(event.value)
        if event.dimension == "CPUUtilization":
            cpu_seen = True

    # total cpu utilization is not recorded in SM; recompute it whenever cpu
    # events arrived, not only when a new cpu metric name shows up
    # TODO should we do similar for gpu too
    if cpu_seen:
        # "cpu_total" is derived data and must not be summed into itself
        cpu_metrics = [
            name for name in system_metrics if "cpu" in name and name != "cpu_total"
        ]
        if cpu_metrics:
            self.seen_system_metric_list.update(cpu_metrics)
            self.cores = float(len(cpu_metrics))
            # the per-core lists can differ in length (e.g. a core that was
            # filtered out earlier restarts empty); average the common prefix
            # NOTE(review): assumes samples are aligned by index - confirm
            length = min(len(system_metrics[name]) for name in cpu_metrics)
            cpu_total = np.zeros(length)
            for name in cpu_metrics:
                cpu_total += np.asarray(system_metrics[name][:length], dtype=float)
            system_metrics["cpu_total"] = cpu_total / self.cores

    # add user defined metrics to the list
    available_metrics = list(system_metrics)
    print(f"select metrics:{self.select_metrics}")

    filtered_metrics = []
    for metric in self.select_metrics:
        r = re.compile(r".*" + metric + r".*")
        filtered_metrics.extend(filter(r.search, available_metrics))
    print(f"filtered_metrics:{filtered_metrics}")

    # delete the keys which needs to be filtered out
    selected = set(filtered_metrics)
    for key in available_metrics:
        if key not in selected and "total" not in key:
            del system_metrics[key]
    return system_metrics
89+ 
def _get_probs_binedges(self, values):
    """Histogram values into 2-wide bins covering 0 to 100 inclusive.

    @param values is sequence of numbers to count
    @return tuple (probs, binedges): the count per bin and the bin edges,
        as returned by np.histogram
    """
    # create histogram bins; the stop value is 102 so that the last edge is
    # 100 - with a stop of 100 the edges end at 98 and every value above 98
    # (including 100) falls outside the range and is silently dropped
    bins = np.arange(0, 102, 2)
    probs, binedges = np.histogram(values, bins=bins)
    return probs, binedges
96+ 
97+     def  create_plot (self , system_metrics = {}):
98+         metrics  =  list (system_metrics .keys ())
6799        figures  =  []
68-         self .sources  =  {}
69100
70101        # create a histogram per metric 
71-         for  index , metric  in  enumerate (self . metrics ):
102+         for  index , metric  in  enumerate (metrics ):
72103            p  =  figure (plot_height = 250 , plot_width = 250 )
73-             values  =  self .system_metrics [metric ]
74- 
75-             # create histogram bins 
76-             bins  =  np .arange (0 , 100 , 2 )
77-             probs , binedges  =  np .histogram (values , bins = bins )
78-             bincenters  =  0.5  *  (binedges [1 :] +  binedges [:- 1 ])
79- 
104+             probs , binedges  =  self ._get_probs_binedges (system_metrics [metric ])
80105            # set data 
81106            source  =  ColumnDataSource (data = dict (top = probs , left = binedges [:- 1 ], right = binedges [1 :]))
82107            self .sources [metric ] =  source 
@@ -93,47 +118,29 @@ def create_plot(self):
93118
94119            # set plot 
95120            p .y_range .start  =  0 
96-             p .xaxis .axis_label  =  metric 
121+             p .xaxis .axis_label  =  metric  +  " util" 
122+             p .yaxis .axis_label  =  "Occurences" 
97123            p .grid .grid_line_color  =  "white" 
98124            figures .append (p )
99125
100126        p  =  gridplot (figures , ncols = 4 )
101127        self .target  =  show (p , notebook_handle = True )
102128
def update_data(self, current_timestamp):
    """Fetch the events since the last update and refresh the plotted histograms.

    @param current_timestamp is the end of the window to read; it becomes
        the start of the window for the next call
    """
    # get all events from last to current timestamp
    events = self.metrics_reader.get_events(self.last_timestamp, current_timestamp)
    self.last_timestamp = current_timestamp

    # fall back to an empty dict when nothing was collected yet, and always
    # pass the dict explicitly rather than relying on the callee's default
    collected = getattr(self, "system_metrics", None)
    if collected is None:
        collected = {}
    self.system_metrics = self.preprocess_system_metrics(events, collected)

    # update histograms
    for metric, values in self.system_metrics.items():
        source = self.sources.get(metric)
        if source is None:
            # this metric was not present when the plot was created, so
            # there is no figure to refresh for it
            continue

        # create new histogram bins
        probs, binedges = self._get_probs_binedges(values)
        # update data
        source.data["top"] = probs
        source.data["left"] = binedges[:-1]
        source.data["right"] = binedges[1:]
    push_notebook()
0 commit comments