3333from smdebug .core .locations import TraceFileLocation
3434from smdebug .core .logger import get_logger
3535from smdebug .core .utils import ensure_dir , get_node_id
36- from smdebug .profiler .profiler_constants import CONVERT_TO_MICROSECS
36+ from smdebug .profiler .profiler_constants import CONVERT_TO_MICROSECS , PYTHONTIMELINE_SUFFIX
3737
3838logger = get_logger ()
3939
@@ -88,7 +88,18 @@ def to_json(self):
8888 "ph" : self .phase ,
8989 "ts" : self .rel_ts_micros ,
9090 }
91- if self .phase == "X" :
91+
92+ # handle Instant event
93+ if self .phase == "i" :
94+ if self .args :
95+ # Instant events have a field unique to them called scope.
96+ # scope can be "g" - global, "p" - process, "t" - thread.
97+ # parsing this value that is being passed as args.
98+ s = self .args ["s" ] if "s" in self .args else "t"
99+ json_dict .update ({"s" : s })
100+ if "s" in self .args :
101+ self .args .pop ("s" )
102+ elif self .phase == "X" :
92103 json_dict .update ({"dur" : self .duration })
93104
94105 if self .args :
@@ -105,7 +116,7 @@ class TimelineFileWriter:
105116 and asynchronously writes TimelineRecord to the file.
106117 """
107118
108- def __init__ (self , profiler_config_parser , max_queue = 100 ):
119+ def __init__ (self , profiler_config_parser , max_queue = 100 , suffix = PYTHONTIMELINE_SUFFIX ):
109120 """Creates a `TimelineFileWriter` and a trace event file to write to.
110121 This event file will contain TimelineRecord as JSON strings, which are written to
111122 disk via the write_record method.
@@ -120,14 +131,34 @@ def __init__(self, profiler_config_parser, max_queue=100):
120131 self ._worker = _TimelineLoggerThread (
121132 queue = self ._event_queue ,
122133 sentinel_event = self ._sentinel_event ,
123- base_start_time = self .start_time_since_epoch_in_micros ,
134+ base_start_time_in_us = self .start_time_since_epoch_in_micros ,
124135 profiler_config_parser = self ._profiler_config_parser ,
136+ suffix = suffix ,
125137 )
126138 self ._worker .start ()
127139
140+ def _update_base_start_time (self , base_start_time_in_us ):
141+ """
142+ Some trace files such as the Horovod trace file may start before this timeline
143+ writer is initialized. In such case, use this function to update the start time
144+ since epoch in micros.
145+ """
146+ if base_start_time_in_us != self .start_time_since_epoch_in_micros :
147+ self .start_time_since_epoch_in_micros = base_start_time_in_us
148+ self ._worker ._update_base_start_time (base_start_time_in_us )
149+
128150 def write_trace_events (
129151 self , timestamp , training_phase = "" , op_name = "" , phase = "X" , duration = 0 , ** kwargs
130152 ):
153+ """
154+ Creates TimelineRecord from the details passed as parameters, and enqueues an event for write.
155+ :param timestamp: start_time for the event (in seconds)
156+ :param training_phase: strings like, data_iteration, forward, backward, operations etc
157+ :param op_name: more details about phase like whether dataset or iterator
158+ :param phase: phase of trace event. default is 'X'
159+ :param duration: any duration manually computed (in seconds)
160+ :param kwargs: other params. can be process id and thread id
161+ """
131162 if not self ._worker ._healthy or not self ._profiler_config_parser .profiling_enabled :
132163 return
133164 duration_in_us = int (duration * CONVERT_TO_MICROSECS ) # convert to micro seconds
@@ -167,7 +198,13 @@ class _TimelineLoggerThread(threading.Thread):
167198 https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/summary/writer/event_file_writer.py#L133"""
168199
169200 def __init__ (
170- self , queue , sentinel_event , base_start_time , profiler_config_parser , verbose = False
201+ self ,
202+ queue ,
203+ sentinel_event ,
204+ base_start_time_in_us ,
205+ profiler_config_parser ,
206+ verbose = False ,
207+ suffix = PYTHONTIMELINE_SUFFIX ,
171208 ):
172209 """Creates a _TimelineLoggerThread."""
173210 threading .Thread .__init__ (self )
@@ -180,14 +217,23 @@ def __init__(
180217 self .tensor_table = collections .defaultdict (int )
181218 self .continuous_fail_count = 0
182219 self .is_first = True
183- self .last_event_end_time_in_us = int (round (base_start_time ))
220+ self ._update_base_start_time (base_start_time_in_us )
221+ self ._healthy = True
222+ self ._profiler_config_parser = profiler_config_parser
223+ self .node_id = get_node_id ()
224+ self .suffix = suffix
225+
def _update_base_start_time(self, base_start_time_in_us):
    """
    Reset this thread's time bookkeeping from a new base start time.

    Some trace files, such as the Horovod trace file, may start before the
    timeline writer is initialized; this lets the base start time be replaced
    afterwards. The rounded value becomes both the last event end time and
    the last file close time, and the current hour is recomputed from it
    using a UTC conversion.

    :param base_start_time_in_us: start time since epoch, in microseconds
    """
    base_us = int(round(base_start_time_in_us))
    self.last_event_end_time_in_us = base_us
    self.last_file_close_time_in_us = base_us

    # utcfromtimestamp takes seconds, so scale the microsecond value down first.
    close_time_in_secs = base_us / CONVERT_TO_MICROSECS
    self.cur_hour = datetime.utcfromtimestamp(close_time_in_secs).hour
188- self ._healthy = True
189- self ._profiler_config_parser = profiler_config_parser
190- self .node_id = get_node_id ()
191237
192238 def run (self ):
193239 while True :
@@ -315,13 +361,20 @@ def write_event(self, record):
315361 json_dict = {"name" : "process_sort_index" , "ph" : "M" , "pid" : 0 , "args" : args }
316362 self ._writer .write (json .dumps (json_dict ) + ",\n " )
317363
318- args = {"name" : record .training_phase }
319- json_dict = {"name" : "process_name" , "ph" : "M" , "pid" : tensor_idx , "args" : args }
320- self ._writer .write (json .dumps (json_dict ) + ",\n " )
364+ # Instant events don't have a training phase
365+ if record .phase != "i" :
366+ args = {"name" : record .training_phase }
367+ json_dict = {"name" : "process_name" , "ph" : "M" , "pid" : tensor_idx , "args" : args }
368+ self ._writer .write (json .dumps (json_dict ) + ",\n " )
321369
322- args = {"sort_index" : tensor_idx }
323- json_dict = {"name" : "process_sort_index" , "ph" : "M" , "pid" : tensor_idx , "args" : args }
324- self ._writer .write (json .dumps (json_dict ) + ",\n " )
370+ args = {"sort_index" : tensor_idx }
371+ json_dict = {
372+ "name" : "process_sort_index" ,
373+ "ph" : "M" ,
374+ "pid" : tensor_idx ,
375+ "args" : args ,
376+ }
377+ self ._writer .write (json .dumps (json_dict ) + ",\n " )
325378
326379 self .is_first = False
327380
@@ -366,6 +419,7 @@ def close(self):
366419 new_file_name = TraceFileLocation ().get_file_location (
367420 base_dir = self ._profiler_config_parser .config .local_path ,
368421 timestamp = self .last_event_end_time_in_us ,
422+ suffix = self .suffix ,
369423 )
370424 ensure_dir (new_file_name )
371425 os .rename (self .name (), new_file_name )
@@ -378,6 +432,8 @@ def name(self):
378432 self ._profiler_config_parser .config .local_path
379433 + "/framework/"
380434 + self .node_id
435+ + "_"
436+ + self .suffix
381437 + SMDEBUG_TEMP_PATH_SUFFIX
382438 )
383439
0 commit comments