@@ -146,7 +146,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
146146 self .LayerNorm = tf .keras .layers .LayerNormalization (epsilon = config .layer_norm_eps , name = "LayerNorm" )
147147 self .dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
148148
149- def build (self , input_shape : tf . TensorShape ):
149+ def build (self , input_shape = None ):
150150 with tf .name_scope ("word_embeddings" ):
151151 self .weight = self .add_weight (
152152 name = "weight" ,
@@ -168,7 +168,12 @@ def build(self, input_shape: tf.TensorShape):
168168 initializer = get_initializer (self .initializer_range ),
169169 )
170170
171- super ().build (input_shape )
171+ if self .built :
172+ return
173+ self .built = True
174+ if getattr (self , "LayerNorm" , None ) is not None :
175+ with tf .name_scope (self .LayerNorm .name ):
176+ self .LayerNorm .build ([None , None , self .config .embedding_size ])
172177
173178 # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call
174179 def call (
@@ -246,6 +251,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
246251 # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993
247252 self .attention_dropout = tf .keras .layers .Dropout (rate = config .attention_probs_dropout_prob )
248253 self .output_dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
254+ self .config = config
249255
250256 def transpose_for_scores (self , tensor : tf .Tensor , batch_size : int ) -> tf .Tensor :
251257 # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
@@ -307,6 +313,26 @@ def call(
307313
308314 return outputs
309315
316+ def build (self , input_shape = None ):
317+ if self .built :
318+ return
319+ self .built = True
320+ if getattr (self , "query" , None ) is not None :
321+ with tf .name_scope (self .query .name ):
322+ self .query .build (self .config .hidden_size )
323+ if getattr (self , "key" , None ) is not None :
324+ with tf .name_scope (self .key .name ):
325+ self .key .build (self .config .hidden_size )
326+ if getattr (self , "value" , None ) is not None :
327+ with tf .name_scope (self .value .name ):
328+ self .value .build (self .config .hidden_size )
329+ if getattr (self , "dense" , None ) is not None :
330+ with tf .name_scope (self .dense .name ):
331+ self .dense .build (self .config .hidden_size )
332+ if getattr (self , "LayerNorm" , None ) is not None :
333+ with tf .name_scope (self .LayerNorm .name ):
334+ self .LayerNorm .build ([None , None , self .config .hidden_size ])
335+
310336
311337class TFAlbertLayer (tf .keras .layers .Layer ):
312338 def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -329,6 +355,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
329355 epsilon = config .layer_norm_eps , name = "full_layer_layer_norm"
330356 )
331357 self .dropout = tf .keras .layers .Dropout (rate = config .hidden_dropout_prob )
358+ self .config = config
332359
333360 def call (
334361 self ,
@@ -356,6 +383,23 @@ def call(
356383
357384 return outputs
358385
386+ def build (self , input_shape = None ):
387+ if self .built :
388+ return
389+ self .built = True
390+ if getattr (self , "attention" , None ) is not None :
391+ with tf .name_scope (self .attention .name ):
392+ self .attention .build (None )
393+ if getattr (self , "ffn" , None ) is not None :
394+ with tf .name_scope (self .ffn .name ):
395+ self .ffn .build (self .config .hidden_size )
396+ if getattr (self , "ffn_output" , None ) is not None :
397+ with tf .name_scope (self .ffn_output .name ):
398+ self .ffn_output .build (self .config .intermediate_size )
399+ if getattr (self , "full_layer_layer_norm" , None ) is not None :
400+ with tf .name_scope (self .full_layer_layer_norm .name ):
401+ self .full_layer_layer_norm .build ([None , None , self .config .hidden_size ])
402+
359403
360404class TFAlbertLayerGroup (tf .keras .layers .Layer ):
361405 def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -399,6 +443,15 @@ def call(
399443
400444 return tuple (v for v in [hidden_states , layer_hidden_states , layer_attentions ] if v is not None )
401445
446+ def build (self , input_shape = None ):
447+ if self .built :
448+ return
449+ self .built = True
450+ if getattr (self , "albert_layers" , None ) is not None :
451+ for layer in self .albert_layers :
452+ with tf .name_scope (layer .name ):
453+ layer .build (None )
454+
402455
403456class TFAlbertTransformer (tf .keras .layers .Layer ):
404457 def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -416,6 +469,7 @@ def __init__(self, config: AlbertConfig, **kwargs):
416469 self .albert_layer_groups = [
417470 TFAlbertLayerGroup (config , name = f"albert_layer_groups_._{ i } " ) for i in range (config .num_hidden_groups )
418471 ]
472+ self .config = config
419473
420474 def call (
421475 self ,
@@ -457,6 +511,18 @@ def call(
457511 last_hidden_state = hidden_states , hidden_states = all_hidden_states , attentions = all_attentions
458512 )
459513
514+ def build (self , input_shape = None ):
515+ if self .built :
516+ return
517+ self .built = True
518+ if getattr (self , "embedding_hidden_mapping_in" , None ) is not None :
519+ with tf .name_scope (self .embedding_hidden_mapping_in .name ):
520+ self .embedding_hidden_mapping_in .build (self .config .embedding_size )
521+ if getattr (self , "albert_layer_groups" , None ) is not None :
522+ for layer in self .albert_layer_groups :
523+ with tf .name_scope (layer .name ):
524+ layer .build (None )
525+
460526
461527class TFAlbertPreTrainedModel (TFPreTrainedModel ):
462528 """
@@ -488,13 +554,21 @@ def __init__(self, config: AlbertConfig, input_embeddings: tf.keras.layers.Layer
488554 # an output-only bias for each token.
489555 self .decoder = input_embeddings
490556
491- def build (self , input_shape : tf . TensorShape ):
557+ def build (self , input_shape = None ):
492558 self .bias = self .add_weight (shape = (self .config .vocab_size ,), initializer = "zeros" , trainable = True , name = "bias" )
493559 self .decoder_bias = self .add_weight (
494560 shape = (self .config .vocab_size ,), initializer = "zeros" , trainable = True , name = "decoder/bias"
495561 )
496562
497- super ().build (input_shape )
563+ if self .built :
564+ return
565+ self .built = True
566+ if getattr (self , "dense" , None ) is not None :
567+ with tf .name_scope (self .dense .name ):
568+ self .dense .build (self .config .hidden_size )
569+ if getattr (self , "LayerNorm" , None ) is not None :
570+ with tf .name_scope (self .LayerNorm .name ):
571+ self .LayerNorm .build ([None , None , self .config .embedding_size ])
498572
499573 def get_output_embeddings (self ) -> tf .keras .layers .Layer :
500574 return self .decoder
@@ -650,6 +724,20 @@ def call(
650724 attentions = encoder_outputs .attentions ,
651725 )
652726
727+ def build (self , input_shape = None ):
728+ if self .built :
729+ return
730+ self .built = True
731+ if getattr (self , "embeddings" , None ) is not None :
732+ with tf .name_scope (self .embeddings .name ):
733+ self .embeddings .build (None )
734+ if getattr (self , "encoder" , None ) is not None :
735+ with tf .name_scope (self .encoder .name ):
736+ self .encoder .build (None )
737+ if getattr (self , "pooler" , None ) is not None :
738+ with tf .name_scope (self .pooler .name ):
739+ self .pooler .build (None ) # TODO Matt might be wrong
740+
653741
654742@dataclass
655743class TFAlbertForPreTrainingOutput (ModelOutput ):
@@ -825,6 +913,14 @@ def call(
825913
826914 return outputs
827915
916+ def build (self , input_shape = None ):
917+ if self .built :
918+ return
919+ self .built = True
920+ if getattr (self , "albert" , None ) is not None :
921+ with tf .name_scope (self .albert .name ):
922+ self .albert .build (None )
923+
828924
829925@add_start_docstrings (
830926 """
@@ -921,6 +1017,20 @@ def call(
9211017 attentions = outputs .attentions ,
9221018 )
9231019
1020+ def build (self , input_shape = None ):
1021+ if self .built :
1022+ return
1023+ self .built = True
1024+ if getattr (self , "albert" , None ) is not None :
1025+ with tf .name_scope (self .albert .name ):
1026+ self .albert .build (None )
1027+ if getattr (self , "predictions" , None ) is not None :
1028+ with tf .name_scope (self .predictions .name ):
1029+ self .predictions .build (None )
1030+ if getattr (self , "sop_classifier" , None ) is not None :
1031+ with tf .name_scope (self .sop_classifier .name ):
1032+ self .sop_classifier .build (None )
1033+
9241034
9251035class TFAlbertSOPHead (tf .keras .layers .Layer ):
9261036 def __init__ (self , config : AlbertConfig , ** kwargs ):
@@ -932,13 +1042,22 @@ def __init__(self, config: AlbertConfig, **kwargs):
9321042 kernel_initializer = get_initializer (config .initializer_range ),
9331043 name = "classifier" ,
9341044 )
1045+ self .config = config
9351046
9361047 def call (self , pooled_output : tf .Tensor , training : bool ) -> tf .Tensor :
9371048 dropout_pooled_output = self .dropout (inputs = pooled_output , training = training )
9381049 logits = self .classifier (inputs = dropout_pooled_output )
9391050
9401051 return logits
9411052
1053+ def build (self , input_shape = None ):
1054+ if self .built :
1055+ return
1056+ self .built = True
1057+ if getattr (self , "classifier" , None ) is not None :
1058+ with tf .name_scope (self .classifier .name ):
1059+ self .classifier .build (self .config .hidden_size )
1060+
9421061
9431062@add_start_docstrings ("""Albert Model with a `language modeling` head on top.""" , ALBERT_START_DOCSTRING )
9441063class TFAlbertForMaskedLM (TFAlbertPreTrainedModel , TFMaskedLanguageModelingLoss ):
@@ -1035,6 +1154,17 @@ def call(
10351154 attentions = outputs .attentions ,
10361155 )
10371156
1157+ def build (self , input_shape = None ):
1158+ if self .built :
1159+ return
1160+ self .built = True
1161+ if getattr (self , "albert" , None ) is not None :
1162+ with tf .name_scope (self .albert .name ):
1163+ self .albert .build (None )
1164+ if getattr (self , "predictions" , None ) is not None :
1165+ with tf .name_scope (self .predictions .name ):
1166+ self .predictions .build (None )
1167+
10381168
10391169@add_start_docstrings (
10401170 """
@@ -1058,6 +1188,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
10581188 self .classifier = tf .keras .layers .Dense (
10591189 units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
10601190 )
1191+ self .config = config
10611192
10621193 @unpack_inputs
10631194 @add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1117,6 +1248,17 @@ def call(
11171248 attentions = outputs .attentions ,
11181249 )
11191250
1251+ def build (self , input_shape = None ):
1252+ if self .built :
1253+ return
1254+ self .built = True
1255+ if getattr (self , "albert" , None ) is not None :
1256+ with tf .name_scope (self .albert .name ):
1257+ self .albert .build (None )
1258+ if getattr (self , "classifier" , None ) is not None :
1259+ with tf .name_scope (self .classifier .name ):
1260+ self .classifier .build (self .config .hidden_size )
1261+
11201262
11211263@add_start_docstrings (
11221264 """
@@ -1145,6 +1287,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
11451287 self .classifier = tf .keras .layers .Dense (
11461288 units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
11471289 )
1290+ self .config = config
11481291
11491292 @unpack_inputs
11501293 @add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1200,6 +1343,17 @@ def call(
12001343 attentions = outputs .attentions ,
12011344 )
12021345
1346+ def build (self , input_shape = None ):
1347+ if self .built :
1348+ return
1349+ self .built = True
1350+ if getattr (self , "albert" , None ) is not None :
1351+ with tf .name_scope (self .albert .name ):
1352+ self .albert .build (None )
1353+ if getattr (self , "classifier" , None ) is not None :
1354+ with tf .name_scope (self .classifier .name ):
1355+ self .classifier .build (self .config .hidden_size )
1356+
12031357
12041358@add_start_docstrings (
12051359 """
@@ -1221,6 +1375,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
12211375 self .qa_outputs = tf .keras .layers .Dense (
12221376 units = config .num_labels , kernel_initializer = get_initializer (config .initializer_range ), name = "qa_outputs"
12231377 )
1378+ self .config = config
12241379
12251380 @unpack_inputs
12261381 @add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, sequence_length" ))
@@ -1295,6 +1450,17 @@ def call(
12951450 attentions = outputs .attentions ,
12961451 )
12971452
1453+ def build (self , input_shape = None ):
1454+ if self .built :
1455+ return
1456+ self .built = True
1457+ if getattr (self , "albert" , None ) is not None :
1458+ with tf .name_scope (self .albert .name ):
1459+ self .albert .build (None )
1460+ if getattr (self , "qa_outputs" , None ) is not None :
1461+ with tf .name_scope (self .qa_outputs .name ):
1462+ self .qa_outputs .build (self .config .hidden_size )
1463+
12981464
12991465@add_start_docstrings (
13001466 """
@@ -1316,6 +1482,7 @@ def __init__(self, config: AlbertConfig, *inputs, **kwargs):
13161482 self .classifier = tf .keras .layers .Dense (
13171483 units = 1 , kernel_initializer = get_initializer (config .initializer_range ), name = "classifier"
13181484 )
1485+ self .config = config
13191486
13201487 @unpack_inputs
13211488 @add_start_docstrings_to_model_forward (ALBERT_INPUTS_DOCSTRING .format ("batch_size, num_choices, sequence_length" ))
@@ -1394,3 +1561,14 @@ def call(
13941561 hidden_states = outputs .hidden_states ,
13951562 attentions = outputs .attentions ,
13961563 )
1564+
1565+ def build (self , input_shape = None ):
1566+ if self .built :
1567+ return
1568+ self .built = True
1569+ if getattr (self , "albert" , None ) is not None :
1570+ with tf .name_scope (self .albert .name ):
1571+ self .albert .build (None )
1572+ if getattr (self , "classifier" , None ) is not None :
1573+ with tf .name_scope (self .classifier .name ):
1574+ self .classifier .build (self .config .hidden_size )
0 commit comments