@@ -843,6 +843,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
+            # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
+            res = "exaone4"

         if res is None:
             logger.warning("\n")
@@ -2861,7 +2864,8 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-        head_dim = self.hparams["head_dim"]
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = self.hparams["hidden_size"] // num_heads

         if "ernie." in name:
             name = name.replace("ernie.", "model.")
@@ -2894,6 +2898,93 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(self.map_tensor_name(name), data_torch)]


+@ModelBase.register("Ernie4_5_MoeForCausalLM")
+class Ernie4_5MoeModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
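+        # MoE-specific metadata on top of the dense Ernie 4.5 parameters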
+        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (shared_expert_count := self.hparams.get('moe_num_shared_experts')) is not None:
+            self.gguf_writer.add_expert_shared_count(shared_expert_count)
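+            # the shared-expert FFN width is not exposed directly in the hparams,
+            # so derive it from intermediate_size and num_key_value_heads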
+            if shared_expert_count > 0 and (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+                self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Modify correction bias name as in DeepseekV2
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+        match = re.match(r"model.mtp_block.(\d+)", name)
+        if match:
+            return []
+
+        # skip all other MTP tensors for now
+        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+        if match:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["moe_num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
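+            # each expert contributes gate/up/down projections; only merge once
+            # all 3 * n_experts tensors for this block have been collected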
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
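+                    # stack the per-expert 2D matrices into a single [n_expert, ...] tensor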
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",
@@ -6692,6 +6783,75 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


+@ModelBase.register("Exaone4ForCausalLM")
+class Exaone4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.EXAONE4
+
+    def set_vocab(self):
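+        # the tokenizer is exported as GPT-2-style BPE via the common base-vocab path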
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+
+        if hparams.get("sliding_window") is not None:
+            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
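+            # the per-layer attention pattern may be listed explicitly in "layer_types",
+            # or encoded compactly in "sliding_window_pattern" (a string or an integer period)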
+            if "layer_types" in hparams:
+                self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]])
+            elif "sliding_window_pattern" in hparams:
+                sliding_window_pattern = []
+                if isinstance(hparams["sliding_window_pattern"], str):  # e.g. LLLG
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append(hparams["sliding_window_pattern"][i % len(hparams["sliding_window_pattern"])] == "L")
+                if isinstance(hparams["sliding_window_pattern"], int):  # e.g. 4
+                    for i in range(hparams["num_hidden_layers"]):
+                        sliding_window_pattern.append((i + 1) % hparams["sliding_window_pattern"] != 0)
+                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
+                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
+
+        rope_scaling = self.hparams.get("rope_scaling") or {}
+        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
+
+    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
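+        # write llama3-style rope scaling factors as an extra ROPE_FREQS tensor when configured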
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10_000.0)
+                if (dim := self.hparams.get("head_dim")) is None:
+                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+
+                factor = rope_scaling.get("factor", 16.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+
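+                # piecewise llama3 scaling: leave high-frequency bands unscaled, stretch
+                # low-frequency bands by `factor`, and interpolate smoothly in between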
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))


 @ModelBase.register("GraniteForCausalLM")
 class GraniteModel(LlamaModel):
     """Conversion for IBM's GraniteForCausalLM"""