@@ -2661,6 +2661,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small";
 
         default: return "unknown, may not work";
     }
@@ -8765,9 +8766,13 @@ struct quantize_state_internal {
     const llama_model_quantize_params * params;
 
     int n_attention_wv    = 0;
-    int n_feed_forward_w2 = 0;
+    int n_ffn_down        = 0;
+    int n_ffn_gate        = 0;
+    int n_ffn_up          = 0;
     int i_attention_wv    = 0;
-    int i_feed_forward_w2 = 0;
+    int i_ffn_down        = 0;
+    int i_ffn_gate        = 0;
+    int i_ffn_up          = 0;
 
     int n_k_quantized     = 0;
     int n_fallback        = 0;
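Note: the renamed counters above follow the file's two-pass pattern: a first pass over the tensor names fills the `n_*` totals, and the per-tensor quantization pass then compares the running `i_*` index against the total to decide which layers get more or fewer bits. A minimal, self-contained sketch of that pattern (the struct and names here are illustrative, not the actual llama.cpp code):

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Illustrative stand-in for the n_*/i_* counter pairs in quantize_state_internal.
struct counters {
    int n_ffn_down = 0; // total ffn_down tensors, filled by the counting pass
    int i_ffn_down = 0; // running index during the quantization pass
};

int main() {
    std::vector<std::string> names;
    for (int i = 0; i < 16; ++i) {
        names.push_back("blk." + std::to_string(i) + ".ffn_down.weight");
    }

    counters qs;
    // Pass 1: count matching tensors.
    for (const auto & name : names) {
        if (name.find("ffn_down") != std::string::npos) ++qs.n_ffn_down;
    }
    // Pass 2: the first 1/8 of the layers can be given a different type,
    // mirroring checks like `qs.i_ffn_down < qs.n_ffn_down/8` in the diff.
    for (const auto & name : names) {
        if (name.find("ffn_down") == std::string::npos) continue;
        const bool bump = qs.i_ffn_down < qs.n_ffn_down/8;
        printf("%-24s %s\n", name.c_str(), bump ? "first eighth" : "default");
        ++qs.i_ffn_down;
    }
    return 0;
}
```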
@@ -8870,8 +8875,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             ++qs.i_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
-            ++qs.i_feed_forward_w2;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+            ++qs.i_ffn_down;
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
@@ -8908,18 +8913,21 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+            new_type = GGML_TYPE_Q2_K;
+        }
     } else if (name.find("ffn_down") != std::string::npos) {
         const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
         int i_layer, n_layer;
         if (n_expert == 1) {
-            i_layer = qs.i_feed_forward_w2;
-            n_layer = qs.n_feed_forward_w2;
+            i_layer = qs.i_ffn_down;
+            n_layer = qs.n_ffn_down;
         } else {
             // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
-            // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
+            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
             // for getting the current layer as I initially thought, and we need to resort to parsing the
             // tensor name.
-            n_layer = qs.n_feed_forward_w2 / n_expert;
+            n_layer = qs.n_ffn_down / n_expert;
             if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
                 throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
             }
@@ -8928,7 +8936,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             }
         }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
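Note: the `sscanf`-based layer recovery above can be exercised in isolation. A small sketch mirroring the parse from the hunk (tensor names invented for the example):

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Recover the layer index from a tensor name of the form "blk.<N>.ffn_down...".
// For MoE models the expert tensors are not laid out consecutively, so the
// running counter cannot simply be divided by n_expert; parsing the name works.
static int layer_from_name(const std::string & name) {
    int i_layer = -1;
    if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        throw std::runtime_error("failed to determine layer for tensor " + name);
    }
    return i_layer;
}

int main() {
    printf("%d\n", layer_from_name("blk.17.ffn_down.3.weight")); // prints 17
    printf("%d\n", layer_from_name("blk.2.ffn_down.weight"));    // prints 2
    return 0;
}
```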
@@ -8958,11 +8966,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
         new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
     }
-    ++qs.i_feed_forward_w2;
+    ++qs.i_ffn_down;
 } else if (name.find("attn_output.weight") != std::string::npos) {
     if (arch != LLM_ARCH_FALCON) {
         if (qs.model.hparams.n_expert == 8) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS ||
+                ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
                 ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
                 new_type = GGML_TYPE_Q5_K;
             }
@@ -8980,6 +8989,20 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
+    else if (name.find("ffn_gate") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_gate, qs.n_ffn_gate)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_gate;
+    }
+    else if (name.find("ffn_up") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(qs.i_ffn_up, qs.n_ffn_up)) {
+            new_type = GGML_TYPE_Q2_K;
+        }
+        ++qs.i_ffn_up;
+    }
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
     // else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
     //     if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
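Note: the new `ffn_gate`/`ffn_up` branches demote a tensor to Q2_K only when `use_more_bits()` is false, so Q3_K_XS keeps Q3_K on the layers that heuristic favors. The helper is defined elsewhere in llama.cpp and is not part of this hunk; to the best of my reading it was roughly the following at the time (verify against the source): more bits for the first and last eighth of the layers, plus every third layer in between.

```cpp
#include <cstdio>

// Approximation of llama.cpp's use_more_bits() heuristic; check the actual
// definition in the source. True for the first eighth and last eighth of the
// layers, and for every third layer in the middle stretch.
static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    // For a 32-layer model, show which ffn_gate/ffn_up tensors Q3_K_XS would
    // keep at Q3_K (use_more_bits true) and which it would demote to Q2_K.
    for (int i = 0; i < 32; ++i) {
        printf("layer %2d: %s\n", i, use_more_bits(i, 32) ? "Q3_K" : "Q2_K");
    }
    return 0;
}
```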
@@ -9034,8 +9057,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_ALL_F32:     quantized_type = GGML_TYPE_F32;  break;
 
         // K-quants
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
-        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XS:
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9103,12 +9127,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             ++qs.n_attention_wv;
         }
         else if (name.find("ffn_down") != std::string::npos) {
-            ++qs.n_feed_forward_w2;
+            ++qs.n_ffn_down;
+        }
+        else if (name.find("ffn_gate") != std::string::npos) {
+            ++qs.n_ffn_gate;
+        }
+        else if (name.find("ffn_up") != std::string::npos) {
+            ++qs.n_ffn_up;
         }
     }
-    if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
-        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
-                __func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
+    if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+        LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
+                __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
     }
 
     size_t total_size_org = 0;
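Note: end to end, the new ftype is selected through the public quantize API like any other. Assuming the `LLAMA_FTYPE_MOSTLY_Q3_K_XS` enum value is added to llama.h in the same change (the header hunk is not shown here), a minimal caller looks like this; the file paths are placeholders:

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q3_K_XS;

    // llama_model_quantize returns 0 on success.
    if (llama_model_quantize("model-f16.gguf", "model-q3_k_xs.gguf", &params) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
```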