Skip to content

Commit edeca43

Browse files
authored
🚨 Support dequantization for most GGML types (#32625)
* use gguf internal dequantize * add Q5_0 test * add iq1 test * add remaining tests * remove duplicated test * update docs * add gguf version limit * make style * update gguf import catch * revert vocab_size patch * make style * use GGUF_MIN_VERSION everywhere
1 parent 979f477 commit edeca43

File tree

7 files changed

+169
-356
lines changed

7 files changed

+169
-356
lines changed

docs/source/en/gguf.md

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,30 @@ The initial supported quantization types are decided according to the popular qu
4646
on the Hub.
4747

4848
- F32
49+
- F16
50+
- BF16
51+
- Q4_0
52+
- Q4_1
53+
- Q5_0
54+
- Q5_1
55+
- Q8_0
4956
- Q2_K
5057
- Q3_K
51-
- Q4_0
5258
- Q4_K
5359
- Q5_K
5460
- Q6_K
55-
- Q8_0
61+
- IQ1_S
62+
- IQ1_M
63+
- IQ2_XXS
64+
- IQ2_XS
65+
- IQ2_S
66+
- IQ3_XXS
67+
- IQ3_S
68+
- IQ4_XS
69+
- IQ4_NL
5670

57-
We take example from the excellent [99991/pygguf](https://github.com/99991/pygguf) Python parser to dequantize the
58-
weights.
71+
> [!NOTE]
72+
> To support gguf dequantization, `gguf>=0.10.0` installation is required.
5973
6074
### Supported model architectures
6175

src/transformers/integrations/ggml.py

Lines changed: 0 additions & 335 deletions
Original file line numberDiff line numberDiff line change
@@ -33,44 +33,6 @@
3333
logger = logging.get_logger(__name__)
3434

3535

36-
# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
37-
GGML_TYPES = {
38-
"F32": 0,
39-
"F16": 1,
40-
"Q4_0": 2,
41-
"Q8_0": 8,
42-
"Q2_K": 10,
43-
"Q3_K": 11,
44-
"Q4_K": 12,
45-
"Q5_K": 13,
46-
"Q6_K": 14,
47-
}
48-
49-
# The Blocksizes are reported in bytes
50-
# Check out: https://github.com/ggerganov/llama.cpp/blob/8a56075b07a8b571bf95a912ffdce4c928c2b414/gguf-py/gguf/constants.py#L801
51-
GGML_BLOCK_SIZES = {
52-
"Q8_0": 2 + 32, # Q8_0 uses a blocksize of 32 (int8 tensors) + 2 bytes allocated for the scales
53-
"Q4_K": 144,
54-
# Q4_0 uses a blocksize of 32 but the 4-bit tensors are packed into 8-bit tensors + 2 bytes for the scales
55-
"Q4_0": 2 + 16,
56-
"Q6_K": 210,
57-
# See: https://github.com/99991/pygguf/commit/a417edbfc029a1bc270f984a694f9128c5afa8b9
58-
"Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
59-
"Q3_K": 256 // 8 + 256 // 4 + 12 + 2,
60-
"Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2,
61-
}
62-
63-
# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
64-
DATA_TYPES = {
65-
"uint32": 4,
66-
"int32": 5,
67-
"float32": 6,
68-
"bool": 7,
69-
"string": 8,
70-
"array": 9,
71-
"uint64": 10,
72-
}
73-
7436
GGUF_TENSOR_MAPPING = {
7537
"llama": {
7638
"token_embd": "model.embed_tokens",
@@ -217,303 +179,6 @@ def _gguf_parse_value(_value, data_type):
217179
return _value
218180

219181

220-
def dequantize_q4_k(data, n_bytes: int):
221-
# C implementation
222-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929
223-
# C struct definition
224-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116
225-
block_size = GGML_BLOCK_SIZES["Q4_K"]
226-
num_blocks = n_bytes // block_size
227-
228-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
229-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
230-
231-
# Casting to float32 because float16 is very slow on CPU
232-
scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32)
233-
scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32)
234-
qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
235-
qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32)
236-
237-
# Dequantize scales and offsets (6 bits and 4 + 2 bits)
238-
factors = scale_factors * np.concatenate(
239-
[qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1
240-
)
241-
offsets = scale_offsets * np.concatenate(
242-
[qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1
243-
)
244-
245-
# Interleave low and high quantized bits
246-
qs2 = np.stack([qs2 & 0xF, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32)
247-
# Dequantize final weights using scales and offsets
248-
return factors * qs2 - offsets
249-
250-
251-
def dequantize_q4_0(data, n_bytes: int):
252-
# C implementation
253-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086
254-
# C struct definition
255-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11
256-
block_size = GGML_BLOCK_SIZES["Q4_0"]
257-
num_blocks = n_bytes // block_size
258-
259-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
260-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
261-
262-
# The scales are stored on the first 2 bytes and the rest corresponds to the quants
263-
scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
264-
# scales = np.nan_to_num(scales)
265-
# the rest of the bytes corresponds to the quants - we discard the first two bytes
266-
quants = data_u8[:, 2:]
267-
268-
ql = (quants[:, :] & 0xF).astype(np.int8) - 8
269-
qr = (quants[:, :] >> 4).astype(np.int8) - 8
270-
271-
# Use hstack
272-
quants = np.hstack([ql, qr])
273-
274-
return (scales * quants).astype(np.float32)
275-
276-
277-
def dequantize_q6_k(data, n_bytes: int):
278-
# C implementation
279-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275
280-
# C struct definition
281-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152
282-
block_size = GGML_BLOCK_SIZES["Q6_K"]
283-
num_blocks = n_bytes // block_size
284-
285-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
286-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
287-
data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size)
288-
289-
scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32)
290-
291-
# TODO use uint8 and cast later?
292-
ql = data_u8[:, :128].astype(np.int16)
293-
qh = data_u8[:, 128:192].astype(np.int16)
294-
sc = data_i8[:, 192:208, np.newaxis].astype(np.float32)
295-
296-
# Unpack bits, subtraction requires signed data type
297-
q1 = (ql[:, :32] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32
298-
q2 = (ql[:, 32:64] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32
299-
q3 = (ql[:, :32] >> 4) | (((qh[:, :32] >> 4) & 3) << 4) - 32
300-
q4 = (ql[:, 32:64] >> 4) | (((qh[:, :32] >> 6) & 3) << 4) - 32
301-
q5 = (ql[:, 64:96] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32
302-
q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32
303-
q7 = (ql[:, 64:96] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32
304-
q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32
305-
306-
# Dequantize
307-
return scales * np.concatenate(
308-
[
309-
sc[:, 0] * q1[:, :16],
310-
sc[:, 1] * q1[:, 16:],
311-
sc[:, 2] * q2[:, :16],
312-
sc[:, 3] * q2[:, 16:],
313-
sc[:, 4] * q3[:, :16],
314-
sc[:, 5] * q3[:, 16:],
315-
sc[:, 6] * q4[:, :16],
316-
sc[:, 7] * q4[:, 16:],
317-
sc[:, 8] * q5[:, :16],
318-
sc[:, 9] * q5[:, 16:],
319-
sc[:, 10] * q6[:, :16],
320-
sc[:, 11] * q6[:, 16:],
321-
sc[:, 12] * q7[:, :16],
322-
sc[:, 13] * q7[:, 16:],
323-
sc[:, 14] * q8[:, :16],
324-
sc[:, 15] * q8[:, 16:],
325-
],
326-
axis=1,
327-
)
328-
329-
330-
def dequantize_q8_0(data, n_bytes: int):
331-
# C struct definition
332-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43
333-
block_size = GGML_BLOCK_SIZES["Q8_0"]
334-
num_blocks = n_bytes // block_size
335-
336-
scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32)
337-
qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:]
338-
339-
return scales * qs
340-
341-
342-
def dequantize_q2_k(data, n_bytes: int):
343-
# C implementation
344-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547
345-
# C struct definition
346-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74
347-
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"]
348-
349-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2)
350-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"])
351-
352-
dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
353-
d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32)
354-
scales = data_u8[:, :16].reshape(num_blocks, 16, 1)
355-
qs = data_u8[:, 16:80].reshape(num_blocks, 64)
356-
357-
tmp = np.stack(
358-
[
359-
qs[:, 00:16] >> 0,
360-
qs[:, 16:32] >> 0,
361-
qs[:, 00:16] >> 2,
362-
qs[:, 16:32] >> 2,
363-
qs[:, 00:16] >> 4,
364-
qs[:, 16:32] >> 4,
365-
qs[:, 00:16] >> 6,
366-
qs[:, 16:32] >> 6,
367-
qs[:, 32:48] >> 0,
368-
qs[:, 48:64] >> 0,
369-
qs[:, 32:48] >> 2,
370-
qs[:, 48:64] >> 2,
371-
qs[:, 32:48] >> 4,
372-
qs[:, 48:64] >> 4,
373-
qs[:, 32:48] >> 6,
374-
qs[:, 48:64] >> 6,
375-
],
376-
axis=1,
377-
)
378-
379-
return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4)
380-
381-
382-
def dequantize_q3_k(data, n_bytes: int):
383-
# C implementation
384-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42
385-
# C struct definition
386-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95
387-
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"]
388-
389-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2)
390-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"])
391-
392-
d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32)
393-
bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little")
394-
bits = 4 ^ (bits << 2)
395-
qs = data_u8[:, 32 : 32 + 64].astype(np.int16)
396-
a, b, c = data_u8[:, 96 : 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2)
397-
scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8)
398-
scales[:, 0] = (a & 15) | ((c & 3) << 4)
399-
scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4)
400-
scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4)
401-
scales[:, 3] = (b >> 4) | ((c >> 6) << 4)
402-
scales = scales.reshape(num_blocks, 16, 1).astype(np.int16)
403-
404-
return (
405-
d
406-
* (scales - 32)
407-
* np.stack(
408-
[
409-
(((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]),
410-
(((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]),
411-
(((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]),
412-
(((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]),
413-
(((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]),
414-
(((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]),
415-
(((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]),
416-
(((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]),
417-
(((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]),
418-
(((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]),
419-
(((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]),
420-
(((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]),
421-
(((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]),
422-
(((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]),
423-
(((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]),
424-
(((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]),
425-
],
426-
axis=1,
427-
)
428-
)
429-
430-
431-
def dequantize_q5_k(data, n_bytes: int):
432-
# C implementation
433-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129
434-
# C struct definition
435-
# https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138
436-
num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"]
437-
438-
data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2)
439-
data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"])
440-
441-
d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
442-
dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
443-
scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1)
444-
qh = data_u8[:, 16 : 16 + 32].reshape(num_blocks, 32, 1)
445-
qs = data_u8[:, 48 : 48 + 128].reshape(num_blocks, 4, 32)
446-
447-
bits = np.unpackbits(qh, axis=-1, bitorder="little")
448-
449-
qs_hi_4 = qs >> 4
450-
qs_lo_4 = qs & 15
451-
452-
scales_lo_6 = scales[:, :8] & 63
453-
scales_hi_6 = scales[:, :8] >> 6
454-
scales_lo_4 = scales[:, 8:] & 15
455-
scales_hi_4 = scales[:, 8:] >> 4
456-
457-
m1 = dmin * scales_lo_6[:, 4]
458-
m2 = dmin * scales_lo_6[:, 5]
459-
m3 = dmin * scales_lo_6[:, 6]
460-
m4 = dmin * scales_lo_6[:, 7]
461-
m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4))
462-
m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4))
463-
m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4))
464-
m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4))
465-
466-
d1 = d * scales_lo_6[:, 0]
467-
d2 = d * scales_lo_6[:, 1]
468-
d3 = d * scales_lo_6[:, 2]
469-
d4 = d * scales_lo_6[:, 3]
470-
d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4))
471-
d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4))
472-
d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4))
473-
d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4))
474-
475-
return np.concatenate(
476-
[
477-
d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1,
478-
d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2,
479-
d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3,
480-
d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4,
481-
d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5,
482-
d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6,
483-
d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7,
484-
d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8,
485-
],
486-
axis=1,
487-
)
488-
489-
490-
def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes):
491-
if ggml_type == GGML_TYPES["F32"]:
492-
values = data
493-
elif ggml_type == GGML_TYPES["F16"]:
494-
values = data
495-
elif ggml_type == GGML_TYPES["Q8_0"]:
496-
values = dequantize_q8_0(data, n_bytes)
497-
elif ggml_type == GGML_TYPES["Q4_0"]:
498-
values = dequantize_q4_0(data, n_bytes)
499-
elif ggml_type == GGML_TYPES["Q4_K"]:
500-
values = dequantize_q4_k(data, n_bytes)
501-
elif ggml_type == GGML_TYPES["Q6_K"]:
502-
values = dequantize_q6_k(data, n_bytes)
503-
elif ggml_type == GGML_TYPES["Q2_K"]:
504-
values = dequantize_q2_k(data, n_bytes)
505-
elif ggml_type == GGML_TYPES["Q3_K"]:
506-
values = dequantize_q3_k(data, n_bytes)
507-
elif ggml_type == GGML_TYPES["Q5_K"]:
508-
values = dequantize_q5_k(data, n_bytes)
509-
else:
510-
raise NotImplementedError(
511-
f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose"
512-
)
513-
514-
return values.reshape(shape[::-1])
515-
516-
517182
class GGUFTokenizerSkeleton:
518183
def __init__(self, dict_):
519184
for k, v in dict_.items():

0 commit comments

Comments (0)