33 | 33 | logger = logging.get_logger(__name__) |
34 | 34 |
35 | 35 |
36 | | -# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md |
37 | | -GGML_TYPES = { |
38 | | - "F32": 0, |
39 | | - "F16": 1, |
40 | | - "Q4_0": 2, |
41 | | - "Q8_0": 8, |
42 | | - "Q2_K": 10, |
43 | | - "Q3_K": 11, |
44 | | - "Q4_K": 12, |
45 | | - "Q5_K": 13, |
46 | | - "Q6_K": 14, |
47 | | -} |
48 | | - |
49 | | -# The Blocksizes are reported in bytes |
50 | | -# Check out: https://github.com/ggerganov/llama.cpp/blob/8a56075b07a8b571bf95a912ffdce4c928c2b414/gguf-py/gguf/constants.py#L801 |
51 | | -GGML_BLOCK_SIZES = { |
52 | | - "Q8_0": 2 + 32, # Q8_0 uses a blocksize of 32 (int8 tensors) + 2 bytes allocated for the scales |
53 | | - "Q4_K": 144, |
54 | | - # Q4_0 uses a blocksize of 32 but the 4-bit tensors are packed into 8-bit tensors + 2 bytes for the scales |
55 | | - "Q4_0": 2 + 16, |
56 | | - "Q6_K": 210, |
57 | | - # See: https://github.com/99991/pygguf/commit/a417edbfc029a1bc270f984a694f9128c5afa8b9 |
58 | | - "Q2_K": 256 // 16 + 256 // 4 + 2 + 2, |
59 | | - "Q3_K": 256 // 8 + 256 // 4 + 12 + 2, |
60 | | - "Q5_K": 2 + 2 + 12 + 256 // 8 + 256 // 2, |
61 | | -} |
62 | | - |
63 | | -# Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md |
64 | | -DATA_TYPES = { |
65 | | - "uint32": 4, |
66 | | - "int32": 5, |
67 | | - "float32": 6, |
68 | | - "bool": 7, |
69 | | - "string": 8, |
70 | | - "array": 9, |
71 | | - "uint64": 10, |
72 | | -} |
73 | | - |
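A note on the constants removed above: each quantized GGML type packs a fixed number of weights per block (32 for Q4_0/Q8_0, 256 for the K-quant "super-blocks") into the byte count listed in `GGML_BLOCK_SIZES`, which is how the dequantizers below turn a raw byte count into an element count. A minimal sketch of that arithmetic, assuming the dictionaries above are in scope; `WEIGHTS_PER_BLOCK` and `num_elements` are illustrative names, not part of the original file:

```python
# Illustrative only: assumed weights-per-block for each quantized type
# (32-weight blocks for Q4_0/Q8_0, 256-weight super-blocks for the K-quants).
WEIGHTS_PER_BLOCK = {"Q4_0": 32, "Q8_0": 32, "Q2_K": 256, "Q3_K": 256, "Q4_K": 256, "Q5_K": 256, "Q6_K": 256}

def num_elements(ggml_name: str, n_bytes: int) -> int:
    block_bytes = GGML_BLOCK_SIZES[ggml_name]
    assert n_bytes % block_bytes == 0, "tensor data should be a whole number of blocks"
    return (n_bytes // block_bytes) * WEIGHTS_PER_BLOCK[ggml_name]

# A 4096 x 4096 Q8_0 tensor stores one fp16 scale plus 32 int8 weights per 34-byte block:
assert num_elements("Q8_0", 4096 * 4096 // 32 * 34) == 4096 * 4096
```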
74 | 36 | GGUF_TENSOR_MAPPING = { |
75 | 37 | "llama": { |
76 | 38 | "token_embd": "model.embed_tokens", |
@@ -217,303 +179,6 @@ def _gguf_parse_value(_value, data_type): |
217 | 179 | return _value |
218 | 180 |
219 | 181 |
220 | | -def dequantize_q4_k(data, n_bytes: int): |
221 | | - # C implementation |
222 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1929 |
223 | | - # C struct definition |
224 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L116 |
225 | | - block_size = GGML_BLOCK_SIZES["Q4_K"] |
226 | | - num_blocks = n_bytes // block_size |
227 | | - |
228 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
229 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
230 | | - |
231 | | - # Casting to float32 because float16 is very slow on CPU |
232 | | - scale_factors = data_f16[:, 0].reshape(num_blocks, 1, 1).astype(np.float32) |
233 | | - scale_offsets = data_f16[:, 1].reshape(num_blocks, 1, 1).astype(np.float32) |
234 | | - qs1 = data_u8[:, 4:16].reshape(num_blocks, 12, 1) |
235 | | - qs2 = data_u8[:, 16:].reshape(num_blocks, 4, 32) |
236 | | - |
237 | | - # Dequantize scales and offsets (6 bits and 4 + 2 bits) |
238 | | - factors = scale_factors * np.concatenate( |
239 | | - [qs1[:, 0:4] & 0b111111, (qs1[:, 8:] & 15) | ((qs1[:, 0:4] >> 6) << 4)], axis=1 |
240 | | - ) |
241 | | - offsets = scale_offsets * np.concatenate( |
242 | | - [qs1[:, 4:8] & 0b111111, (qs1[:, 8:] >> 4) | ((qs1[:, 4:8] >> 6) << 4)], axis=1 |
243 | | - ) |
244 | | - |
245 | | - # Interleave low and high quantized bits |
246 | | - qs2 = np.stack([qs2 & 0xF, qs2 >> 4], axis=2).reshape(num_blocks, 8, 32) |
247 | | - # Dequantize final weights using scales and offsets |
248 | | - return factors * qs2 - offsets |
249 | | - |
250 | | - |
251 | | -def dequantize_q4_0(data, n_bytes: int): |
252 | | - # C implementation |
253 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1086 |
254 | | - # C struct definition |
255 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L11 |
256 | | - block_size = GGML_BLOCK_SIZES["Q4_0"] |
257 | | - num_blocks = n_bytes // block_size |
258 | | - |
259 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
260 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
261 | | - |
262 | | - # The scales are stored on the first 2 bytes and the rest corresponds to the quants |
263 | | - scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) |
264 | | - # scales = np.nan_to_num(scales) |
265 | | - # the rest of the bytes corresponds to the quants - we discard the first two bytes |
266 | | - quants = data_u8[:, 2:] |
267 | | - |
268 | | - ql = (quants[:, :] & 0xF).astype(np.int8) - 8 |
269 | | - qr = (quants[:, :] >> 4).astype(np.int8) - 8 |
270 | | - |
271 | | - # Use hstack |
272 | | - quants = np.hstack([ql, qr]) |
273 | | - |
274 | | - return (scales * quants).astype(np.float32) |
275 | | - |
276 | | - |
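To make the layout that the removed `dequantize_q4_0` assumes concrete (one fp16 scale, then 16 bytes whose low nibbles hold weights 0-15 and high nibbles hold weights 16-31, each stored with a +8 offset), here is a hypothetical single-block round trip; the packing code is written from that description, not copied from ggml:

```python
import numpy as np

# Pack one block of 32 values into the assumed Q4_0 layout and check that the
# dequantizer above recovers them up to quantization error.
values = np.linspace(-1.0, 1.0, 32, dtype=np.float32)
d = np.abs(values).max() / 7.0                      # scale so the quants land in [-8, 7]
q = np.clip(np.round(values / d) + 8, 0, 15).astype(np.uint8)
packed = np.float16(d).tobytes() + (q[:16] | (q[16:] << 4)).tobytes()   # 2 + 16 = 18 bytes

recovered = dequantize_q4_0(packed, len(packed))    # shape (1, 32)
assert np.allclose(recovered.ravel(), values, atol=float(d))
```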
277 | | -def dequantize_q6_k(data, n_bytes: int): |
278 | | - # C implementation |
279 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275 |
280 | | - # C struct definition |
281 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L152 |
282 | | - block_size = GGML_BLOCK_SIZES["Q6_K"] |
283 | | - num_blocks = n_bytes // block_size |
284 | | - |
285 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2) |
286 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size) |
287 | | - data_i8 = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, block_size) |
288 | | - |
289 | | - scales = data_f16[:, -1].reshape(num_blocks, 1).astype(np.float32) |
290 | | - |
291 | | - # TODO use uint8 and cast later? |
292 | | - ql = data_u8[:, :128].astype(np.int16) |
293 | | - qh = data_u8[:, 128:192].astype(np.int16) |
294 | | - sc = data_i8[:, 192:208, np.newaxis].astype(np.float32) |
295 | | - |
296 | | - # Unpack bits, subtraction requires signed data type |
297 | | - q1 = (ql[:, :32] & 0xF) | (((qh[:, :32] >> 0) & 3) << 4) - 32 |
298 | | - q2 = (ql[:, 32:64] & 0xF) | (((qh[:, :32] >> 2) & 3) << 4) - 32 |
299 | | - q3 = (ql[:, :32] >> 4) | (((qh[:, :32] >> 4) & 3) << 4) - 32 |
300 | | - q4 = (ql[:, 32:64] >> 4) | (((qh[:, :32] >> 6) & 3) << 4) - 32 |
301 | | - q5 = (ql[:, 64:96] & 0xF) | (((qh[:, 32:] >> 0) & 3) << 4) - 32 |
302 | | - q6 = (ql[:, 96:128] & 0xF) | (((qh[:, 32:] >> 2) & 3) << 4) - 32 |
303 | | - q7 = (ql[:, 64:96] >> 4) | (((qh[:, 32:] >> 4) & 3) << 4) - 32 |
304 | | - q8 = (ql[:, 96:128] >> 4) | (((qh[:, 32:] >> 6) & 3) << 4) - 32 |
305 | | - |
306 | | - # Dequantize |
307 | | - return scales * np.concatenate( |
308 | | - [ |
309 | | - sc[:, 0] * q1[:, :16], |
310 | | - sc[:, 1] * q1[:, 16:], |
311 | | - sc[:, 2] * q2[:, :16], |
312 | | - sc[:, 3] * q2[:, 16:], |
313 | | - sc[:, 4] * q3[:, :16], |
314 | | - sc[:, 5] * q3[:, 16:], |
315 | | - sc[:, 6] * q4[:, :16], |
316 | | - sc[:, 7] * q4[:, 16:], |
317 | | - sc[:, 8] * q5[:, :16], |
318 | | - sc[:, 9] * q5[:, 16:], |
319 | | - sc[:, 10] * q6[:, :16], |
320 | | - sc[:, 11] * q6[:, 16:], |
321 | | - sc[:, 12] * q7[:, :16], |
322 | | - sc[:, 13] * q7[:, 16:], |
323 | | - sc[:, 14] * q8[:, :16], |
324 | | - sc[:, 15] * q8[:, 16:], |
325 | | - ], |
326 | | - axis=1, |
327 | | - ) |
328 | | - |
329 | | - |
330 | | -def dequantize_q8_0(data, n_bytes: int): |
331 | | - # C struct definition |
332 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L43 |
333 | | - block_size = GGML_BLOCK_SIZES["Q8_0"] |
334 | | - num_blocks = n_bytes // block_size |
335 | | - |
336 | | - scales = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, 1 + 16)[:, :1].astype(np.float32) |
337 | | - qs = np.frombuffer(data, dtype=np.int8).reshape(num_blocks, 2 + 32)[:, 2:] |
338 | | - |
339 | | - return scales * qs |
340 | | - |
341 | | - |
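Q8_0 is the simplest of these formats: each 34-byte block is one fp16 scale followed by 32 int8 quants, which is why the removed `dequantize_q8_0` only needs the two `frombuffer` views. A hypothetical one-block round trip, again with the packing written from that description rather than taken from ggml:

```python
import numpy as np

# Absmax-quantize 32 values to int8, prepend the fp16 scale, and dequantize.
values = np.linspace(-1.0, 1.0, 32, dtype=np.float32)
d = np.abs(values).max() / 127.0
packed = np.float16(d).tobytes() + np.round(values / d).astype(np.int8).tobytes()   # 34 bytes

recovered = dequantize_q8_0(packed, len(packed))    # shape (1, 32)
assert np.allclose(recovered.ravel(), values, atol=float(d))
```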
342 | | -def dequantize_q2_k(data, n_bytes: int): |
343 | | - # C implementation |
344 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1547 |
345 | | - # C struct definition |
346 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L74 |
347 | | - num_blocks = n_bytes // GGML_BLOCK_SIZES["Q2_K"] |
348 | | - |
349 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"] // 2) |
350 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q2_K"]) |
351 | | - |
352 | | - dmin = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) |
353 | | - d = data_f16[:, -2].reshape(num_blocks, 1, 1).astype(np.float32) |
354 | | - scales = data_u8[:, :16].reshape(num_blocks, 16, 1) |
355 | | - qs = data_u8[:, 16:80].reshape(num_blocks, 64) |
356 | | - |
357 | | - tmp = np.stack( |
358 | | - [ |
359 | | - qs[:, 00:16] >> 0, |
360 | | - qs[:, 16:32] >> 0, |
361 | | - qs[:, 00:16] >> 2, |
362 | | - qs[:, 16:32] >> 2, |
363 | | - qs[:, 00:16] >> 4, |
364 | | - qs[:, 16:32] >> 4, |
365 | | - qs[:, 00:16] >> 6, |
366 | | - qs[:, 16:32] >> 6, |
367 | | - qs[:, 32:48] >> 0, |
368 | | - qs[:, 48:64] >> 0, |
369 | | - qs[:, 32:48] >> 2, |
370 | | - qs[:, 48:64] >> 2, |
371 | | - qs[:, 32:48] >> 4, |
372 | | - qs[:, 48:64] >> 4, |
373 | | - qs[:, 32:48] >> 6, |
374 | | - qs[:, 48:64] >> 6, |
375 | | - ], |
376 | | - axis=1, |
377 | | - ) |
378 | | - |
379 | | - return d * (scales & 15) * (tmp & 3) - dmin * (scales >> 4) |
380 | | - |
381 | | - |
382 | | -def dequantize_q3_k(data, n_bytes: int): |
383 | | - # C implementation |
384 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1723C32-L1723C42 |
385 | | - # C struct definition |
386 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L95 |
387 | | - num_blocks = n_bytes // GGML_BLOCK_SIZES["Q3_K"] |
388 | | - |
389 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"] // 2) |
390 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q3_K"]) |
391 | | - |
392 | | - d = data_f16[:, -1].reshape(num_blocks, 1, 1).astype(np.float32) |
393 | | - bits = np.unpackbits(data_u8[:, :32].reshape(num_blocks, 32, 1), axis=-1, bitorder="little") |
394 | | - bits = 4 ^ (bits << 2) |
395 | | - qs = data_u8[:, 32 : 32 + 64].astype(np.int16) |
396 | | - a, b, c = data_u8[:, 96 : 96 + 12].reshape(num_blocks, 3, 4).transpose(1, 0, 2) |
397 | | - scales = np.zeros((num_blocks, 4, 4), dtype=np.uint8) |
398 | | - scales[:, 0] = (a & 15) | ((c & 3) << 4) |
399 | | - scales[:, 1] = (b & 15) | (((c >> 2) & 3) << 4) |
400 | | - scales[:, 2] = (a >> 4) | (((c >> 4) & 3) << 4) |
401 | | - scales[:, 3] = (b >> 4) | ((c >> 6) << 4) |
402 | | - scales = scales.reshape(num_blocks, 16, 1).astype(np.int16) |
403 | | - |
404 | | - return ( |
405 | | - d |
406 | | - * (scales - 32) |
407 | | - * np.stack( |
408 | | - [ |
409 | | - (((qs[:, 00:16] >> 0) & 3) - bits[:, :16, 0]), |
410 | | - (((qs[:, 16:32] >> 0) & 3) - bits[:, 16:, 0]), |
411 | | - (((qs[:, 00:16] >> 2) & 3) - bits[:, :16, 1]), |
412 | | - (((qs[:, 16:32] >> 2) & 3) - bits[:, 16:, 1]), |
413 | | - (((qs[:, 00:16] >> 4) & 3) - bits[:, :16, 2]), |
414 | | - (((qs[:, 16:32] >> 4) & 3) - bits[:, 16:, 2]), |
415 | | - (((qs[:, 00:16] >> 6) & 3) - bits[:, :16, 3]), |
416 | | - (((qs[:, 16:32] >> 6) & 3) - bits[:, 16:, 3]), |
417 | | - (((qs[:, 32:48] >> 0) & 3) - bits[:, :16, 4]), |
418 | | - (((qs[:, 48:64] >> 0) & 3) - bits[:, 16:, 4]), |
419 | | - (((qs[:, 32:48] >> 2) & 3) - bits[:, :16, 5]), |
420 | | - (((qs[:, 48:64] >> 2) & 3) - bits[:, 16:, 5]), |
421 | | - (((qs[:, 32:48] >> 4) & 3) - bits[:, :16, 6]), |
422 | | - (((qs[:, 48:64] >> 4) & 3) - bits[:, 16:, 6]), |
423 | | - (((qs[:, 32:48] >> 6) & 3) - bits[:, :16, 7]), |
424 | | - (((qs[:, 48:64] >> 6) & 3) - bits[:, 16:, 7]), |
425 | | - ], |
426 | | - axis=1, |
427 | | - ) |
428 | | - ) |
429 | | - |
430 | | - |
431 | | -def dequantize_q5_k(data, n_bytes: int): |
432 | | - # C implementation |
433 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2129 |
434 | | - # C struct definition |
435 | | - # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L138 |
436 | | - num_blocks = n_bytes // GGML_BLOCK_SIZES["Q5_K"] |
437 | | - |
438 | | - data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"] // 2) |
439 | | - data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, GGML_BLOCK_SIZES["Q5_K"]) |
440 | | - |
441 | | - d = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32) |
442 | | - dmin = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32) |
443 | | - scales = data_u8[:, 4:16].reshape(num_blocks, 12, 1) |
444 | | - qh = data_u8[:, 16 : 16 + 32].reshape(num_blocks, 32, 1) |
445 | | - qs = data_u8[:, 48 : 48 + 128].reshape(num_blocks, 4, 32) |
446 | | - |
447 | | - bits = np.unpackbits(qh, axis=-1, bitorder="little") |
448 | | - |
449 | | - qs_hi_4 = qs >> 4 |
450 | | - qs_lo_4 = qs & 15 |
451 | | - |
452 | | - scales_lo_6 = scales[:, :8] & 63 |
453 | | - scales_hi_6 = scales[:, :8] >> 6 |
454 | | - scales_lo_4 = scales[:, 8:] & 15 |
455 | | - scales_hi_4 = scales[:, 8:] >> 4 |
456 | | - |
457 | | - m1 = dmin * scales_lo_6[:, 4] |
458 | | - m2 = dmin * scales_lo_6[:, 5] |
459 | | - m3 = dmin * scales_lo_6[:, 6] |
460 | | - m4 = dmin * scales_lo_6[:, 7] |
461 | | - m5 = dmin * (scales_hi_4[:, 0] | (scales_hi_6[:, 4] << 4)) |
462 | | - m6 = dmin * (scales_hi_4[:, 1] | (scales_hi_6[:, 5] << 4)) |
463 | | - m7 = dmin * (scales_hi_4[:, 2] | (scales_hi_6[:, 6] << 4)) |
464 | | - m8 = dmin * (scales_hi_4[:, 3] | (scales_hi_6[:, 7] << 4)) |
465 | | - |
466 | | - d1 = d * scales_lo_6[:, 0] |
467 | | - d2 = d * scales_lo_6[:, 1] |
468 | | - d3 = d * scales_lo_6[:, 2] |
469 | | - d4 = d * scales_lo_6[:, 3] |
470 | | - d5 = d * (scales_lo_4[:, 0] | (scales_hi_6[:, 0] << 4)) |
471 | | - d6 = d * (scales_lo_4[:, 1] | (scales_hi_6[:, 1] << 4)) |
472 | | - d7 = d * (scales_lo_4[:, 2] | (scales_hi_6[:, 2] << 4)) |
473 | | - d8 = d * (scales_lo_4[:, 3] | (scales_hi_6[:, 3] << 4)) |
474 | | - |
475 | | - return np.concatenate( |
476 | | - [ |
477 | | - d1 * (qs_lo_4[:, 0] + (bits[:, :, 0] << 4)) - m1, |
478 | | - d2 * (qs_hi_4[:, 0] + (bits[:, :, 1] << 4)) - m2, |
479 | | - d3 * (qs_lo_4[:, 1] + (bits[:, :, 2] << 4)) - m3, |
480 | | - d4 * (qs_hi_4[:, 1] + (bits[:, :, 3] << 4)) - m4, |
481 | | - d5 * (qs_lo_4[:, 2] + (bits[:, :, 4] << 4)) - m5, |
482 | | - d6 * (qs_hi_4[:, 2] + (bits[:, :, 5] << 4)) - m6, |
483 | | - d7 * (qs_lo_4[:, 3] + (bits[:, :, 6] << 4)) - m7, |
484 | | - d8 * (qs_hi_4[:, 3] + (bits[:, :, 7] << 4)) - m8, |
485 | | - ], |
486 | | - axis=1, |
487 | | - ) |
488 | | - |
489 | | - |
490 | | -def load_dequant_gguf_tensor(shape, ggml_type, data, n_bytes): |
491 | | - if ggml_type == GGML_TYPES["F32"]: |
492 | | - values = data |
493 | | - elif ggml_type == GGML_TYPES["F16"]: |
494 | | - values = data |
495 | | - elif ggml_type == GGML_TYPES["Q8_0"]: |
496 | | - values = dequantize_q8_0(data, n_bytes) |
497 | | - elif ggml_type == GGML_TYPES["Q4_0"]: |
498 | | - values = dequantize_q4_0(data, n_bytes) |
499 | | - elif ggml_type == GGML_TYPES["Q4_K"]: |
500 | | - values = dequantize_q4_k(data, n_bytes) |
501 | | - elif ggml_type == GGML_TYPES["Q6_K"]: |
502 | | - values = dequantize_q6_k(data, n_bytes) |
503 | | - elif ggml_type == GGML_TYPES["Q2_K"]: |
504 | | - values = dequantize_q2_k(data, n_bytes) |
505 | | - elif ggml_type == GGML_TYPES["Q3_K"]: |
506 | | - values = dequantize_q3_k(data, n_bytes) |
507 | | - elif ggml_type == GGML_TYPES["Q5_K"]: |
508 | | - values = dequantize_q5_k(data, n_bytes) |
509 | | - else: |
510 | | - raise NotImplementedError( |
511 | | - f"ggml_type {ggml_type} not implemented - please raise an issue on huggingface transformers: https://github.com/huggingface/transformers/issues/new/choose" |
512 | | - ) |
513 | | - |
514 | | - return values.reshape(shape[::-1]) |
515 | | - |
516 | | - |
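For context on the `reshape(shape[::-1])` at the end of the removed loader: GGUF describes tensor dimensions in GGML order (fastest-varying dimension first), so the dequantized values have to be reshaped with the reversed shape to end up in the row-major layout numpy and torch expect. A hypothetical end-to-end sketch using an in-memory Q8_0 tensor; the packing loop is illustrative and not part of the original file:

```python
import numpy as np

rows, cols = 4, 64                                   # cols is a multiple of the 32-weight block
rng = np.random.default_rng(0)
weights = rng.standard_normal((rows, cols), dtype=np.float32)

# Quantize each run of 32 consecutive weights into one Q8_0 block (fp16 scale + 32 int8 quants).
blocks = weights.reshape(-1, 32)
scales = np.abs(blocks).max(axis=1, keepdims=True) / 127.0
quants = np.round(blocks / scales).astype(np.int8)
packed = b"".join(np.float16(s).tobytes() + q.tobytes() for s, q in zip(scales[:, 0], quants))

# GGUF would report this tensor's shape as (cols, rows); the loader reverses it back.
out = load_dequant_gguf_tensor(
    shape=(cols, rows), ggml_type=GGML_TYPES["Q8_0"], data=packed, n_bytes=len(packed)
)
assert out.shape == (rows, cols)
assert np.allclose(out, weights, atol=float(scales.max()))
```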
517 | 182 | class GGUFTokenizerSkeleton: |
518 | 183 | def __init__(self, dict_): |
519 | 184 | for k, v in dict_.items(): |