31 | 31 | @pytest.mark.parametrize("model_id, use_marlin_kernel", MODEL_QUANT) |
32 | 32 | def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, |
33 | 33 | monkeypatch): |
34 | | - # vllm_runner.apply_model() relies on V0 internals. |
35 | | - monkeypatch.setenv("VLLM_USE_V1", "0") |
36 | | - |
37 | | - vllm_model = vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) |
| 34 | + # `LLM.apply_model` requires pickling a function. |
| 35 | + monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") |
38 | 36 |
39 | 37 | linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( |
40 | 38 | GPTQLinearMethod) |
41 | 39 |
42 | | - for name, submodule in (vllm_model.llm.llm_engine.model_executor. |
43 | | - driver_worker.model_runner.model.named_modules()): |
44 | | - if name == "lm_head": |
45 | | - assert isinstance(submodule.quant_method, linear_method_cls) |
46 | | - elif name == 'model.layers.0.self_attn.qkv_proj': |
47 | | - # The first layer is quantized using bits=4, group_size=128 |
48 | | - # desc_act=True |
49 | | - assert isinstance(submodule.quant_method, linear_method_cls) |
50 | | - config = submodule.quant_method.quant_config |
51 | | - assert config.weight_bits == 4 |
52 | | - assert config.group_size == 128 |
53 | | - assert config.desc_act |
54 | | - elif name == 'model.layers.1.self_attn.qkv_proj': |
55 | | - # The second layer is quantized using bits=8, group_size=32 |
56 | | - # desc_act=False |
57 | | - assert isinstance(submodule.quant_method, linear_method_cls) |
58 | | - config = submodule.quant_method.quant_config |
59 | | - assert get_dynamic_override(config, layer_name=name, |
60 | | - key="bits") == 8 |
61 | | - assert get_dynamic_override(config, |
62 | | - layer_name=name, |
63 | | - key="group_size") == 32 |
64 | | - assert not get_dynamic_override( |
65 | | - config, layer_name=name, key="desc_act") |
66 | | - elif (name == 'model.layers.2.self_attn.qkv_proj' |
67 | | - or name == 'model.layers.2.mlp.gate_up_proj'): |
68 | | - # All other layers (layer index >= 2) are not quantized |
69 | | - assert isinstance(submodule.quant_method, UnquantizedLinearMethod) |
| 40 | + with vllm_runner(model_id, dtype=torch.float16, max_model_len=2048) as llm: |
| 41 | + |
| 42 | + def check_model(model): |
| 43 | + for name, submodule in model.named_modules(): |
| 44 | + if name == "lm_head": |
| 45 | + assert isinstance(submodule.quant_method, |
| 46 | + linear_method_cls) |
| 47 | + elif name == 'model.layers.0.self_attn.qkv_proj': |
| 48 | + # The first layer is quantized using bits=4, group_size=128 |
| 49 | + # desc_act=True |
| 50 | + assert isinstance(submodule.quant_method, |
| 51 | + linear_method_cls) |
| 52 | + config = submodule.quant_method.quant_config |
| 53 | + assert config.weight_bits == 4 |
| 54 | + assert config.group_size == 128 |
| 55 | + assert config.desc_act |
| 56 | + elif name == 'model.layers.1.self_attn.qkv_proj': |
| 57 | + # The second layer is quantized using bits=8, group_size=32 |
| 58 | + # desc_act=False |
| 59 | + assert isinstance(submodule.quant_method, |
| 60 | + linear_method_cls) |
| 61 | + config = submodule.quant_method.quant_config |
| 62 | + assert get_dynamic_override(config, |
| 63 | + layer_name=name, |
| 64 | + key="bits") == 8 |
| 65 | + assert get_dynamic_override(config, |
| 66 | + layer_name=name, |
| 67 | + key="group_size") == 32 |
| 68 | + assert not get_dynamic_override( |
| 69 | + config, layer_name=name, key="desc_act") |
| 70 | + elif (name == 'model.layers.2.self_attn.qkv_proj' |
| 71 | + or name == 'model.layers.2.mlp.gate_up_proj'): |
| 72 | + # All other layers (layer index >= 2) are not quantized |
| 73 | + assert isinstance(submodule.quant_method, |
| 74 | + UnquantizedLinearMethod) |
70 | 75 |
71 | | - del vllm_model |
| 76 | + llm.apply_model(check_model) |
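
For context, the env-var-plus-callback pattern introduced above can also be used outside the pytest fixtures. Below is a minimal standalone sketch, assuming a recent vLLM where `LLM.apply_model` pickles the callback and runs it against the loaded model on each worker; the model id is a hypothetical placeholder.

import os

import torch
from vllm import LLM

# LLM.apply_model pickles the callback, so insecure serialization must be
# explicitly allowed (same env var the test sets via monkeypatch).
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"


def check_model(model):
    # Print the quantization method chosen for each submodule that has one.
    for name, submodule in model.named_modules():
        quant_method = getattr(submodule, "quant_method", None)
        if quant_method is not None:
            print(name, type(quant_method).__name__)


llm = LLM(model="<gptq-dynamic-model-id>",  # placeholder model id
          dtype=torch.float16,
          max_model_len=2048)
llm.apply_model(check_model)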