@@ -117,68 +117,59 @@ def model(x):
 
 @create_new_process_for_each_test()
 @pytest.mark.parametrize(
-    "model, use_v1",
+    "model",
     [
         # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        "meta-llama/Llama-3.2-1B",
         # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", True),
+        "facebook/opt-125m",
     ],
 )
-def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
-    with monkeypatch.context() as m:
-        assert use_v1
-        m.setenv("VLLM_USE_V1", "1")
-        free, total = torch.cuda.mem_get_info()
-        used_bytes_baseline = total - free  # in case other process is running
-        llm = LLM(model, enable_sleep_mode=True)
-        prompt = "How are you?"
-        sampling_params = SamplingParams(temperature=0, max_tokens=10)
-        output = llm.generate(prompt, sampling_params)
-
-        # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
-        # which is difficult to measure in the test. therefore, we only
-        # test sleep level 1 here.
-        llm.sleep(level=1)
-
-        free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
-        # now the memory usage is mostly cudagraph memory pool,
-        # and it should be less than the model weights (1B model, 2GiB weights)
-
-        # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
-        # is captured but cannot be released from PyTorch due to a known bug,
-        # therefore high memory usage after `llm.sleep` is called is expected.
-        # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
-        # in V1.
-        if use_v1:
-            assert used_bytes < 7 * GiB_bytes
-        else:
-            assert used_bytes < 2 * GiB_bytes
-
-        llm.wake_up()
-        output2 = llm.generate(prompt, sampling_params)
-        # cmp output
-        assert output[0].outputs[0].text == output2[0].outputs[0].text
-
-        llm.sleep(level=1)
-        llm.wake_up(tags=["weights"])
-
-        free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
-        used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
-
-        # should just reallocate memory for weights (1B model, ~2GiB weights)
-        if use_v1:
-            assert used_bytes < 10 * GiB_bytes
-        else:
-            assert used_bytes < 6 * GiB_bytes
-
-        # now allocate kv cache memory
-        llm.wake_up(tags=["kv_cache"])
-        output3 = llm.generate(prompt, sampling_params)
-
-        # cmp output
-        assert output[0].outputs[0].text == output3[0].outputs[0].text
+def test_end_to_end(model: str):
+    free, total = torch.cuda.mem_get_info()
+    used_bytes_baseline = total - free  # in case other process is running
+    llm = LLM(model, enable_sleep_mode=True)
+    prompt = "How are you?"
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    output = llm.generate(prompt, sampling_params)
+
+    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
+    # which is difficult to measure in the test. therefore, we only
+    # test sleep level 1 here.
+    llm.sleep(level=1)
+
+    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
+    # now the memory usage is mostly cudagraph memory pool,
+    # and it should be less than the model weights (1B model, 2GiB weights)
+
+    # NOTE: In V1, the memory buffer for logits (max_num_reqs x vocab_size)
+    # is captured but cannot be released from PyTorch due to a known bug,
+    # therefore high memory usage after `llm.sleep` is called is expected.
+    # FIXME(youkaichao & ywang96): Fix memory buffer issue with sleep mode
+    # in V1.
+    assert used_bytes < 7 * GiB_bytes
+
+    llm.wake_up()
+    output2 = llm.generate(prompt, sampling_params)
+    # cmp output
+    assert output[0].outputs[0].text == output2[0].outputs[0].text
+
+    llm.sleep(level=1)
+    llm.wake_up(tags=["weights"])
+
+    free_gpu_bytes_wake_up_w, total = torch.cuda.mem_get_info()
+    used_bytes = total - free_gpu_bytes_wake_up_w - used_bytes_baseline
+
+    # should just reallocate memory for weights (1B model, ~2GiB weights)
+    assert used_bytes < 10 * GiB_bytes
+
+    # now allocate kv cache memory
+    llm.wake_up(tags=["kv_cache"])
+    output3 = llm.generate(prompt, sampling_params)
+
+    # cmp output
+    assert output[0].outputs[0].text == output3[0].outputs[0].text
 
 
 @create_new_process_for_each_test()
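
For reference, a minimal sketch (outside the diff) of the sleep/wake-up flow and memory accounting the updated test exercises. The model name, prompt, and the 7 GiB bound are taken from the test itself; the `vllm.utils` import path for `GiB_bytes` is an assumption based on the test module's usage, not shown in this hunk.

```python
# Minimal sketch, not part of the change: condensed sleep/wake-up flow.
import torch
from vllm import LLM, SamplingParams
from vllm.utils import GiB_bytes  # assumed import path for GiB_bytes

free, total = torch.cuda.mem_get_info()
baseline = total - free  # memory already used by other processes

llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate("How are you?", params)

llm.sleep(level=1)  # release GPU memory; level 1 is what the test measures
free_after, total = torch.cuda.mem_get_info()
assert total - free_after - baseline < 7 * GiB_bytes  # bound from the test

llm.wake_up(tags=["weights"])   # reallocate weights only
llm.wake_up(tags=["kv_cache"])  # then reallocate the KV cache
output2 = llm.generate("How are you?", params)
assert output[0].outputs[0].text == output2[0].outputs[0].text
```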