Status: Closed
Labels: bug (Something isn't working), ci-failure (Issue about an unexpected test failure in CI)
Description
Your current environment
Failing on main as of commit 9609327
🐛 Describe the bug
Failing tests:
FAILED quantization/test_cpu_offload.py::test_cpu_offload_gptq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_awq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_compressed_tensors - AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same.
ref_args=[] ref_envs=None
compare_args=['--cpu-offload-gb', '1'] compare_envs=None
ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
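To reproduce the output mismatch locally, here is a minimal sketch of the comparison the test performs. The import path and working directory are assumptions (it is written as if run from the vLLM repo's tests/ directory); the model name, CLI args, env var, and max_wait_seconds are copied from quantization/test_cpu_offload.py as shown in the traceback below.

```python
# Hypothetical local repro sketch -- assumes it is run from the vLLM repo's
# tests/ directory so that `utils.compare_two_settings` resolves.
import os

# These quant methods are sensitive to dummy weights, so force real weights,
# mirroring the monkeypatch.setenv(...) calls in the failing tests.
os.environ["VLLM_TEST_FORCE_LOAD_FORMAT"] = "auto"

from utils import compare_two_settings  # noqa: E402

# Launch the same model twice -- once without offload (reference) and once
# with 1 GiB of weights offloaded to CPU -- and assert the API results match.
compare_two_settings(
    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
    [],                          # reference server args
    ["--cpu-offload-gb", "1"],   # comparison server args
    max_wait_seconds=480,
)
```

Running `pytest quantization/test_cpu_offload.py -k cpu_offload` from the same directory should exercise the same three offload tests.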
Logs
quantization/test_torchao.py::test_opt_125m_int4wo_model_loading_with_params[cuda:0] SKIPPED
quantization/test_torchao.py::test_opt_125m_int4wo_model_per_module_quant SKIPPED
=================================== FAILURES ===================================
_ test_load_8bit_bnb_model[meta-llama/Llama-Guard-3-8B-INT8-read pre-quantized llama 8-bit model] _
args = ()
kwargs = {'description': 'read pre-quantized llama 8-bit model', 'example_prompts': ['vLLM is a high-throughput and memory-effi...odels.\n', ...], 'hf_runner': <class 'tests.conftest.HfRunner'>, 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', ...}
Skipped = <class 'Skipped'>, pid = 1736, pgid = 19, _pid = 1736, _exitcode = 256
old_signal_handler = <Handlers.SIG_DFL: 0>
@functools.wraps(f)
def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
# Make the process the leader of its own process group
# to avoid sending SIGTERM to the parent process
os.setpgrp()
from _pytest.outcomes import Skipped
pid = os.fork()
print(f"Fork a new process to run a test {pid}")
if pid == 0:
try:
f(*args, **kwargs)
except Skipped as e:
# convert Skipped to exit code 0
print(str(e))
os._exit(0)
except Exception:
import traceback
traceback.print_exc()
os._exit(1)
else:
os._exit(0)
else:
pgid = os.getpgid(pid)
_pid, _exitcode = os.waitpid(pid, 0)
# ignore SIGTERM signal itself
old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
# kill all child processes
os.killpg(pgid, signal.SIGTERM)
# restore the signal handler
signal.signal(signal.SIGTERM, old_signal_handler)
> assert _exitcode == 0, (f"function {f} failed when called with"
f" args {args} and kwargs {kwargs}")
E AssertionError: function <function test_load_8bit_bnb_model at 0x7fc6998f1800> failed when called with args () and kwargs {'hf_runner': <class 'tests.conftest.HfRunner'>, 'vllm_runner': <class 'tests.conftest.VllmRunner'>, 'example_prompts': ['vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', 'Describe the basic components of a neural network and how it can be trained.\n', 'Write a short story about a robot that dreams for the first time.\n', 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n"], 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', 'description': 'read pre-quantized llama 8-bit model'}
utils.py:747: AssertionError
____________________________ test_cpu_offload_gptq _____________________________
monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc5b4cb2600>
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test GPTQ Marlin
> compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
quantization/test_cpu_offload.py:33:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
compare_all_settings(
utils.py:529: in compare_all_settings
with RemoteOpenAIServer(model,
utils.py:133: in __init__
self._wait_for_server(url=self.url_for("health"),
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tests.utils.RemoteOpenAIServer object at 0x7fc59e873800>
def _wait_for_server(self, *, url: str, timeout: float):
# run health check
start = time.time()
while True:
try:
if requests.get(url).status_code == 200:
break
except Exception:
# this exception can only be raised by requests.get,
# which means the server is not ready yet.
# the stack trace is not useful, so we suppress it
# by using `raise from None`.
result = self.proc.poll()
if result is not None and result != 0:
> raise RuntimeError("Server exited unexpectedly.") from None
E RuntimeError: Server exited unexpectedly.
utils.py:161: RuntimeError
_____________________________ test_cpu_offload_awq _____________________________
monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc59e8de900>
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test AWQ Marlin
> compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
quantization/test_cpu_offload.py:49:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
compare_all_settings(
utils.py:529: in compare_all_settings
with RemoteOpenAIServer(model,
utils.py:133: in __init__
self._wait_for_server(url=self.url_for("health"),
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tests.utils.RemoteOpenAIServer object at 0x7fc5b55128d0>
def _wait_for_server(self, *, url: str, timeout: float):
# run health check
start = time.time()
while True:
try:
if requests.get(url).status_code == 200:
break
except Exception:
# this exception can only be raised by requests.get,
# which means the server is not ready yet.
# the stack trace is not useful, so we suppress it
# by using `raise from None`.
result = self.proc.poll()
if result is not None and result != 0:
> raise RuntimeError("Server exited unexpectedly.") from None
E RuntimeError: Server exited unexpectedly.
utils.py:161: RuntimeError
_____________________ test_cpu_offload_compressed_tensors ______________________
monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc5b4b958e0>
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors(monkeypatch):
# This quant method is sensitive to dummy weights, so we force real weights
monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test w4a16_marlin24
> compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
[], ["--cpu-offload-gb", "1"],
max_wait_seconds=480)
quantization/test_cpu_offload.py:69:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
compare_all_settings(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
model = 'nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t'
all_args = [[], ['--cpu-offload-gb', '1']], all_envs = [None, None]
def compare_all_settings(model: str,
all_args: list[list[str]],
all_envs: list[Optional[dict[str, str]]],
*,
method: str = "generate",
max_wait_seconds: Optional[float] = None) -> None:
"""
Launch API server with several different sets of arguments/environments
and compare the results of the API calls with the first set of arguments.
Args:
model: The model to test.
all_args: A list of argument lists to pass to the API server.
all_envs: A list of environment dictionaries to pass to the API server.
"""
trust_remote_code = False
for args in all_args:
if "--trust-remote-code" in args:
trust_remote_code = True
break
tokenizer_mode = "auto"
for args in all_args:
if "--tokenizer-mode" in args:
tokenizer_mode = args[args.index("--tokenizer-mode") + 1]
break
tokenizer = get_tokenizer(
model,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
)
can_force_load_format = True
for args in all_args:
if "--load-format" in args:
can_force_load_format = False
break
prompt = "Hello, my name is"
token_ids = tokenizer(prompt).input_ids
ref_results: list = []
for i, (args, env) in enumerate(zip(all_args, all_envs)):
if can_force_load_format:
# we are comparing the results and
# usually we don't need real weights.
# we force to use dummy weights by default,
# and it should work for most of the cases.
# if not, we can use VLLM_TEST_FORCE_LOAD_FORMAT
# environment variable to force the load format,
# e.g. in quantization tests.
args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
compare_results: list = []
results = ref_results if i == 0 else compare_results
with RemoteOpenAIServer(model,
args,
env_dict=env,
max_wait_seconds=max_wait_seconds) as server:
client = server.get_client()
# test models list
models = client.models.list()
models = models.data
served_model = models[0]
results.append({
"test": "models_list",
"id": served_model.id,
"root": served_model.root,
})
if method == "generate":
results += _test_completion(client, model, prompt, token_ids)
elif method == "generate_close":
results += _test_completion_close(client, model, prompt)
elif method == "generate_chat":
results += _test_chat(client, model, prompt)
elif method == "generate_with_image":
results += _test_image_text(
client, model,
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
)
elif method == "encode":
results += _test_embeddings(client, model, prompt)
else:
raise ValueError(f"Unknown method: {method}")
if i > 0:
# if any setting fails, raise an error early
ref_args = all_args[0]
ref_envs = all_envs[0]
compare_args = all_args[i]
compare_envs = all_envs[i]
for ref_result, compare_result in zip(ref_results,
compare_results):
ref_result = copy.deepcopy(ref_result)
compare_result = copy.deepcopy(compare_result)
if "embedding" in ref_result and method == "encode":
sim = F.cosine_similarity(
torch.tensor(ref_result["embedding"]),
torch.tensor(compare_result["embedding"]),
dim=0,
)
assert sim >= 0.999, (
f"Embedding for {model=} are not the same.\n"
f"cosine_similarity={sim}\n")
del ref_result["embedding"]
del compare_result["embedding"]
> assert ref_result == compare_result, (
f"Results for {model=} are not the same.\n"
f"{ref_args=} {ref_envs=}\n"
f"{compare_args=} {compare_envs=}\n"
f"{ref_result=}\n"
f"{compare_result=}\n")
E AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same.
E ref_args=[] ref_envs=None
E compare_args=['--cpu-offload-gb', '1'] compare_envs=None
E ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
E compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
utils.py:582: AssertionError
...
=========================== short test summary info ============================
FAILED quantization/test_bitsandbytes.py::test_load_8bit_bnb_model[meta-llama/Llama-Guard-3-8B-INT8-read pre-quantized llama 8-bit model] - AssertionError: function <function test_load_8bit_bnb_model at 0x7fc6998f1800> failed when called with args () and kwargs {'hf_runner': <class 'tests.conftest.HfRunner'>, 'vllm_runner': <class 'tests.conftest.VllmRunner'>, 'example_prompts': ['vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', 'Describe the basic components of a neural network and how it can be trained.\n', 'Write a short story about a robot that dreams for the first time.\n', 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n"], 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', 'description': 'read pre-quantized llama 8-bit model'}
FAILED quantization/test_cpu_offload.py::test_cpu_offload_gptq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_awq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_compressed_tensors - AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same.
ref_args=[] ref_envs=None
compare_args=['--cpu-offload-gb', '1'] compare_envs=None
ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
====== 4 failed, 78 passed, 35 skipped, 49 warnings in 4561.59s (1:16:01) ======
^^^ +++
🚨 Error: The command exited with status 1
^^^ +++
user command error: The plugin docker command hook exited with status 1
~~~ Running global pre-exit hook
$ /etc/buildkite-agent/hooks/pre-exit
~~~ Running plugin docker pre-exit hook
$ /var/lib/buildkite-agent/plugins/bk-gpu-1-queue-ci-i-036cff6c74f0af4ae-1/github-com-buildkite-plugins-docker-buildkite-plugin-v5-2-0/hooks/pre-exit