Has anyone gotten speculative decoding working?
VLLM_ATTENTION_BACKEND='FLASHINFER' NVCC_THREADS=25 MAX_JOBS=25 OMP_NUM_THREADS=25 vllm serve cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit --served-model-name 'Qwen/Qwen3-Next-80B-A3B-Instruct' --gpu-memory-utilization .85 --speculative-config '{"method": "mtp", "num_speculative_tokens": 7}'
I get the following error: (EngineCore_DP0 pid=49694) RuntimeError: Worker failed with error ''GPUModelRunner' object has no attribute 'drafter'', please check the stack trace above for the root cause error. This seems to indicate that the speculative decoding model is not present in the quantized version.
MTP is not supported in AWQ quantization; you need to use the full model.
@cpatonn, thanks for the update. Did you try this with vLLM 0.11.2 (latest stable)? I am getting the error shown below.
Command (works with vLLM 0.11 and CUDA 12.8):
vllm serve cpatonn/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit --pipeline-parallel-size 5 --tool-call-parser qwen3_xml --kv-cache-dtype fp8 --gpu-memory-utilization .85 --served-model-name 'Qwen/Qwen3-Next-80B-A3B-Instruct' --dtype float16 --swap-space 40 --enable-auto-tool-choice --reasoning-parser qwen3
Error with vLLM 0.11.2 and CUDA 13:
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] Traceback (most recent call last):
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 715, in worker_main
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] worker = WorkerProc(*args, **kwargs)
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 555, in __init__
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.worker.load_model()
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 273, in load_model
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.model_runner.load_model(eep_scale_up=eep_scale_up)
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3276, in load_model
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.model = model_loader.load_model(
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 49, in load_model
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] model = initialize_model(
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 55, in initialize_model
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] return model_class(vllm_config=vllm_config, prefix=prefix)
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_next.py", line 1218, in __init__
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.set_moe_parameters()
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_next.py", line 1158, in set_moe_parameters
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] raise RuntimeError("No Qwen3Next layer found in the model.layers.")
(Worker_PP2 pid=613) ERROR 11-28 02:19:46 [multiproc_executor.py:743] RuntimeError: No Qwen3Next layer found in the model.layers.
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] WorkerProc failed to start.
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] Traceback (most recent call last):
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 715, in worker_main
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] worker = WorkerProc(*args, **kwargs)
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 555, in __init__
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.worker.load_model()
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 273, in load_model
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.model_runner.load_model(eep_scale_up=eep_scale_up)
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3276, in load_model
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.model = model_loader.load_model(
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 49, in load_model
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] model = initialize_model(
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 55, in initialize_model
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] return model_class(vllm_config=vllm_config, prefix=prefix)
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_next.py", line 1218, in __init__
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] self.set_moe_parameters()
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_next.py", line 1158, in set_moe_parameters
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] raise RuntimeError("No Qwen3Next layer found in the model.layers.")
(Worker_PP3 pid=614) ERROR 11-28 02:19:46 [multiproc_executor.py:743] RuntimeError: No Qwen3Next layer found in the model.layers.
(Worker_PP0 pid=611) INFO 11-28 02:19:46 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_PP3 pid=614) INFO 11-28 02:19:46 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_PP2 pid=613) INFO 11-28 02:19:46 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_PP4 pid=615) INFO 11-28 02:19:46 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_PP1 pid=612) INFO 11-28 02:19:46 [multiproc_executor.py:702] Parent process exited, terminating worker
[rank0]:[W1128 02:19:47.684829291 ProcessGroupNCCL.cpp:1524] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] EngineCore failed to start.
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] Traceback (most recent call last):
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 833, in run_engine_core
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 606, in __init__
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] super().__init__(
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 102, in __init__
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in __init__
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] super().__init__(vllm_config)
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] self._init_executor()
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 171, in _init_executor
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 653, in wait_for_ready
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] raise e from None
(EngineCore_DP0 pid=555) ERROR 11-28 02:19:48 [core.py:842] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
(EngineCore_DP0 pid=555) Process EngineCore_DP0:
(EngineCore_DP0 pid=555) Traceback (most recent call last):
(EngineCore_DP0 pid=555) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=555) self.run()
(EngineCore_DP0 pid=555) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=555) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 846, in run_engine_core
(EngineCore_DP0 pid=555) raise e
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 833, in run_engine_core
(EngineCore_DP0 pid=555) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=555) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 606, in __init__
(EngineCore_DP0 pid=555) super().__init__(
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 102, in __init__
(EngineCore_DP0 pid=555) self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=555) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 96, in __init__
(EngineCore_DP0 pid=555) super().__init__(vllm_config)
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=555) self._init_executor()
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 171, in _init_executor
(EngineCore_DP0 pid=555) self.workers = WorkerProc.wait_for_ready(unready_workers)
(EngineCore_DP0 pid=555) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=555) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 653, in wait_for_ready
(EngineCore_DP0 pid=555) raise e from None
(EngineCore_DP0 pid=555) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
(APIServer pid=437) Traceback (most recent call last):
(APIServer pid=437) File "/usr/local/bin/vllm", line 10, in <module>
(APIServer pid=437) sys.exit(main())
(APIServer pid=437) ^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main
(APIServer pid=437) args.dispatch_function(args)
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd
(APIServer pid=437) uvloop.run(run_server(args))
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=437) return __asyncio.run(
(APIServer pid=437) ^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
(APIServer pid=437) return runner.run(main)
(APIServer pid=437) ^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=437) return self._loop.run_until_complete(task)
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=437) return await main
(APIServer pid=437) ^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 2024, in run_server
(APIServer pid=437) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 2043, in run_server_worker
(APIServer pid=437) async with build_async_engine_client(
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=437) return await anext(self.gen)
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 195, in build_async_engine_client
(APIServer pid=437) async with build_async_engine_client_from_engine_args(
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=437) return await anext(self.gen)
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 236, in build_async_engine_client_from_engine_args
(APIServer pid=437) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/utils/func_utils.py", line 116, in inner
(APIServer pid=437) return fn(*args, **kwargs)
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 203, in from_vllm_config
(APIServer pid=437) return cls(
(APIServer pid=437) ^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 133, in __init__
(APIServer pid=437) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 121, in make_async_mp_client
(APIServer pid=437) return AsyncMPClient(*client_args)
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 808, in __init__
(APIServer pid=437) super().__init__(
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 469, in __init__
(APIServer pid=437) with launch_core_engines(vllm_config, executor_class, log_stats) as (
(APIServer pid=437) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=437) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=437) next(self.gen)
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 907, in launch_core_engines
(APIServer pid=437) wait_for_engine_startup(
(APIServer pid=437) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 964, in wait_for_engine_startup
(APIServer pid=437) raise RuntimeError(
(APIServer pid=437) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}