---
# Docker Compose stack with two services built from the same image, each
# reserving one NVIDIA GPU:
#   * vllm-iquest-optimized - passes vLLM serving flags to the image's
#     default entrypoint, with the model directory mounted at /model.
#   * benchmarking - overrides the entrypoint to run
#     /scripts/benchmark_optimized.py with python3.
services:
  vllm-iquest-optimized:
    image: vllm-blackwell-official:latest
    container_name: vllm-iquest-nvfp4-hq
    environment:
      - VLLM_USE_V1=1
      - NVIDIA_VISIBLE_DEVICES=all
      - VLLM_LOGGING_LEVEL=DEBUG
    volumes:
      # Path is relative to this compose file: the model directory sits one
      # level above it.
      - ../NVFP4-Packed:/model
    ports:
      # host:container. Kept quoted so the mapping is always read as a string.
      - "8001:8000"
    # No entrypoint is set for this service, so these flags are handed to the
    # image's own entrypoint.
    # NOTE(review): presumably that entrypoint is the vLLM API server - confirm
    # against the vllm-blackwell-official image.
    # `>-` folds the lines below into one space-separated string with no
    # trailing newline.
    command: >-
      --model /model
      --served-model-name iquest-coder-40b-loop
      --quantization modelopt
      --trust-remote-code
      --tensor-parallel-size 1
      --gpu-memory-utilization 0.8
      --max-model-len 32768
      --enforce-eager
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]

  benchmarking:
    image: vllm-blackwell-official:latest
    container_name: iquest-benchmark
    volumes:
      - ../NVFP4-Packed:/model
      # The directory holding this compose file is mounted at /scripts, which
      # is where the command below looks for the benchmark script.
      - .:/scripts
    # Replaces the image's default entrypoint so the container runs the
    # script directly with python3.
    entrypoint: python3
    command: /scripts/benchmark_optimized.py
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]