VLLM_VERSION="v0.8.4"
BASE_MODEL_NAME="Qwen/Qwen2.5-3B-Instruct"
ADAPTER_NAME="id4thomas/emotion-predictor-Qwen2.5-3B-Instruct"
ADAPTER_DIR="..." # Path to the directory containing the LoRA adapter
# Start the vLLM OpenAI-compatible server with the adapter mounted into the container
docker run --runtime nvidia --gpus all \
    -v ${ADAPTER_DIR}:/vllm-workspace/adapter \
    -v ./cache:/root/.cache/huggingface \
    -p 8010:8000 \
    --ipc=host \
    vllm/vllm-openai:${VLLM_VERSION} \
    --model ${BASE_MODEL_NAME} \
    --lora-modules "{\"name\": \"${ADAPTER_NAME}\", \"path\": \"/vllm-workspace/adapter\", \"base_model_name\": \"${BASE_MODEL_NAME}\"}" \
    --enable-lora \
    --max-lora-rank 16 \
    --served-model-name ${BASE_MODEL_NAME} \
    --gpu-memory-utilization=0.5
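
Once the container is up, the adapter is addressable through the OpenAI-compatible API under the name registered in --lora-modules, while requests naming the base model bypass the adapter. The commands below are a minimal sketch, assuming the server is reachable on the host port 8010 mapped above; the prompt content is purely illustrative.

# Sanity check: /v1/models should list both the base model and the adapter.
curl http://localhost:8010/v1/models

# Route a chat request through the LoRA adapter by naming it in "model".
curl http://localhost:8010/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "id4thomas/emotion-predictor-Qwen2.5-3B-Instruct",
        "messages": [
            {"role": "user", "content": "I finally got the job I interviewed for!"}
        ]
    }'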