CitizenClimate / vllm_server.py
gyrmo's picture
I now have more GPU, therefore I have now reduced the GPU utilisation to 0.8
9b002ee verified
raw
history blame
2.57 kB
# -*- coding: utf-8 -*-
#This is not how you run a notebook
#But I need to start up a vLLM instance to save myself
#So one does as they must
#Import the necessary packages and cry
import subprocess
import socket
import time
import os
#Making sure that the port is in use, otherwise it'll make sure it doesn't crash constantly
def is_port_in_use(port: int) -> bool:
    """Return True when something is already listening on localhost:port.

    Probes by attempting a TCP connect; ``connect_ex`` returns 0 on a
    successful handshake (port occupied) and an errno otherwise.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        return probe.connect_ex(("localhost", port)) == 0
    finally:
        probe.close()
#Then we start the LLM
def start_vllm():
    """Launch a vLLM OpenAI-compatible API server as a child process.

    Returns:
        The ``subprocess.Popen`` handle for the server process, or ``None``
        when something is already listening on port 8000 (so re-running the
        script does not crash by starting a second server).
    """
    # Bail out early if a server is already up on the API port.
    if is_port_in_use(8000):
        print("vLLM already running.")
        return None
    # BUG FIX: the original key, 'Llama 3.3-70B-Instruct-AWQ', contains
    # spaces and dots, so no shell can export it and the lookup always fell
    # through to the default. Use a legal env-var name; the default model
    # is unchanged, so existing behavior is preserved.
    model_name = os.getenv("MODEL_NAME", "kosbu/Llama-3.3-70B-Instruct-AWQ")
    cmd = [
        "python",
        "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", "8000",
        "--host", "0.0.0.0",
        "--gpu-memory-utilization", "0.8",  # leave VRAM headroom (see commit note)
        "--max-model-len", "4096",
        "--max-num-seqs", "8",
        "--swap-space", "4",  # CPU swap space in GiB for preempted requests
        "--trust-remote-code",
        "--enforce-eager",  # skip CUDA-graph capture to reduce startup memory
    ]
    print(f"Starting vLLM server with model: {model_name}")
    process = subprocess.Popen(cmd)
    return process
#I have a function to ensure that vLLM is set up properly because we need to make sure that the model runs
def wait_for_vllm(url="http://localhost:8000/v1/models", poll=10.0, timeout=None):
    """Block until the vLLM server answers HTTP 200, polling periodically.

    BUG FIX: the original body used ``requests`` (never imported anywhere in
    this file) and ``VLLM_URL`` (never defined), so the first call raised
    NameError. Rewritten with stdlib ``urllib.request`` and the URL made an
    explicit parameter.

    Args:
        url: Endpoint to probe; defaults to the server started by
            ``start_vllm`` (port 8000).
        poll: Seconds to sleep between probes (originally 10).
        timeout: Optional overall deadline in seconds. ``None`` (the
            default) preserves the original wait-forever behavior.

    Raises:
        TimeoutError: when ``timeout`` elapses before the server is ready.
    """
    import urllib.request  # stdlib; local import keeps module deps unchanged

    print("Waiting for vLLM to become ready...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    print("vLLM is ready.")
                    return
        except Exception:
            # Server not up yet (connection refused / 5xx) — keep waiting.
            pass
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"vLLM not ready after {timeout} seconds")
        time.sleep(poll)
#Main app running!!!
if __name__ == "__main__":
#Start the process!
vllm_process = start_vllm()
#Mic check, is this thing on?
wait_for_vllm()
#If it is, amazing!!! We now get fun text.
try:
print("vLLM server is running. Press Ctrl+C to stop.")
while True:
time.sleep(90) # keep process alive
except KeyboardInterrupt:
print("Shutting down vLLM...")
if vllm_process:
vllm_process.terminate()
vllm_process.wait() # Ensuring it actually closes
print("Cleanup complete. VRAM released.")