CitizenClimate / vllm_server.py
gyrmo's picture
I now have more GPU, therefore I have now reduced the GPU utilisation to 0.8
9b002ee verified
raw
history blame
2.57 kB
# -*- coding: utf-8 -*-
#This is not how you run a notebook
#But I need to start up a vLLM instance to save myself
#So one does as they must
#Import the necessary packages and cry
import subprocess
import socket
import time
import os
#Making sure that the port is in use, otherwise it'll make sure it doesn't crash constantly
def is_port_in_use(port: int) -> bool:
    """Return True when something is already listening on localhost:port.

    Probes by attempting a TCP connect; ``connect_ex`` returns 0 on a
    successful handshake (port occupied) and an errno otherwise.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        return probe.connect_ex(("localhost", port)) == 0
    finally:
        probe.close()
#Then we start the LLM
def start_vllm():
    """Launch a vLLM OpenAI-compatible API server as a child process.

    Returns:
        The ``subprocess.Popen`` handle for the server process, or ``None``
        when something is already listening on port 8000 (so re-running the
        script does not crash by starting a second server).
    """
    # Bail out early if a server is already up on the API port.
    if is_port_in_use(8000):
        print("vLLM already running.")
        return None
    # BUG FIX: the original key, 'Llama 3.3-70B-Instruct-AWQ', contains
    # spaces and dots, so no shell can export it and the lookup always fell
    # through to the default. Use a legal env-var name; the default model
    # is unchanged, so existing behavior is preserved.
    model_name = os.getenv("MODEL_NAME", "kosbu/Llama-3.3-70B-Instruct-AWQ")
    cmd = [
        "python",
        "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", "8000",
        "--host", "0.0.0.0",
        "--gpu-memory-utilization", "0.8",  # leave VRAM headroom (see commit note)
        "--max-model-len", "4096",
        "--max-num-seqs", "8",
        "--swap-space", "4",  # CPU swap space in GiB for preempted requests
        "--trust-remote-code",
        "--enforce-eager",  # skip CUDA-graph capture to reduce startup memory
    ]
    print(f"Starting vLLM server with model: {model_name}")
    process = subprocess.Popen(cmd)
    return process
#I have a function to ensure that vLLM is set up properly because we need to make sure that the model runs
def wait_for_vllm(url="http://localhost:8000/v1/models", poll=10.0, timeout=None):
    """Block until the vLLM server answers HTTP 200, polling periodically.

    BUG FIX: the original body used ``requests`` (never imported anywhere in
    this file) and ``VLLM_URL`` (never defined), so the first call raised
    NameError. Rewritten with stdlib ``urllib.request`` and the URL made an
    explicit parameter.

    Args:
        url: Endpoint to probe; defaults to the server started by
            ``start_vllm`` (port 8000).
        poll: Seconds to sleep between probes (originally 10).
        timeout: Optional overall deadline in seconds. ``None`` (the
            default) preserves the original wait-forever behavior.

    Raises:
        TimeoutError: when ``timeout`` elapses before the server is ready.
    """
    import urllib.request  # stdlib; local import keeps module deps unchanged

    print("Waiting for vLLM to become ready...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    print("vLLM is ready.")
                    return
        except Exception:
            # Server not up yet (connection refused / 5xx) — keep waiting.
            pass
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"vLLM not ready after {timeout} seconds")
        time.sleep(poll)
#Main app running!!!
if __name__ == "__main__":
#Start the process!
vllm_process = start_vllm()
#Mic check, is this thing on?
wait_for_vllm()
#If it is, amazing!!! We now get fun text.
try:
print("vLLM server is running. Press Ctrl+C to stop.")
while True:
time.sleep(90) # keep process alive
except KeyboardInterrupt:
print("Shutting down vLLM...")
if vllm_process:
vllm_process.terminate()
vllm_process.wait() # Ensuring it actually closes
print("Cleanup complete. VRAM released.")