# -*- coding: utf-8 -*-
"""Stand-alone launcher for a vLLM OpenAI-compatible API server.

Starts the server as a subprocess (unless one is already listening on the
target port), polls its health endpoint until it is ready, then idles until
Ctrl+C, at which point the subprocess is terminated so VRAM is released.
"""
import os
import socket
import subprocess
import time
import urllib.request

# Port the vLLM OpenAI-compatible server listens on.
VLLM_PORT = 8000
# Health endpoint polled by wait_for_vllm(); vLLM serves /health once ready.
# (Bug fix: the original referenced VLLM_URL without ever defining it.)
VLLM_URL = f"http://localhost:{VLLM_PORT}/health"


def is_port_in_use(port: int) -> bool:
    """Return True if something is already listening on localhost:*port*."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # connect_ex returns 0 on a successful connect, i.e. port occupied.
        return s.connect_ex(("localhost", port)) == 0


def start_vllm():
    """Start vLLM server with LLaMA model.

    Returns:
        The subprocess.Popen handle for the server, or None when a server
        is already listening on the port (so we don't start a second one).
    """
    if is_port_in_use(VLLM_PORT):
        print("vLLM already running.")
        return None

    # Model is overridable via the VLLM_MODEL environment variable.
    # (Bug fix: the original used the model name itself — spaces and all —
    # as the env-var key, so the override could never be set.)
    model_name = os.getenv("VLLM_MODEL", "kosbu/Llama-3.3-70B-Instruct-AWQ")

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", str(VLLM_PORT),
        "--host", "0.0.0.0",
        "--gpu-memory-utilization", "0.8",
        "--max-model-len", "4096",
        "--max-num-seqs", "8",
        "--swap-space", "4",
        "--trust-remote-code",
        "--enforce-eager",
    ]
    print(f"Starting vLLM server with model: {model_name}")
    return subprocess.Popen(cmd)


def wait_for_vllm(timeout=None):
    """Poll the vLLM health endpoint until the server is ready.

    Args:
        timeout: Maximum number of seconds to wait, or None (the default,
            matching the original behavior) to wait indefinitely.

    Returns:
        True once the server answered 200 OK; False if *timeout* elapsed.
    """
    print("Waiting for vLLM to become ready...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while deadline is None or time.monotonic() < deadline:
        try:
            # Bug fix: the original called requests.get() without importing
            # requests; stdlib urllib avoids the third-party dependency.
            with urllib.request.urlopen(VLLM_URL, timeout=5) as resp:
                if resp.status == 200:
                    print("vLLM is ready.")
                    return True
        except Exception:
            # Server not accepting connections yet — keep polling.
            pass
        time.sleep(10)
    print("Timed out waiting for vLLM.")
    return False


if __name__ == "__main__":
    # Start the server (or detect an existing one) and block until ready.
    vllm_process = start_vllm()
    wait_for_vllm()
    try:
        # Bug fix: this string literal was split across a raw newline in the
        # original, which is a SyntaxError.
        print("vLLM server is running. Press Ctrl+C to stop.")
        while True:
            time.sleep(90)  # keep the parent process alive
    except KeyboardInterrupt:
        print("Shutting down vLLM...")
        if vllm_process:
            vllm_process.terminate()
            vllm_process.wait()  # ensure the child exits so VRAM is released
        print("Cleanup complete. VRAM released.")