# NOTE: "Spaces: Sleeping" was HuggingFace Spaces page chrome captured when this
# file was exported — it is not part of the script.
# -*- coding: utf-8 -*-
# This is not how you run a notebook,
# but I need to start up a vLLM instance to save myself,
# so one does as they must.
# Import the necessary packages and cry.
import os
import socket
import subprocess
import time
import urllib.request
# Check whether the port is already occupied, so we don't try to start a second
# server on it and crash the Space.
def is_port_in_use(port: int) -> bool:
    """Return True if a TCP server on localhost is already accepting connections on *port*."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on a successful connection, an errno otherwise.
        return probe.connect_ex(("localhost", port)) == 0
    finally:
        probe.close()
| #Then we start the LLM | |
def start_vllm():
    """Start the vLLM OpenAI-compatible API server with the LLaMA model.

    Returns:
        The subprocess.Popen handle of the launched server, or None when a
        server is already listening on port 8000 (nothing was started).
    """
    # Ensure the port is not in use — if it is, bail out instead of spawning a
    # duplicate server that would crash the Space.
    if is_port_in_use(8000):
        print("vLLM already running.")
        return None
    # BUG FIX: the env-var *key* must be a plain variable name. The original
    # passed the model id ('Llama 3.3-70B-Instruct-AWQ', containing spaces) as
    # the key, so the lookup could never match a real environment variable.
    # Read MODEL_NAME instead, keeping the same default model.
    model_name = os.getenv("MODEL_NAME", "kosbu/Llama-3.3-70B-Instruct-AWQ")
    cmd = [
        "python",
        "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", "8000",
        "--host", "0.0.0.0",
        "--gpu-memory-utilization", "0.8",  # leave some VRAM headroom
        "--max-model-len", "4096",
        "--max-num-seqs", "8",
        "--swap-space", "4",  # GiB of CPU swap for KV-cache overflow
        "--trust-remote-code",
        "--enforce-eager",  # skip CUDA-graph capture to reduce VRAM use
    ]
    print(f"Starting vLLM server with model: {model_name}")
    process = subprocess.Popen(cmd)
    return process
| #I have a function to ensure that vLLM is set up properly because we need to make sure that the model runs | |
def wait_for_vllm(url: str = "http://localhost:8000/health", poll_seconds: float = 10.0) -> None:
    """Block until the vLLM server answers HTTP 200, so chatting can start.

    BUG FIX: the original polled an undefined name VLLM_URL with the
    ``requests`` package, which is never imported anywhere in this file —
    calling it raised NameError immediately. Poll a parameterized URL
    (vLLM exposes a /health endpoint) with stdlib urllib instead.

    Args:
        url: Health-check endpoint to poll.
        poll_seconds: Delay between consecutive polls.
    """
    print("Waiting for vLLM to become ready...")
    while True:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    print("vLLM is ready.")
                    return
        except Exception:
            # Server not accepting connections yet — keep polling.
            pass
        time.sleep(poll_seconds)
# Main app running!!!
if __name__ == "__main__":
    # Start the server (None means one was already listening on the port).
    vllm_process = start_vllm()
    try:
        # Mic check, is this thing on? Blocks until the server responds.
        wait_for_vllm()
        print("vLLM server is running. Press Ctrl+C to stop.")
        while True:
            time.sleep(90)  # keep the launcher process alive
    except KeyboardInterrupt:
        print("Shutting down vLLM...")
    finally:
        # ROBUSTNESS FIX: cleanup now runs in `finally`, so the child server is
        # reaped even if wait_for_vllm() raises something other than
        # KeyboardInterrupt — otherwise the vLLM process (and its VRAM) leaked.
        if vllm_process:
            vllm_process.terminate()
            vllm_process.wait()  # Ensuring it actually closes
            print("Cleanup complete. VRAM released.")