# -*- coding: utf-8 -*-
"""Stand-alone launcher for a vLLM OpenAI-compatible API server.

Starts the server as a subprocess (unless one is already listening on the
target port), polls its health endpoint until it is ready, then idles until
Ctrl+C, at which point the subprocess is terminated so VRAM is released.
"""
import os
import socket
import subprocess
import time
import urllib.request

# Port the vLLM OpenAI-compatible server listens on.
VLLM_PORT = 8000
# Health endpoint polled by wait_for_vllm(); vLLM serves /health once ready.
# (Bug fix: the original referenced VLLM_URL without ever defining it.)
VLLM_URL = f"http://localhost:{VLLM_PORT}/health"


def is_port_in_use(port: int) -> bool:
    """Return True if something is already listening on localhost:*port*."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # connect_ex returns 0 on a successful connect, i.e. port occupied.
        return s.connect_ex(("localhost", port)) == 0


def start_vllm():
    """Start vLLM server with LLaMA model.

    Returns:
        The subprocess.Popen handle for the server, or None when a server
        is already listening on the port (so we don't start a second one).
    """
    if is_port_in_use(VLLM_PORT):
        print("vLLM already running.")
        return None

    # Model is overridable via the VLLM_MODEL environment variable.
    # (Bug fix: the original used the model name itself — spaces and all —
    # as the env-var key, so the override could never be set.)
    model_name = os.getenv("VLLM_MODEL", "kosbu/Llama-3.3-70B-Instruct-AWQ")

    cmd = [
        "python", "-m", "vllm.entrypoints.openai.api_server",
        "--model", model_name,
        "--port", str(VLLM_PORT),
        "--host", "0.0.0.0",
        "--gpu-memory-utilization", "0.8",
        "--max-model-len", "4096",
        "--max-num-seqs", "8",
        "--swap-space", "4",
        "--trust-remote-code",
        "--enforce-eager",
    ]
    print(f"Starting vLLM server with model: {model_name}")
    return subprocess.Popen(cmd)


def wait_for_vllm(timeout=None):
    """Poll the vLLM health endpoint until the server is ready.

    Args:
        timeout: Maximum number of seconds to wait, or None (the default,
            matching the original behavior) to wait indefinitely.

    Returns:
        True once the server answered 200 OK; False if *timeout* elapsed.
    """
    print("Waiting for vLLM to become ready...")
    deadline = None if timeout is None else time.monotonic() + timeout
    while deadline is None or time.monotonic() < deadline:
        try:
            # Bug fix: the original called requests.get() without importing
            # requests; stdlib urllib avoids the third-party dependency.
            with urllib.request.urlopen(VLLM_URL, timeout=5) as resp:
                if resp.status == 200:
                    print("vLLM is ready.")
                    return True
        except Exception:
            # Server not accepting connections yet — keep polling.
            pass
        time.sleep(10)
    print("Timed out waiting for vLLM.")
    return False


if __name__ == "__main__":
    # Start the server (or detect an existing one) and block until ready.
    vllm_process = start_vllm()
    wait_for_vllm()
    try:
        # Bug fix: this string literal was split across a raw newline in the
        # original, which is a SyntaxError.
        print("vLLM server is running. Press Ctrl+C to stop.")
        while True:
            time.sleep(90)  # keep the parent process alive
    except KeyboardInterrupt:
        print("Shutting down vLLM...")
        if vllm_process:
            vllm_process.terminate()
            vllm_process.wait()  # ensure the child exits so VRAM is released
        print("Cleanup complete. VRAM released.")