import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces

# Initialize the model and tokenizer
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True,
)
print("Model loaded successfully!")


@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message.
        history: List of previous conversation messages (openai-style dicts).

    Yields:
        The accumulated response text, one character at a time.
    """
    # Copy the history so we don't mutate Gradio's state in place
    messages = list(history) if history else []

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generation config - dict format as in the official docs
    generation_config = dict(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None,
    )

    # Generate, passing a GenerationConfig exactly as in the docs
    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config),
    )

    # Trim the prompt tokens from the output, as in the official docs
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the completion, skipping special tokens
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Simulate streaming by yielding the finished response character by character
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response


# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """,
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot

        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a powerful conversational AI model.

        Built with anycoder
        """
    )

    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
        # Render the model's <think> reasoning tags; type must match ChatInterface
        chatbot=gr.Chatbot(type="messages", allow_tags=["think"]),
    )

    gr.Markdown(
        """
        ### About VibeThinker

        VibeThinker is a 1.5B-parameter conversational AI model designed for engaging
        and thoughtful conversations. Responses are sampled at temperature 0.6 with
        top-p 0.95 for balanced creativity and coherence.

        **Powered by ZeroGPU** for efficient GPU resource allocation.
        """
    )


if __name__ == "__main__":
    demo.launch()
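

# --- Optional: true token-level streaming ---
# respond() above generates the entire reply before yielding it character by
# character, so nothing appears until generation completes. Below is a minimal
# sketch of incremental streaming with transformers' TextIteratorStreamer; it
# is NOT wired into the UI above. To try it, move this definition above the
# gr.Blocks section and pass fn=respond_streaming to gr.ChatInterface.
from threading import Thread

from transformers import TextIteratorStreamer


@spaces.GPU
def respond_streaming(message, history):
    """Variant of respond() that yields text as tokens are generated."""
    messages = list(history) if history else []
    messages.append({"role": "user", "content": message})
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # skip_prompt drops the echoed input; skip_special_tokens cleans the output
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks, so run it on a background thread and drain the streamer
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=4000,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
        ),
    )
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()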