import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import torch
import spaces

# Initialize the model and tokenizer
print("Loading VibeThinker model...")
model = AutoModelForCausalLM.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "WeiboAI/VibeThinker-1.5B",
    trust_remote_code=True,
)
print("Model loaded successfully!")


@spaces.GPU
def respond(message, history):
    """
    Generate a streaming response for the chatbot.

    Args:
        message: The user's current message.
        history: List of previous conversation messages (openai-style dicts).

    Yields:
        The accumulated response text, one character at a time.
    """
    # Copy the history so we don't mutate Gradio's state in place
    messages = list(history) if history else []

    # Add the current user message
    messages.append({"role": "user", "content": message})

    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Generation config - dict format as in the official docs
    generation_config = dict(
        max_new_tokens=4000,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        top_k=None,
    )

    # Generate, passing a GenerationConfig exactly as in the docs
    generated_ids = model.generate(
        **model_inputs,
        generation_config=GenerationConfig(**generation_config),
    )

    # Trim the prompt tokens from the output, as in the official docs
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    # Decode the completion, skipping special tokens
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Simulate streaming by yielding the finished response character by character
    partial_response = ""
    for char in response:
        partial_response += char
        yield partial_response


# Create the Gradio interface
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
    .header-link { text-decoration: none; color: inherit; }
    .header-link:hover { text-decoration: underline; }
    """,
) as demo:
    gr.Markdown(
        """
        # 💭 VibeThinker Chatbot

        Chat with [WeiboAI/VibeThinker-1.5B](https://huggingface.co/WeiboAI/VibeThinker-1.5B) - a powerful conversational AI model.

        Built with anycoder
        """
    )

    gr.ChatInterface(
        fn=respond,
        type="messages",
        title="",
        description="Ask me anything! I'm powered by VibeThinker with ZeroGPU acceleration.",
        examples=[
            "What is 2 + 2?",
            "Tell me a short joke",
            "What is the capital of France?",
            "Explain AI in one sentence",
        ],
        cache_examples=False,
        # Render the model's <think> reasoning tags; type must match ChatInterface
        chatbot=gr.Chatbot(type="messages", allow_tags=["think"]),
    )

    gr.Markdown(
        """
        ### About VibeThinker

        VibeThinker is a 1.5B-parameter conversational AI model designed for engaging
        and thoughtful conversations. Responses are sampled at temperature 0.6 with
        top-p 0.95 for balanced creativity and coherence.

        **Powered by ZeroGPU** for efficient GPU resource allocation.
        """
    )


if __name__ == "__main__":
    demo.launch()
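

# --- Optional: true token-level streaming ---
# respond() above generates the entire reply before yielding it character by
# character, so nothing appears until generation completes. Below is a minimal
# sketch of incremental streaming with transformers' TextIteratorStreamer; it
# is NOT wired into the UI above. To try it, move this definition above the
# gr.Blocks section and pass fn=respond_streaming to gr.ChatInterface.
from threading import Thread

from transformers import TextIteratorStreamer


@spaces.GPU
def respond_streaming(message, history):
    """Variant of respond() that yields text as tokens are generated."""
    messages = list(history) if history else []
    messages.append({"role": "user", "content": message})
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # skip_prompt drops the echoed input; skip_special_tokens cleans the output
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks, so run it on a background thread and drain the streamer
    thread = Thread(
        target=model.generate,
        kwargs=dict(
            **model_inputs,
            streamer=streamer,
            max_new_tokens=4000,
            do_sample=True,
            temperature=0.6,
            top_p=0.95,
        ),
    )
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial
    thread.join()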