import threading

import gradio as gr
import spaces
from transformers import pipeline, TextIteratorStreamer

# Load model and tokenizer via a text-generation pipeline
model_name = "krish10/Qwen3_14B_16bit_Sleep"
pipe = pipeline("text-generation", model=model_name, device=0)
tokenizer = pipe.tokenizer
model = pipe.model

# Fixed generation config
MAX_TOKENS = 3000
TEMPERATURE = 0.1
TOP_P = 0.9


@spaces.GPU
def respond_stream(user_input):
    # Validate input. This function is a generator (it uses `yield` below),
    # so the error must be yielded, not returned; a `return <value>` in a
    # generator would never reach the Gradio output box.
    if not user_input.strip():
        yield "❌ Error: Input text is required."
        return

    # Use the entire input directly in the prompt
    prompt = f"Instruction: \n\n{user_input.strip()}"

    # Wrap into a message for the chat template
    messages = [{"role": "user", "content": prompt}]
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize and prepare the streamer; use the model's own device rather
    # than hardcoding "cuda" so the tensors always land where the weights are
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # avoids the pad/attention warning
        streamer=streamer,
        max_new_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Run generation in a background thread so tokens can be streamed
    # to the UI as they are produced
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated text after each new token
    partial_text = ""
    for token in streamer:
        partial_text += token
        yield partial_text


# Build Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Sleep-trained Qwen3-14B")
    with gr.Column():
        user_input = gr.Textbox(
            label="Input Text", lines=15, placeholder="Paste your full input here"
        )
        output_box = gr.Textbox(label="Model Response", lines=15, interactive=False)
        generate_btn = gr.Button("Generate")

    generate_btn.click(
        fn=respond_stream,
        inputs=[user_input],
        outputs=[output_box],
    )

# Launch the app; queue() ensures generator (streaming) outputs are
# delivered incrementally on older Gradio versions and is harmless on recent ones
if __name__ == "__main__":
    demo.queue()
    demo.launch()
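
# --- Optional console smoke test (a sketch, not part of the app itself) ---
# Because `respond_stream` is a plain generator, it can be exercised without
# the UI. The prompt string below is a hypothetical example. To try it, run
# this instead of `demo.launch()`; each yielded value is the accumulated
# response so far, so only the newly generated suffix is printed per step.
#
# previous = ""
# for partial in respond_stream("Explain the idea behind sleep training a model."):
#     print(partial[len(previous):], end="", flush=True)
#     previous = partial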