import spaces
import gradio as gr
from transformers import pipeline, TextIteratorStreamer
import torch
import threading


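# The pipeline bundles the model and tokenizer; keep direct handles to both
# so we can call model.generate() ourselves for token streaming.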
model_name = "krish10/Qwen3_14B_16bit_Sleep" |
|
|
pipe = pipeline("text-generation", model=model_name, device=0) |
|
|
tokenizer = pipe.tokenizer |
|
|
model = pipe.model |
|
|
|
|
|
|
|
|
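# Decoding defaults: a low temperature keeps answers focused and repeatable;
# top-p trims the sampling tail.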
MAX_TOKENS = 3000
TEMPERATURE = 0.1
TOP_P = 0.9


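# On ZeroGPU Spaces, @spaces.GPU allocates a GPU only for the duration of
# each call to this function.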
@spaces.GPU
def respond_stream(user_input):
    if not user_input.strip():
        # This function is a generator, so the error must be yielded;
        # a plain `return "..."` value never reaches the Gradio UI.
        yield "❌ Error: Input text is required."
        return

    prompt = f"Instruction: \n\n{user_input.strip()}"

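    # Wrap the prompt in the model's chat template so generation starts at
    # the assistant turn.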
messages = [{"role": "user", "content": prompt}] |
|
|
prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
|
|
|
|
|
|
|
|
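    # TextIteratorStreamer turns generate() output into an iterator of decoded
    # text chunks; skip_prompt drops the echoed input.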
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        # Pass the attention mask explicitly so generate() does not have to
        # infer it from the pad token.
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        max_new_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        do_sample=True,
        # Use EOS as the pad token, a common fallback when none is set.
        pad_token_id=tokenizer.eos_token_id,
    )

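    # model.generate() blocks until generation finishes, so run it on a
    # worker thread and consume the streamer from this one.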
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

partial_text = "" |
|
|
for token in streamer: |
|
|
partial_text += token |
|
|
yield partial_text |
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Sleep-trained Qwen3-14B")

    with gr.Column():
        user_input = gr.Textbox(label="Input Text", lines=15, placeholder="Paste your full input here")
        output_box = gr.Textbox(label="Model Response", lines=15, interactive=False)
        generate_btn = gr.Button("Generate")

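    # respond_stream is a generator, so Gradio streams each yielded string
    # into output_box as it arrives.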
    generate_btn.click(
        fn=respond_stream,
        inputs=[user_input],
        outputs=[output_box],
    )


if __name__ == "__main__":
    demo.launch()