import spaces
import gradio as gr
from transformers import pipeline, TextIteratorStreamer
import torch
import threading


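# The pipeline bundles the model and tokenizer; keep direct handles to both
# so we can call model.generate() ourselves for token streaming.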
model_name = "krish10/Qwen3_14B_16bit_Sleep" |
|
|
pipe = pipeline("text-generation", model=model_name, device=0) |
|
|
tokenizer = pipe.tokenizer |
|
|
model = pipe.model |
|
|
|
|
|
|
|
|
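# Decoding defaults: a low temperature keeps answers focused and repeatable;
# top-p trims the sampling tail.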
MAX_TOKENS = 3000
TEMPERATURE = 0.1
TOP_P = 0.9


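# On ZeroGPU Spaces, @spaces.GPU allocates a GPU only for the duration of
# each call to this function.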
@spaces.GPU
def respond_stream(user_input):
    if not user_input.strip():
        # This function is a generator, so the error must be yielded;
        # a plain `return "..."` value never reaches the Gradio UI.
        yield "❌ Error: Input text is required."
        return

    prompt = f"Instruction: \n\n{user_input.strip()}"

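    # Wrap the prompt in the model's chat template so generation starts at
    # the assistant turn.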
messages = [{"role": "user", "content": prompt}] |
|
|
prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) |
|
|
|
|
|
|
|
|
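    # TextIteratorStreamer turns generate() output into an iterator of decoded
    # text chunks; skip_prompt drops the echoed input.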
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        # Pass the attention mask explicitly so generate() does not have to
        # infer it from the pad token.
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        max_new_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        do_sample=True,
        # Use EOS as the pad token, a common fallback when none is set.
        pad_token_id=tokenizer.eos_token_id,
    )

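    # model.generate() blocks until generation finishes, so run it on a
    # worker thread and consume the streamer from this one.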
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

partial_text = "" |
|
|
for token in streamer: |
|
|
partial_text += token |
|
|
yield partial_text |
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    gr.Markdown("## 🤖 Sleep-trained Qwen3-14B")

    with gr.Column():
        user_input = gr.Textbox(label="Input Text", lines=15, placeholder="Paste your full input here")
        output_box = gr.Textbox(label="Model Response", lines=15, interactive=False)
        generate_btn = gr.Button("Generate")

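    # respond_stream is a generator, so Gradio streams each yielded string
    # into output_box as it arrives.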
    generate_btn.click(
        fn=respond_stream,
        inputs=[user_input],
        outputs=[output_box],
    )


if __name__ == "__main__":
    demo.launch()