# Hugging Face Space: tiny streaming chatbot + JSON API
#   • Gradio UI with incremental token streaming
#   • POST /api/generate -> {"response": "..."}
#   • Easily swap the model path / prompt template later
#
# Tested on the HF free CPU tier (2 vCPU, 16 GB RAM)
# ---------------------------------------------------------
import os

import gradio as gr
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
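# A minimal requirements.txt for this Space (package names inferred from the
# imports above; pin versions as needed) would look roughly like:
#   gradio
#   fastapi
#   uvicorn
#   llama-cpp-python
#   huggingface_hub
#   pydantic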
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"   # ~0.7 GB, 4-bit (Q4_K_M)
N_CTX = 4096          # tokens of context
MAX_TOKENS = 512      # generation limit
# ---------- model load (one-time) ----------
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
    n_gpu_layers=0,       # CPU-only
    logits_all=False,
    use_mlock=True,
)
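# Optional one-off smoke test (kept commented out so it never runs on Space
# startup); it uses the same llama-cpp-python completion API as the handlers below:
# print(llm.create_completion("<|system|>Hi<|user|>Say hello<|assistant|>",
#                             max_tokens=16)["choices"][0]["text"])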
SYSTEM_PROMPT = "You are a helpful, concise news assistant."
# ---------- streaming generation ----------
def stream_chat(prompt, history):
    # llama.cpp wants the full conversation in a single string
    dialogue = [f"<|system|>{SYSTEM_PROMPT}"]
    for user, bot in history:
        dialogue.append(f"<|user|>{user}")
        dialogue.append(f"<|assistant|>{bot}")
    dialogue.append(f"<|user|>{prompt}")
    dialogue.append("<|assistant|>")  # cue the model to answer; otherwise the
                                      # "<|assistant|>" stop string can end the
                                      # generation before the reply starts
    final_prompt = "\n".join(dialogue)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial
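# For illustration, with one prior exchange in history the rendered final_prompt
# looks like this (the messages are examples only):
#   <|system|>You are a helpful, concise news assistant.
#   <|user|>Hi
#   <|assistant|>Hello! How can I help?
#   <|user|>Summarise today's top story
#   <|assistant|>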
# ---------- Gradio interface ----------
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot - streams as it thinks")
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_btn = gr.Button("Send", variant="primary")

    def user_submit(message, chat_history):
        chat_history = chat_history + [[message, ""]]
        return "", chat_history

    def bot_reply(chat_history):
        user_msg = chat_history[-1][0]
        gen = stream_chat(user_msg, chat_history[:-1])
        for answer in gen:
            chat_history[-1][1] = answer
            yield chat_history

    txt.submit(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
    send_btn.click(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
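# Note on the data shape: chat_history uses the Chatbot component's pair format,
# e.g. [["Hi", "Hello! How can I help?"], ["Summarise today's top story", ""]].
# user_submit appends a pair with an empty reply, and bot_reply fills that slot
# in token by token via the generator above.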
# ---------- JSON API ----------
app = FastAPI()
if os.path.isdir("static"):  # optional assets; skip the mount if the folder is absent
    app.mount("/static", StaticFiles(directory="static"), name="static")
class GenRequest(BaseModel):
    prompt: str
    max_tokens: int | None = None
@app.post("/api/generate")
async def api_generate(req: GenRequest):
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")
    gen = llm.create_completion(
        f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
        max_tokens=req.max_tokens or MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=False,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    return {"response": gen["choices"][0]["text"].strip()}
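# Example call once the Space is up (host and port shown are assumptions for a
# local run; on Hugging Face use the Space's public URL instead):
#   curl -X POST http://localhost:7860/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarise the news in one sentence", "max_tokens": 64}'
# -> {"response": "..."}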
# Mount the Gradio UI last so the /api/* route above keeps precedence over the
# catch-all mount; the UI stays available on root "/".
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":  # HF launches `python app.py`
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))