# Hugging Face Space – tiny streaming chatbot + JSON API
# • Gradio UI with incremental token streaming
# • POST /api/generate → {"response": "..."}
# β€’ Easily swap the model path / prompt template later
#
# Tested on the HF free CPU tier (2 vCPU, 16 GB RAM)
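#
# Example request once the Space is running (illustrative only; the URL is a
# placeholder for your own Space host):
#
#   curl -X POST https://<your-space>.hf.space/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarise the latest headlines in one sentence."}'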
# ---------------------------------------------------------
import os, asyncio
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi.staticfiles import StaticFiles
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"  # ~0.7 GB, 4-bit quant
N_CTX = 4096 # tokens of context
MAX_TOKENS = 512 # generation limit
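# Note: the prompt plus up to MAX_TOKENS generated tokens must fit inside the
# N_CTX-token context window, so very long conversations eventually need trimming.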
# ---------- model load (one-time) ----------
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
    n_gpu_layers=0,                            # CPU-only
    logits_all=False,
    use_mlock=True,
)
SYSTEM_PROMPT = "You are a helpful, concise news assistant."
# ---------- streaming generation ----------
def stream_chat(prompt, history):
    # llama.cpp wants the full conversation flattened into a single string
    dialogue = [f"<|system|>{SYSTEM_PROMPT}"]
    for user, bot in history:
        dialogue.append(f"<|user|>{user}")
        dialogue.append(f"<|assistant|>{bot}")
    dialogue.append(f"<|user|>{prompt}")
    dialogue.append("<|assistant|>")  # cue the model to answer as the assistant
    final_prompt = "\n".join(dialogue)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )

    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial
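# For reference, a one-turn history like [("Hi", "Hello!")] plus a new prompt
# "What's new?" yields a final_prompt of roughly this shape (the simplified chat
# tags used above, not the official TinyLlama template):
#
#   <|system|>You are a helpful, concise news assistant.
#   <|user|>Hi
#   <|assistant|>Hello!
#   <|user|>What's new?
#   <|assistant|>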
# ---------- Gradio interface ----------
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot – streams as it thinks")
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_btn = gr.Button("Send", variant="primary")

    def user_submit(message, chat_history):
        # append the new user turn with an empty assistant slot, clear the textbox
        chat_history = chat_history + [[message, ""]]
        return "", chat_history

    def bot_reply(chat_history):
        # stream tokens into the last (still empty) assistant slot
        user_msg = chat_history[-1][0]
        gen = stream_chat(user_msg, chat_history[:-1])
        for answer in gen:
            chat_history[-1][1] = answer
            yield chat_history

    txt.submit(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
    send_btn.click(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )

demo.queue()  # generator handlers stream through the queue on older Gradio releases
# ---------- FastAPI app + JSON API ----------
app = FastAPI()

# optional static assets (requires a ./static directory in the repo)
app.mount("/static", StaticFiles(directory="static"), name="static")

class GenRequest(BaseModel):
    prompt: str
    max_tokens: int | None = None

@app.post("/api/generate")
async def api_generate(req: GenRequest):
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")
    # run the blocking llama.cpp call off the event loop
    gen = await asyncio.to_thread(
        llm.create_completion,
        f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
        max_tokens=req.max_tokens or MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=False,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    return {"response": gen["choices"][0]["text"].strip()}

# mount the Gradio UI last so /api/* routes are matched before the catch-all "/"
app = gr.mount_gradio_app(app, demo, path="/")
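# Minimal Python client sketch (runs outside this Space; assumes the `requests`
# package and a placeholder Space URL):
#
#   import requests
#   r = requests.post(
#       "https://<your-space>.hf.space/api/generate",
#       json={"prompt": "Give me a one-line news summary."},
#       timeout=120,
#   )
#   print(r.json()["response"])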
if __name__ == "__main__":  # HF launches `python app.py`
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))