# Hugging Face Space: tiny streaming chatbot + JSON API
#   • Gradio UI with incremental token streaming
#   • POST /api/generate -> {"response": "..."}
#   • Easily swap the model path / prompt template later
#
# Tested on the HF free CPU tier (2 vCPU, 16 GB RAM)
# ---------------------------------------------------------
import os

import gradio as gr
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
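# A minimal requirements.txt for this Space (package names inferred from the
# imports above; pin versions as needed) would look roughly like:
#   gradio
#   fastapi
#   uvicorn
#   llama-cpp-python
#   huggingface_hub
#   pydantic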
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE = "tinyllama-1.1b-chat.q4_K_M.gguf"   # ~0.7 GB, 4-bit (Q4_K_M)
N_CTX = 4096          # tokens of context
MAX_TOKENS = 512      # generation limit
# ---------- model load (one-time) ----------
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
    n_gpu_layers=0,       # CPU-only
    logits_all=False,
    use_mlock=True,
)
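# Optional one-off smoke test (kept commented out so it never runs on Space
# startup); it uses the same llama-cpp-python completion API as the handlers below:
# print(llm.create_completion("<|system|>Hi<|user|>Say hello<|assistant|>",
#                             max_tokens=16)["choices"][0]["text"])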
SYSTEM_PROMPT = "You are a helpful, concise news assistant."
# ---------- streaming generation ----------
def stream_chat(prompt, history):
    # llama.cpp wants the full conversation in a single string
    dialogue = [f"<|system|>{SYSTEM_PROMPT}"]
    for user, bot in history:
        dialogue.append(f"<|user|>{user}")
        dialogue.append(f"<|assistant|>{bot}")
    dialogue.append(f"<|user|>{prompt}")
    dialogue.append("<|assistant|>")  # cue the model to answer; otherwise the
                                      # "<|assistant|>" stop string can end the
                                      # generation before the reply starts
    final_prompt = "\n".join(dialogue)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial
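# For illustration, with one prior exchange in history the rendered final_prompt
# looks like this (the messages are examples only):
#   <|system|>You are a helpful, concise news assistant.
#   <|user|>Hi
#   <|assistant|>Hello! How can I help?
#   <|user|>Summarise today's top story
#   <|assistant|>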
# ---------- Gradio interface ----------
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot - streams as it thinks")
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
        send_btn = gr.Button("Send", variant="primary")

    def user_submit(message, chat_history):
        chat_history = chat_history + [[message, ""]]
        return "", chat_history

    def bot_reply(chat_history):
        user_msg = chat_history[-1][0]
        gen = stream_chat(user_msg, chat_history[:-1])
        for answer in gen:
            chat_history[-1][1] = answer
            yield chat_history

    txt.submit(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
    send_btn.click(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
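# Note on the data shape: chat_history uses the Chatbot component's pair format,
# e.g. [["Hi", "Hello! How can I help?"], ["Summarise today's top story", ""]].
# user_submit appends a pair with an empty reply, and bot_reply fills that slot
# in token by token via the generator above.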
# ---------- JSON API ----------
app = FastAPI()
if os.path.isdir("static"):  # optional assets; skip the mount if the folder is absent
    app.mount("/static", StaticFiles(directory="static"), name="static")
class GenRequest(BaseModel):
    prompt: str
    max_tokens: int | None = None
@app.post("/api/generate")
async def api_generate(req: GenRequest):
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")
    gen = llm.create_completion(
        f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
        max_tokens=req.max_tokens or MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=False,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    return {"response": gen["choices"][0]["text"].strip()}
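# Example call once the Space is up (host and port shown are assumptions for a
# local run; on Hugging Face use the Space's public URL instead):
#   curl -X POST http://localhost:7860/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarise the news in one sentence", "max_tokens": 64}'
# -> {"response": "..."}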
# Mount the Gradio UI last so the /api/* route above keeps precedence over the
# catch-all mount; the UI stays available on root "/".
app = gr.mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":  # HF launches `python app.py`
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))