# Hugging Face Space – tiny streaming chatbot + JSON API
#   • Gradio UI with incremental token streaming
#   • POST /api/generate  → {"response": "..."}
#   • Easily swap the model path / prompt template later
#
# Sized for the Hugging Face free CPU tier (CPU-only inference, 16 GB RAM)
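#
# Example API call once the Space is up (assumes the default port 7860 set at the bottom):
#   curl -X POST http://localhost:7860/api/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Summarize the latest AI news", "max_tokens": 128}'
#   -> returns {"response": "..."}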
# ---------------------------------------------------------

import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

from fastapi.staticfiles import StaticFiles   # used by the /static mount in the API section below

MODEL_REPO   = "TheBloke/TinyLlama-1.1B-Chat-GGUF"
GGUF_FILE    = "tinyllama-1.1b-chat.q4_K_M.gguf"   # 4-bit quant, ~0.7 GB for a 1.1B model
N_CTX        = 4096                                # tokens of context
MAX_TOKENS   = 512                                 # generation limit
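# The prompt and the completion together must fit inside N_CTX; MAX_TOKENS only
# caps the generated part of each reply.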

# ---------- model load (one-time) ----------
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=GGUF_FILE)
llm = Llama(model_path=model_path,
            n_ctx=N_CTX,
            n_threads=int(os.getenv("NUM_CPU", "8")),  # feel free to tune
            n_gpu_layers=0,                            # CPU-only
            logits_all=False,
            use_mlock=True)
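# hf_hub_download caches the GGUF file in the local Hugging Face cache, so a
# restart reuses it; use_mlock asks llama.cpp to lock the weights in RAM so they
# are not swapped out between requests.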

SYSTEM_PROMPT = "You are a helpful, concise news assistant."

# ---------- streaming generation ----------
def stream_chat(prompt, history):
    # Llama.cpp wants the full conversation in a single string
    dialogue = [f"<|system|>{SYSTEM_PROMPT}"]
    for user, bot in history:
        dialogue.append(f"<|user|>{user}")
        dialogue.append(f"<|assistant|>{bot}")
    dialogue.append(f"<|user|>{prompt}")
    final_prompt = "\n".join(dialogue)

    stream = llm.create_completion(
        final_prompt,
        max_tokens=MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=True,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    partial = ""
    for chunk in stream:
        token = chunk["choices"][0]["text"]
        partial += token
        yield partial
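# For a one-turn history the assembled prompt looks roughly like:
#   <|system|>You are a helpful, concise news assistant.
#   <|user|>Hi
#   <|assistant|>Hello! How can I help?
#   <|user|><current message>
#   <|assistant|>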

# ---------- Gradio interface ----------
with gr.Blocks(title="Tiny Chatbot") as demo:
    gr.Markdown("### TinyLlama Chatbot – streams as it thinks")
    chatbot = gr.Chatbot()
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Paste or type…", lines=4)
    send_btn = gr.Button("Send", variant="primary")

    def user_submit(message, chat_history):
        # append the new user turn with an empty assistant slot and clear the textbox
        chat_history = chat_history + [[message, ""]]
        return "", chat_history

    def bot_reply(chat_history):
        # stream the model's answer into the last (still empty) assistant slot
        user_msg = chat_history[-1][0]
        gen = stream_chat(user_msg, chat_history[:-1])
        for answer in gen:
            chat_history[-1][1] = answer
            yield chat_history            # Gradio re-renders the Chatbot on every yield

    txt.submit(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
    send_btn.click(user_submit, [txt, chatbot], [txt, chatbot]).then(
        bot_reply, chatbot, chatbot
    )
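    # Both Enter in the textbox and the Send button append the turn via
    # user_submit, then stream the answer into the chat via bot_reply (.then()).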

# ---------- FastAPI app + JSON API ----------
app = FastAPI()
# optional static assets (the ./static directory must exist at startup)
app.mount("/static", StaticFiles(directory="static"), name="static")

class GenRequest(BaseModel):
    prompt: str
    max_tokens: int | None = None

@app.post("/api/generate")
def api_generate(req: GenRequest):
    # plain `def` (not `async def`): FastAPI runs it in a threadpool, so the
    # blocking llama.cpp call does not stall the event loop
    if not req.prompt:
        raise HTTPException(400, detail="prompt missing")
    gen = llm.create_completion(
        f"<|system|>{SYSTEM_PROMPT}\n<|user|>{req.prompt}\n<|assistant|>",
        max_tokens=req.max_tokens or MAX_TOKENS,
        temperature=0.7,
        top_p=0.9,
        stream=False,
        stop=["<|user|>", "<|assistant|>", "</s>"],
    )
    return {"response": gen["choices"][0]["text"].strip()}

# Mount the Gradio UI last: a mount on "/" registered earlier would shadow /api/generate.
demo.queue()                                    # request queue (needed for streaming on older Gradio versions)
app = gr.mount_gradio_app(app, demo, path="/")  # UI on root "/"

if __name__ == "__main__":       # HF launches `python app.py`
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", 7860)))
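
# A requirements.txt for this Space would roughly mirror the imports above
# (versions left unpinned here on purpose):
#   fastapi
#   uvicorn
#   pydantic
#   gradio
#   llama-cpp-python
#   huggingface_hub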