import os

from fastapi import FastAPI, HTTPException, Request
from huggingface_hub import InferenceClient
| |
|
# FastAPI application instance; the routes below are registered on it.
app = FastAPI()

# Hugging Face API token read from the environment.
# NOTE(review): may be None if HF_TOKEN is unset — InferenceClient would then
# issue unauthenticated requests; confirm the deployment always sets it.
HF_TOKEN = os.getenv("HF_TOKEN")

# Shared inference client bound to a fixed chat model on the HF Inference API.
client = InferenceClient(token=HF_TOKEN, model="meta-llama/Llama-3.2-3B-Instruct")
|
@app.get("/")
def root():
    """Health/landing endpoint.

    Returns a short status message naming the model this service actually
    proxies. The previous text claimed "Gemma 3 API on CPU", but the client
    is configured with meta-llama/Llama-3.2-3B-Instruct and inference runs
    remotely on the HF Inference API, so the message was stale on both counts.
    """
    return {"message": "Llama 3.2 3B Instruct API"}
| |
|
| |
|
@app.post("/generate")
async def generate(request: Request):
    """Generate a chat completion for the JSON body's "prompt" field.

    Expects a JSON object like {"prompt": "..."}; sends the prompt as a
    single user message to the configured model and returns
    {"response": <assistant text>}.

    Raises:
        HTTPException(400): if "prompt" is missing, not a string, or blank —
            previously an empty prompt was silently forwarded to the model.
    """
    body = await request.json()
    prompt = body.get("prompt", "")
    # Guard against missing/blank/non-string prompts instead of burning an
    # inference call on an empty user message.
    if not isinstance(prompt, str) or not prompt.strip():
        raise HTTPException(status_code=400, detail="'prompt' must be a non-empty string")

    messages = [{"role": "user", "content": prompt}]
    # Debug print of the raw completion removed; it leaked full model output
    # to stdout on every request.
    completion = client.chat_completion(messages)
    return {"response": completion.choices[0].message.content}