Jean Lima committed on
Commit
34fdab0
·
1 Parent(s): e83b189

Migrate to GGUF (Q4) for CPU/RAM optimization

Browse files
Files changed (2) hide show
  1. app.py +70 -80
  2. requirements.txt +1 -4
app.py CHANGED
@@ -7,38 +7,50 @@ from datetime import datetime
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse, HTMLResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
- from huggingface_hub import InferenceClient
11
- import torch
12
- from transformers import AutoModelForCausalLM, AutoTokenizer
13
 
14
  # ============ Configuração ============
15
 
16
  HF_TOKEN = os.environ.get("HF_TOKEN")
17
  API_KEY = os.environ.get("API_KEY", HF_TOKEN)
18
 
19
- # ============ Modelo Local - LFM2-8B-A1B (CPU) ============
20
 
21
- print("🔄 Carregando LFM2-8B-A1B localmente...")
22
- LOCAL_MODEL_NAME = "LiquidAI/LFM2-8B-A1B"
23
 
24
- # Carregar tokenizer e modelo para CPU
25
- chat_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_NAME, token=HF_TOKEN, trust_remote_code=True)
26
- chat_model = AutoModelForCausalLM.from_pretrained(
27
- LOCAL_MODEL_NAME,
28
- token=HF_TOKEN,
29
- trust_remote_code=True,
30
- dtype=torch.float16, # Economia de memória (Corrigido de torch_dtype)
31
- device_map="cpu",
32
- low_cpu_mem_usage=True
33
- )
34
- print("✅ LFM2-8B-A1B carregado com sucesso!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # ============ Clientes de Modelos (Inference API) ============
37
 
38
- # Visão - Análise de imagens (Inference API)
39
  vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")
40
 
41
- # Embeddings - Vetores semânticos (Inference API)
42
  embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")
43
 
44
  # Classificação Zero-Shot (Multilíngue - PT/EN/ES...)
@@ -50,39 +62,22 @@ summarize_client = InferenceClient(token=HF_TOKEN, model="csebuetnlp/mT5_multili
50
  # Análise de Sentimento (Multilíngue - PT/EN/ES...)
51
  sentiment_client = InferenceClient(token=HF_TOKEN, model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
52
 
53
-
54
  # ============ Função de Chat Local ============
55
 
56
  def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
57
- """Gera resposta usando o modelo local LFM2-8B-A1B"""
58
- # Formatar mensagens no formato ChatML
59
- formatted_prompt = ""
60
- for msg in messages:
61
- role = msg.get("role", "user")
62
- content = msg.get("content", "")
63
- if isinstance(content, list):
64
- # Extrair texto de conteúdo multimodal
65
- content = " ".join([item.get("text", "") for item in content if item.get("type") == "text"])
66
- formatted_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
67
- formatted_prompt += "<|im_start|>assistant\n"
68
-
69
- # Tokenizar
70
- inputs = chat_tokenizer(formatted_prompt, return_tensors="pt")
71
 
72
- # Gerar resposta
73
- with torch.no_grad():
74
- outputs = chat_model.generate(
75
- inputs.input_ids,
76
- max_new_tokens=max_tokens,
77
- temperature=temperature,
78
- do_sample=temperature > 0,
79
- pad_token_id=chat_tokenizer.eos_token_id,
80
- eos_token_id=chat_tokenizer.eos_token_id
81
- )
82
-
83
- # Decodificar resposta
84
- response = chat_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
85
- return response.strip()
86
 
87
  # ============ Cache ============
88
 
@@ -127,7 +122,7 @@ def has_image_content(messages):
127
  app = FastAPI(
128
  title="DGGirl Multi-Modal API",
129
  description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
130
- version="4.0.0"
131
  )
132
 
133
  app.add_middleware(
@@ -142,7 +137,7 @@ app.add_middleware(
142
  @app.get("/", response_class=HTMLResponse)
143
  async def home():
144
  endpoints_html = """
145
- <div class="endpoint"><span class="method">POST</span> <code>/v1/chat/completions</code><p>💬 Chat inteligente (LFM2-8B) + Visão (Gemma 3)</p></div>
146
  <div class="endpoint"><span class="method">POST</span> <code>/v1/embeddings</code><p>🔢 Vetores semânticos para RAG (BGE-M3)</p></div>
147
  <div class="endpoint"><span class="method">POST</span> <code>/v1/classify</code><p>🏷️ Classificação zero-shot de textos</p></div>
148
  <div class="endpoint"><span class="method">POST</span> <code>/v1/summarize</code><p>📝 Resumir textos longos</p></div>
@@ -152,7 +147,7 @@ async def home():
152
  <!DOCTYPE html>
153
  <html>
154
  <head>
155
- <title>DGGirl API v4</title>
156
  <style>
157
  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 40px auto; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); min-height: 100vh; }}
158
  .container {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 20px; box-shadow: 0 10px 40px rgba(0,0,0,0.3); }}
@@ -174,14 +169,14 @@ async def home():
174
  </head>
175
  <body>
176
  <div class="container">
177
- <h1>🤖 DGGirl API v4 - Multi-Modal</h1>
178
  <p>Status: <span class="status">● OPERACIONAL</span></p>
179
 
180
  {endpoints_html}
181
 
182
  <div class="models">
183
  <h3>🧠 Modelos Ativos</h3>
184
- <span class="model-tag">LiquidAI/LFM2-8B-A1B</span>
185
  <span class="model-tag">Gemma 3 27B Vision</span>
186
  <span class="model-tag">BGE-M3 Embeddings</span>
187
  <span class="model-tag">XLM-RoBERTa Classification</span>
@@ -198,10 +193,6 @@ async def home():
198
  <div class="stat-value">6</div>
199
  <div>Endpoints</div>
200
  </div>
201
- <div class="stat">
202
- <div class="stat-value">6</div>
203
- <div>Modelos</div>
204
- </div>
205
  </div>
206
 
207
  <p style="margin-top: 25px; text-align: center;">
@@ -227,8 +218,7 @@ async def chat_completions(request: Request):
227
 
228
  # Detectar se precisa de visão
229
  has_vision = model == "vision" or has_image_content(raw_messages)
230
- model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B"
231
- client = vision_client if has_vision else chat_client
232
 
233
  # Cache (apenas para texto)
234
  cache_key = get_cache_key(raw_messages, model_used)
@@ -237,7 +227,7 @@ async def chat_completions(request: Request):
237
  if cached:
238
  return cached
239
 
240
- # Processar mensagens de visão
241
  if has_vision:
242
  last_user_msg = next((msg for msg in reversed(raw_messages) if msg.get("role") == "user"), None)
243
  if not last_user_msg:
@@ -262,12 +252,7 @@ async def chat_completions(request: Request):
262
  messages = [{"role": "user", "content": vision_content}]
263
  else:
264
  messages = raw_messages
265
- else:
266
- messages = raw_messages
267
 
268
- # Gerar resposta
269
- if has_vision:
270
- # Usar Inference API para visão
271
  response = vision_client.chat_completion(
272
  messages=messages,
273
  max_tokens=body.get("max_tokens", 1024),
@@ -275,12 +260,15 @@ async def chat_completions(request: Request):
275
  )
276
  response_content = response.choices[0].message.content
277
  else:
278
- # Usar modelo local para texto
279
- response_content = generate_local_chat(
280
- messages=messages,
281
- max_tokens=body.get("max_tokens", 1024),
282
- temperature=body.get("temperature", 0.7)
283
- )
 
 
 
284
 
285
  result = {
286
  "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
@@ -374,7 +362,7 @@ async def classify_text(request: Request):
374
  "labels": result.labels if hasattr(result, 'labels') else labels,
375
  "scores": result.scores if hasattr(result, 'scores') else [],
376
  "predicted_label": result.labels[0] if hasattr(result, 'labels') and result.labels else None,
377
- "model": "bart-large-mnli"
378
  }
379
 
380
  set_cached_response(cache_key, response)
@@ -418,7 +406,7 @@ async def summarize_text(request: Request):
418
  "summary": summary,
419
  "summary_length": len(summary),
420
  "compression_ratio": round(len(summary) / len(text) * 100, 2),
421
- "model": "bart-large-cnn"
422
  }
423
 
424
  set_cached_response(cache_key, response)
@@ -449,14 +437,16 @@ async def analyze_sentiment(request: Request):
449
 
450
  result = sentiment_client.text_classification(text)
451
 
452
- # Mapear labels para português
453
  label_map = {
454
  "positive": "positivo",
455
  "negative": "negativo",
456
  "neutral": "neutro",
457
  "POSITIVE": "positivo",
458
  "NEGATIVE": "negativo",
459
- "NEUTRAL": "neutro"
 
 
460
  }
461
 
462
  if isinstance(result, list) and len(result) > 0:
@@ -474,7 +464,7 @@ async def analyze_sentiment(request: Request):
474
  "sentiment_raw": label,
475
  "confidence": round(score, 4),
476
  "all_scores": [{"label": r.label, "score": round(r.score, 4)} for r in result] if isinstance(result, list) else [],
477
- "model": "roberta-sentiment"
478
  }
479
 
480
  set_cached_response(cache_key, response)
@@ -490,11 +480,11 @@ async def list_models():
490
  return {
491
  "object": "list",
492
  "data": [
493
- {"id": "lfm2-8b", "object": "model", "owned_by": "liquidai", "description": "Chat rápido e versátil"},
494
  {"id": "gemma-3-vision", "object": "model", "owned_by": "google", "description": "Análise de imagens"},
495
  {"id": "bge-m3", "object": "model", "owned_by": "baai", "description": "Embeddings multilíngue"},
496
  {"id": "xlm-roberta-classify", "object": "model", "owned_by": "joeddav", "description": "Classificação zero-shot multilíngue"},
497
- {"id": "mt5-summarize", "object": "model", "owned_by": "csebuetnlp", "description": "Sumarização multilíngue (45 idiomas)"},
498
  {"id": "distilbert-sentiment", "object": "model", "owned_by": "lxyuan", "description": "Análise de sentimento multilíngue"}
499
  ]
500
  }
@@ -505,9 +495,9 @@ async def health():
505
  "status": "healthy",
506
  "timestamp": datetime.now().isoformat(),
507
  "cache_size": len(response_cache),
508
- "version": "4.0.0",
509
  "models": {
510
- "chat": "LiquidAI/LFM2-8B-A1B",
511
  "vision": "google/gemma-3-27b-it",
512
  "embeddings": "BAAI/bge-m3",
513
  "classify": "joeddav/xlm-roberta-large-xnli",
 
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse, HTMLResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
+ from huggingface_hub import InferenceClient, hf_hub_download
11
+ from llama_cpp import Llama
 
12
 
13
  # ============ Configuração ============
14
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
  API_KEY = os.environ.get("API_KEY", HF_TOKEN)
17
 
18
+ # ============ Modelo Local - LFM2-8B-A1B (GGUF - CPU Otimizado) ============
19
 
20
+ print("🔄 Baixando e carregando LFM2-8B-A1B (GGUF)...")
 
21
 
22
+ # Baixar modelo GGUF (Q4_K_M para equilíbrio entre qualidade e memória ~5.5GB)
23
+ REPO_ID = "bartowski/LiquidAI_LFM2-8B-A1B-GGUF"
24
+ FILENAME = "LiquidAI_LFM2-8B-A1B-Q4_K_M.gguf"
25
+
26
+ try:
27
+ model_path = hf_hub_download(
28
+ repo_id=REPO_ID,
29
+ filename=FILENAME,
30
+ token=HF_TOKEN
31
+ )
32
+ print(f"✅ Modelo baixado em: {model_path}")
33
+
34
+ # Carregar modelo com llama.cpp
35
+ chat_model = Llama(
36
+ model_path=model_path,
37
+ n_ctx=4096, # Contexto
38
+ n_threads=8, # Threads da CPU
39
+ n_batch=512,
40
+ verbose=False
41
+ )
42
+ print("✅ LFM2-8B-A1B carregado com sucesso na memória!")
43
+
44
+ except Exception as e:
45
+ print(f"❌ Erro ao carregar modelo: {e}")
46
+ chat_model = None
47
 
48
  # ============ Clientes de Modelos (Inference API) ============
49
 
50
+ # Visão - Análise de imagens
51
  vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")
52
 
53
+ # Embeddings - Vetores semânticos
54
  embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")
55
 
56
  # Classificação Zero-Shot (Multilíngue - PT/EN/ES...)
 
62
  # Análise de Sentimento (Multilíngue - PT/EN/ES...)
63
  sentiment_client = InferenceClient(token=HF_TOKEN, model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
64
 
 
65
  # ============ Função de Chat Local ============
66
 
67
  def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
68
+ """Gera resposta usando o modelo local LFM2-8B-A1B (GGUF)"""
69
+ if not chat_model:
70
+ return "Erro: Modelo não carregado."
71
+
72
+ # Usar chat_completion nativo do llama-cpp-python ( lida com templates)
73
+ output = chat_model.create_chat_completion(
74
+ messages=messages,
75
+ max_tokens=max_tokens,
76
+ temperature=temperature,
77
+ stop=["<|im_end|>", "<|endoftext|>"]
78
+ )
 
 
 
79
 
80
+ return output['choices'][0]['message']['content']
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  # ============ Cache ============
83
 
 
122
  app = FastAPI(
123
  title="DGGirl Multi-Modal API",
124
  description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
125
+ version="4.1.0"
126
  )
127
 
128
  app.add_middleware(
 
137
  @app.get("/", response_class=HTMLResponse)
138
  async def home():
139
  endpoints_html = """
140
+ <div class="endpoint"><span class="method">POST</span> <code>/v1/chat/completions</code><p>💬 Chat inteligente (LFM2-8B GGUF) + Visão (Gemma 3)</p></div>
141
  <div class="endpoint"><span class="method">POST</span> <code>/v1/embeddings</code><p>🔢 Vetores semânticos para RAG (BGE-M3)</p></div>
142
  <div class="endpoint"><span class="method">POST</span> <code>/v1/classify</code><p>🏷️ Classificação zero-shot de textos</p></div>
143
  <div class="endpoint"><span class="method">POST</span> <code>/v1/summarize</code><p>📝 Resumir textos longos</p></div>
 
147
  <!DOCTYPE html>
148
  <html>
149
  <head>
150
+ <title>DGGirl API v4.1</title>
151
  <style>
152
  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 40px auto; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); min-height: 100vh; }}
153
  .container {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 20px; box-shadow: 0 10px 40px rgba(0,0,0,0.3); }}
 
169
  </head>
170
  <body>
171
  <div class="container">
172
+ <h1>🤖 DGGirl API v4.1 - CPU Optimized</h1>
173
  <p>Status: <span class="status">● OPERACIONAL</span></p>
174
 
175
  {endpoints_html}
176
 
177
  <div class="models">
178
  <h3>🧠 Modelos Ativos</h3>
179
+ <span class="model-tag">LFM2-8B-A1B (GGUF Q4)</span>
180
  <span class="model-tag">Gemma 3 27B Vision</span>
181
  <span class="model-tag">BGE-M3 Embeddings</span>
182
  <span class="model-tag">XLM-RoBERTa Classification</span>
 
193
  <div class="stat-value">6</div>
194
  <div>Endpoints</div>
195
  </div>
 
 
 
 
196
  </div>
197
 
198
  <p style="margin-top: 25px; text-align: center;">
 
218
 
219
  # Detectar se precisa de visão
220
  has_vision = model == "vision" or has_image_content(raw_messages)
221
+ model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B-GGUF"
 
222
 
223
  # Cache (apenas para texto)
224
  cache_key = get_cache_key(raw_messages, model_used)
 
227
  if cached:
228
  return cached
229
 
230
+ # Gerar resposta
231
  if has_vision:
232
  last_user_msg = next((msg for msg in reversed(raw_messages) if msg.get("role") == "user"), None)
233
  if not last_user_msg:
 
252
  messages = [{"role": "user", "content": vision_content}]
253
  else:
254
  messages = raw_messages
 
 
255
 
 
 
 
256
  response = vision_client.chat_completion(
257
  messages=messages,
258
  max_tokens=body.get("max_tokens", 1024),
 
260
  )
261
  response_content = response.choices[0].message.content
262
  else:
263
+ # Usar modelo local (GGUF) para texto
264
+ try:
265
+ response_content = generate_local_chat(
266
+ messages=raw_messages,
267
+ max_tokens=body.get("max_tokens", 1024),
268
+ temperature=body.get("temperature", 0.7)
269
+ )
270
+ except Exception as e:
271
+ response_content = f"Error generating response: {str(e)}"
272
 
273
  result = {
274
  "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
 
362
  "labels": result.labels if hasattr(result, 'labels') else labels,
363
  "scores": result.scores if hasattr(result, 'scores') else [],
364
  "predicted_label": result.labels[0] if hasattr(result, 'labels') and result.labels else None,
365
+ "model": "xlm-roberta-large-xnli"
366
  }
367
 
368
  set_cached_response(cache_key, response)
 
406
  "summary": summary,
407
  "summary_length": len(summary),
408
  "compression_ratio": round(len(summary) / len(text) * 100, 2),
409
+ "model": "mt5-multilingual"
410
  }
411
 
412
  set_cached_response(cache_key, response)
 
437
 
438
  result = sentiment_client.text_classification(text)
439
 
440
+ # Mapear labels
441
  label_map = {
442
  "positive": "positivo",
443
  "negative": "negativo",
444
  "neutral": "neutro",
445
  "POSITIVE": "positivo",
446
  "NEGATIVE": "negativo",
447
+ "NEUTRAL": "neutro",
448
+ "1 star": "negativo",
449
+ "5 stars": "positivo"
450
  }
451
 
452
  if isinstance(result, list) and len(result) > 0:
 
464
  "sentiment_raw": label,
465
  "confidence": round(score, 4),
466
  "all_scores": [{"label": r.label, "score": round(r.score, 4)} for r in result] if isinstance(result, list) else [],
467
+ "model": "distilbert-base-multilingual"
468
  }
469
 
470
  set_cached_response(cache_key, response)
 
480
  return {
481
  "object": "list",
482
  "data": [
483
+ {"id": "lfm2-8b-gguf", "object": "model", "owned_by": "liquidai", "description": "Chat rápido (GGUF Q4)"},
484
  {"id": "gemma-3-vision", "object": "model", "owned_by": "google", "description": "Análise de imagens"},
485
  {"id": "bge-m3", "object": "model", "owned_by": "baai", "description": "Embeddings multilíngue"},
486
  {"id": "xlm-roberta-classify", "object": "model", "owned_by": "joeddav", "description": "Classificação zero-shot multilíngue"},
487
+ {"id": "mt5-summarize", "object": "model", "owned_by": "csebuetnlp", "description": "Sumarização multilíngue"},
488
  {"id": "distilbert-sentiment", "object": "model", "owned_by": "lxyuan", "description": "Análise de sentimento multilíngue"}
489
  ]
490
  }
 
495
  "status": "healthy",
496
  "timestamp": datetime.now().isoformat(),
497
  "cache_size": len(response_cache),
498
+ "version": "4.1.0",
499
  "models": {
500
+ "chat": "LiquidAI/LFM2-8B-A1B-GGUF (Q4)",
501
  "vision": "google/gemma-3-27b-it",
502
  "embeddings": "BAAI/bge-m3",
503
  "classify": "joeddav/xlm-roberta-large-xnli",
requirements.txt CHANGED
@@ -2,7 +2,4 @@ fastapi==0.109.0
2
  uvicorn[standard]==0.27.0
3
  huggingface-hub>=0.25.0
4
  python-multipart==0.0.6
5
- torch>=2.0.0
6
- git+https://github.com/huggingface/transformers.git
7
- accelerate>=0.27.0
8
- sentencepiece
 
2
  uvicorn[standard]==0.27.0
3
  huggingface-hub>=0.25.0
4
  python-multipart==0.0.6
5
+ llama-cpp-python>=0.2.70