Jean Lima committed on
Commit
34fdab0
·
1 Parent(s): e83b189

Migrate to GGUF (Q4) for CPU/RAM optimization

Browse files
Files changed (2) hide show
  1. app.py +70 -80
  2. requirements.txt +1 -4
app.py CHANGED
@@ -7,38 +7,50 @@ from datetime import datetime
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse, HTMLResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
- from huggingface_hub import InferenceClient
11
- import torch
12
- from transformers import AutoModelForCausalLM, AutoTokenizer
13
 
14
  # ============ Configuração ============
15
 
16
  HF_TOKEN = os.environ.get("HF_TOKEN")
17
  API_KEY = os.environ.get("API_KEY", HF_TOKEN)
18
 
19
- # ============ Modelo Local - LFM2-8B-A1B (CPU) ============
20
 
21
- print("🔄 Carregando LFM2-8B-A1B localmente...")
22
- LOCAL_MODEL_NAME = "LiquidAI/LFM2-8B-A1B"
23
 
24
- # Carregar tokenizer e modelo para CPU
25
- chat_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_NAME, token=HF_TOKEN, trust_remote_code=True)
26
- chat_model = AutoModelForCausalLM.from_pretrained(
27
- LOCAL_MODEL_NAME,
28
- token=HF_TOKEN,
29
- trust_remote_code=True,
30
- dtype=torch.float16, # Economia de memória (Corrigido de torch_dtype)
31
- device_map="cpu",
32
- low_cpu_mem_usage=True
33
- )
34
- print("✅ LFM2-8B-A1B carregado com sucesso!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # ============ Clientes de Modelos (Inference API) ============
37
 
38
- # Visão - Análise de imagens (Inference API)
39
  vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")
40
 
41
- # Embeddings - Vetores semânticos (Inference API)
42
  embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")
43
 
44
  # Classificação Zero-Shot (Multilíngue - PT/EN/ES...)
@@ -50,39 +62,22 @@ summarize_client = InferenceClient(token=HF_TOKEN, model="csebuetnlp/mT5_multili
50
  # Análise de Sentimento (Multilíngue - PT/EN/ES...)
51
  sentiment_client = InferenceClient(token=HF_TOKEN, model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
52
 
53
-
54
  # ============ Função de Chat Local ============
55
 
56
  def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
57
- """Gera resposta usando o modelo local LFM2-8B-A1B"""
58
- # Formatar mensagens no formato ChatML
59
- formatted_prompt = ""
60
- for msg in messages:
61
- role = msg.get("role", "user")
62
- content = msg.get("content", "")
63
- if isinstance(content, list):
64
- # Extrair texto de conteúdo multimodal
65
- content = " ".join([item.get("text", "") for item in content if item.get("type") == "text"])
66
- formatted_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
67
- formatted_prompt += "<|im_start|>assistant\n"
68
-
69
- # Tokenizar
70
- inputs = chat_tokenizer(formatted_prompt, return_tensors="pt")
71
 
72
- # Gerar resposta
73
- with torch.no_grad():
74
- outputs = chat_model.generate(
75
- inputs.input_ids,
76
- max_new_tokens=max_tokens,
77
- temperature=temperature,
78
- do_sample=temperature > 0,
79
- pad_token_id=chat_tokenizer.eos_token_id,
80
- eos_token_id=chat_tokenizer.eos_token_id
81
- )
82
-
83
- # Decodificar resposta
84
- response = chat_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
85
- return response.strip()
86
 
87
  # ============ Cache ============
88
 
@@ -127,7 +122,7 @@ def has_image_content(messages):
127
  app = FastAPI(
128
  title="DGGirl Multi-Modal API",
129
  description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
130
- version="4.0.0"
131
  )
132
 
133
  app.add_middleware(
@@ -142,7 +137,7 @@ app.add_middleware(
142
  @app.get("/", response_class=HTMLResponse)
143
  async def home():
144
  endpoints_html = """
145
- <div class="endpoint"><span class="method">POST</span> <code>/v1/chat/completions</code><p>💬 Chat inteligente (LFM2-8B) + Visão (Gemma 3)</p></div>
146
  <div class="endpoint"><span class="method">POST</span> <code>/v1/embeddings</code><p>🔢 Vetores semânticos para RAG (BGE-M3)</p></div>
147
  <div class="endpoint"><span class="method">POST</span> <code>/v1/classify</code><p>🏷️ Classificação zero-shot de textos</p></div>
148
  <div class="endpoint"><span class="method">POST</span> <code>/v1/summarize</code><p>📝 Resumir textos longos</p></div>
@@ -152,7 +147,7 @@ async def home():
152
  <!DOCTYPE html>
153
  <html>
154
  <head>
155
- <title>DGGirl API v4</title>
156
  <style>
157
  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 40px auto; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); min-height: 100vh; }}
158
  .container {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 20px; box-shadow: 0 10px 40px rgba(0,0,0,0.3); }}
@@ -174,14 +169,14 @@ async def home():
174
  </head>
175
  <body>
176
  <div class="container">
177
- <h1>🤖 DGGirl API v4 - Multi-Modal</h1>
178
  <p>Status: <span class="status">● OPERACIONAL</span></p>
179
 
180
  {endpoints_html}
181
 
182
  <div class="models">
183
  <h3>🧠 Modelos Ativos</h3>
184
- <span class="model-tag">LiquidAI/LFM2-8B-A1B</span>
185
  <span class="model-tag">Gemma 3 27B Vision</span>
186
  <span class="model-tag">BGE-M3 Embeddings</span>
187
  <span class="model-tag">XLM-RoBERTa Classification</span>
@@ -198,10 +193,6 @@ async def home():
198
  <div class="stat-value">6</div>
199
  <div>Endpoints</div>
200
  </div>
201
- <div class="stat">
202
- <div class="stat-value">6</div>
203
- <div>Modelos</div>
204
- </div>
205
  </div>
206
 
207
  <p style="margin-top: 25px; text-align: center;">
@@ -227,8 +218,7 @@ async def chat_completions(request: Request):
227
 
228
  # Detectar se precisa de visão
229
  has_vision = model == "vision" or has_image_content(raw_messages)
230
- model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B"
231
- client = vision_client if has_vision else chat_client
232
 
233
  # Cache (apenas para texto)
234
  cache_key = get_cache_key(raw_messages, model_used)
@@ -237,7 +227,7 @@ async def chat_completions(request: Request):
237
  if cached:
238
  return cached
239
 
240
- # Processar mensagens de visão
241
  if has_vision:
242
  last_user_msg = next((msg for msg in reversed(raw_messages) if msg.get("role") == "user"), None)
243
  if not last_user_msg:
@@ -262,12 +252,7 @@ async def chat_completions(request: Request):
262
  messages = [{"role": "user", "content": vision_content}]
263
  else:
264
  messages = raw_messages
265
- else:
266
- messages = raw_messages
267
 
268
- # Gerar resposta
269
- if has_vision:
270
- # Usar Inference API para visão
271
  response = vision_client.chat_completion(
272
  messages=messages,
273
  max_tokens=body.get("max_tokens", 1024),
@@ -275,12 +260,15 @@ async def chat_completions(request: Request):
275
  )
276
  response_content = response.choices[0].message.content
277
  else:
278
- # Usar modelo local para texto
279
- response_content = generate_local_chat(
280
- messages=messages,
281
- max_tokens=body.get("max_tokens", 1024),
282
- temperature=body.get("temperature", 0.7)
283
- )
 
 
 
284
 
285
  result = {
286
  "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
@@ -374,7 +362,7 @@ async def classify_text(request: Request):
374
  "labels": result.labels if hasattr(result, 'labels') else labels,
375
  "scores": result.scores if hasattr(result, 'scores') else [],
376
  "predicted_label": result.labels[0] if hasattr(result, 'labels') and result.labels else None,
377
- "model": "bart-large-mnli"
378
  }
379
 
380
  set_cached_response(cache_key, response)
@@ -418,7 +406,7 @@ async def summarize_text(request: Request):
418
  "summary": summary,
419
  "summary_length": len(summary),
420
  "compression_ratio": round(len(summary) / len(text) * 100, 2),
421
- "model": "bart-large-cnn"
422
  }
423
 
424
  set_cached_response(cache_key, response)
@@ -449,14 +437,16 @@ async def analyze_sentiment(request: Request):
449
 
450
  result = sentiment_client.text_classification(text)
451
 
452
- # Mapear labels para português
453
  label_map = {
454
  "positive": "positivo",
455
  "negative": "negativo",
456
  "neutral": "neutro",
457
  "POSITIVE": "positivo",
458
  "NEGATIVE": "negativo",
459
- "NEUTRAL": "neutro"
 
 
460
  }
461
 
462
  if isinstance(result, list) and len(result) > 0:
@@ -474,7 +464,7 @@ async def analyze_sentiment(request: Request):
474
  "sentiment_raw": label,
475
  "confidence": round(score, 4),
476
  "all_scores": [{"label": r.label, "score": round(r.score, 4)} for r in result] if isinstance(result, list) else [],
477
- "model": "roberta-sentiment"
478
  }
479
 
480
  set_cached_response(cache_key, response)
@@ -490,11 +480,11 @@ async def list_models():
490
  return {
491
  "object": "list",
492
  "data": [
493
- {"id": "lfm2-8b", "object": "model", "owned_by": "liquidai", "description": "Chat rápido e versátil"},
494
  {"id": "gemma-3-vision", "object": "model", "owned_by": "google", "description": "Análise de imagens"},
495
  {"id": "bge-m3", "object": "model", "owned_by": "baai", "description": "Embeddings multilíngue"},
496
  {"id": "xlm-roberta-classify", "object": "model", "owned_by": "joeddav", "description": "Classificação zero-shot multilíngue"},
497
- {"id": "mt5-summarize", "object": "model", "owned_by": "csebuetnlp", "description": "Sumarização multilíngue (45 idiomas)"},
498
  {"id": "distilbert-sentiment", "object": "model", "owned_by": "lxyuan", "description": "Análise de sentimento multilíngue"}
499
  ]
500
  }
@@ -505,9 +495,9 @@ async def health():
505
  "status": "healthy",
506
  "timestamp": datetime.now().isoformat(),
507
  "cache_size": len(response_cache),
508
- "version": "4.0.0",
509
  "models": {
510
- "chat": "LiquidAI/LFM2-8B-A1B",
511
  "vision": "google/gemma-3-27b-it",
512
  "embeddings": "BAAI/bge-m3",
513
  "classify": "joeddav/xlm-roberta-large-xnli",
 
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse, HTMLResponse
9
  from fastapi.middleware.cors import CORSMiddleware
10
+ from huggingface_hub import InferenceClient, hf_hub_download
11
+ from llama_cpp import Llama
 
12
 
13
  # ============ Configuração ============
14
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
  API_KEY = os.environ.get("API_KEY", HF_TOKEN)
17
 
18
+ # ============ Modelo Local - LFM2-8B-A1B (GGUF - CPU Otimizado) ============
19
 
20
+ print("🔄 Baixando e carregando LFM2-8B-A1B (GGUF)...")
 
21
 
22
+ # Baixar modelo GGUF (Q4_K_M para equilíbrio entre qualidade e memória ~5.5GB)
23
+ REPO_ID = "bartowski/LiquidAI_LFM2-8B-A1B-GGUF"
24
+ FILENAME = "LiquidAI_LFM2-8B-A1B-Q4_K_M.gguf"
25
+
26
+ try:
27
+ model_path = hf_hub_download(
28
+ repo_id=REPO_ID,
29
+ filename=FILENAME,
30
+ token=HF_TOKEN
31
+ )
32
+ print(f"✅ Modelo baixado em: {model_path}")
33
+
34
+ # Carregar modelo com llama.cpp
35
+ chat_model = Llama(
36
+ model_path=model_path,
37
+ n_ctx=4096, # Contexto
38
+ n_threads=8, # Threads da CPU
39
+ n_batch=512,
40
+ verbose=False
41
+ )
42
+ print("✅ LFM2-8B-A1B carregado com sucesso na memória!")
43
+
44
+ except Exception as e:
45
+ print(f"❌ Erro ao carregar modelo: {e}")
46
+ chat_model = None
47
 
48
  # ============ Clientes de Modelos (Inference API) ============
49
 
50
+ # Visão - Análise de imagens
51
  vision_client = InferenceClient(token=HF_TOKEN, model="google/gemma-3-27b-it")
52
 
53
+ # Embeddings - Vetores semânticos
54
  embed_client = InferenceClient(token=HF_TOKEN, model="BAAI/bge-m3")
55
 
56
  # Classificação Zero-Shot (Multilíngue - PT/EN/ES...)
 
62
  # Análise de Sentimento (Multilíngue - PT/EN/ES...)
63
  sentiment_client = InferenceClient(token=HF_TOKEN, model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
64
 
 
65
  # ============ Função de Chat Local ============
66
 
67
  def generate_local_chat(messages, max_tokens=1024, temperature=0.7):
68
+ """Gera resposta usando o modelo local LFM2-8B-A1B (GGUF)"""
69
+ if not chat_model:
70
+ return "Erro: Modelo não carregado."
71
+
72
+ # Usar chat_completion nativo do llama-cpp-python ( lida com templates)
73
+ output = chat_model.create_chat_completion(
74
+ messages=messages,
75
+ max_tokens=max_tokens,
76
+ temperature=temperature,
77
+ stop=["<|im_end|>", "<|endoftext|>"]
78
+ )
 
 
 
79
 
80
+ return output['choices'][0]['message']['content']
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  # ============ Cache ============
83
 
 
122
  app = FastAPI(
123
  title="DGGirl Multi-Modal API",
124
  description="API compatível com OpenAI para chat, visão, embeddings, classificação, sumarização e sentimento",
125
+ version="4.1.0"
126
  )
127
 
128
  app.add_middleware(
 
137
  @app.get("/", response_class=HTMLResponse)
138
  async def home():
139
  endpoints_html = """
140
+ <div class="endpoint"><span class="method">POST</span> <code>/v1/chat/completions</code><p>💬 Chat inteligente (LFM2-8B GGUF) + Visão (Gemma 3)</p></div>
141
  <div class="endpoint"><span class="method">POST</span> <code>/v1/embeddings</code><p>🔢 Vetores semânticos para RAG (BGE-M3)</p></div>
142
  <div class="endpoint"><span class="method">POST</span> <code>/v1/classify</code><p>🏷️ Classificação zero-shot de textos</p></div>
143
  <div class="endpoint"><span class="method">POST</span> <code>/v1/summarize</code><p>📝 Resumir textos longos</p></div>
 
147
  <!DOCTYPE html>
148
  <html>
149
  <head>
150
+ <title>DGGirl API v4.1</title>
151
  <style>
152
  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 40px auto; padding: 20px; background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); min-height: 100vh; }}
153
  .container {{ background: rgba(255,255,255,0.95); padding: 40px; border-radius: 20px; box-shadow: 0 10px 40px rgba(0,0,0,0.3); }}
 
169
  </head>
170
  <body>
171
  <div class="container">
172
+ <h1>🤖 DGGirl API v4.1 - CPU Optimized</h1>
173
  <p>Status: <span class="status">● OPERACIONAL</span></p>
174
 
175
  {endpoints_html}
176
 
177
  <div class="models">
178
  <h3>🧠 Modelos Ativos</h3>
179
+ <span class="model-tag">LFM2-8B-A1B (GGUF Q4)</span>
180
  <span class="model-tag">Gemma 3 27B Vision</span>
181
  <span class="model-tag">BGE-M3 Embeddings</span>
182
  <span class="model-tag">XLM-RoBERTa Classification</span>
 
193
  <div class="stat-value">6</div>
194
  <div>Endpoints</div>
195
  </div>
 
 
 
 
196
  </div>
197
 
198
  <p style="margin-top: 25px; text-align: center;">
 
218
 
219
  # Detectar se precisa de visão
220
  has_vision = model == "vision" or has_image_content(raw_messages)
221
+ model_used = "google/gemma-3-27b-it" if has_vision else "LiquidAI/LFM2-8B-A1B-GGUF"
 
222
 
223
  # Cache (apenas para texto)
224
  cache_key = get_cache_key(raw_messages, model_used)
 
227
  if cached:
228
  return cached
229
 
230
+ # Gerar resposta
231
  if has_vision:
232
  last_user_msg = next((msg for msg in reversed(raw_messages) if msg.get("role") == "user"), None)
233
  if not last_user_msg:
 
252
  messages = [{"role": "user", "content": vision_content}]
253
  else:
254
  messages = raw_messages
 
 
255
 
 
 
 
256
  response = vision_client.chat_completion(
257
  messages=messages,
258
  max_tokens=body.get("max_tokens", 1024),
 
260
  )
261
  response_content = response.choices[0].message.content
262
  else:
263
+ # Usar modelo local (GGUF) para texto
264
+ try:
265
+ response_content = generate_local_chat(
266
+ messages=raw_messages,
267
+ max_tokens=body.get("max_tokens", 1024),
268
+ temperature=body.get("temperature", 0.7)
269
+ )
270
+ except Exception as e:
271
+ response_content = f"Error generating response: {str(e)}"
272
 
273
  result = {
274
  "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
 
362
  "labels": result.labels if hasattr(result, 'labels') else labels,
363
  "scores": result.scores if hasattr(result, 'scores') else [],
364
  "predicted_label": result.labels[0] if hasattr(result, 'labels') and result.labels else None,
365
+ "model": "xlm-roberta-large-xnli"
366
  }
367
 
368
  set_cached_response(cache_key, response)
 
406
  "summary": summary,
407
  "summary_length": len(summary),
408
  "compression_ratio": round(len(summary) / len(text) * 100, 2),
409
+ "model": "mt5-multilingual"
410
  }
411
 
412
  set_cached_response(cache_key, response)
 
437
 
438
  result = sentiment_client.text_classification(text)
439
 
440
+ # Mapear labels
441
  label_map = {
442
  "positive": "positivo",
443
  "negative": "negativo",
444
  "neutral": "neutro",
445
  "POSITIVE": "positivo",
446
  "NEGATIVE": "negativo",
447
+ "NEUTRAL": "neutro",
448
+ "1 star": "negativo",
449
+ "5 stars": "positivo"
450
  }
451
 
452
  if isinstance(result, list) and len(result) > 0:
 
464
  "sentiment_raw": label,
465
  "confidence": round(score, 4),
466
  "all_scores": [{"label": r.label, "score": round(r.score, 4)} for r in result] if isinstance(result, list) else [],
467
+ "model": "distilbert-base-multilingual"
468
  }
469
 
470
  set_cached_response(cache_key, response)
 
480
  return {
481
  "object": "list",
482
  "data": [
483
+ {"id": "lfm2-8b-gguf", "object": "model", "owned_by": "liquidai", "description": "Chat rápido (GGUF Q4)"},
484
  {"id": "gemma-3-vision", "object": "model", "owned_by": "google", "description": "Análise de imagens"},
485
  {"id": "bge-m3", "object": "model", "owned_by": "baai", "description": "Embeddings multilíngue"},
486
  {"id": "xlm-roberta-classify", "object": "model", "owned_by": "joeddav", "description": "Classificação zero-shot multilíngue"},
487
+ {"id": "mt5-summarize", "object": "model", "owned_by": "csebuetnlp", "description": "Sumarização multilíngue"},
488
  {"id": "distilbert-sentiment", "object": "model", "owned_by": "lxyuan", "description": "Análise de sentimento multilíngue"}
489
  ]
490
  }
 
495
  "status": "healthy",
496
  "timestamp": datetime.now().isoformat(),
497
  "cache_size": len(response_cache),
498
+ "version": "4.1.0",
499
  "models": {
500
+ "chat": "LiquidAI/LFM2-8B-A1B-GGUF (Q4)",
501
  "vision": "google/gemma-3-27b-it",
502
  "embeddings": "BAAI/bge-m3",
503
  "classify": "joeddav/xlm-roberta-large-xnli",
requirements.txt CHANGED
@@ -2,7 +2,4 @@ fastapi==0.109.0
2
  uvicorn[standard]==0.27.0
3
  huggingface-hub>=0.25.0
4
  python-multipart==0.0.6
5
- torch>=2.0.0
6
- git+https://github.com/huggingface/transformers.git
7
- accelerate>=0.27.0
8
- sentencepiece
 
2
  uvicorn[standard]==0.27.0
3
  huggingface-hub>=0.25.0
4
  python-multipart==0.0.6
5
+ llama-cpp-python>=0.2.70