Jn-Huang committed
Commit 0600d50 · 1 Parent(s): 89babab
Reduce vLLM GPU memory utilization to 0.7 to avoid OOM on T4 GPU
app.py CHANGED
@@ -25,7 +25,7 @@ def load_model():
         enable_lora=True,
         max_lora_rank=64,
         dtype="float16",
-        gpu_memory_utilization=0.9,
+        gpu_memory_utilization=0.7,  # Reduced from 0.9 to avoid OOM on T4 GPU
         trust_remote_code=True,
     )
 
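In vLLM, gpu_memory_utilization is the fraction of total GPU memory the engine pre-allocates for model weights, activations, and KV cache, so 0.9 of a 16 GB T4 leaves little headroom for the CUDA context and LoRA adapters. Below is a minimal sketch of the surrounding load_model() call, assuming vLLM's LLM constructor; the model name, variable name, and return value are hypothetical, and only the keyword arguments come from the diff:

    from vllm import LLM

    def load_model():
        llm = LLM(
            model="meta-llama/Llama-3.1-8B-Instruct",  # assumed base model, not shown in the diff
            enable_lora=True,              # serve LoRA adapters at request time
            max_lora_rank=64,              # highest adapter rank the engine accepts
            dtype="float16",               # T4 (compute capability 7.5) has no bfloat16
            gpu_memory_utilization=0.7,    # fraction of GPU memory vLLM pre-allocates
            trust_remote_code=True,
        )
        return llm

Lowering the value trades KV-cache capacity (and thus maximum concurrent sequences) for headroom, which is usually the right trade on a 16 GB card.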
@@ -82,8 +82,11 @@ def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9)
 def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
     # Build conversation in Llama 3.1 chat format
     messages = []
-
-
+
+    # Add system prompt (use default if not provided)
+    if not system_prompt:
+        system_prompt = "You are Be.FM, a helpful and knowledgeable AI assistant. Provide clear, accurate, and concise responses."
+    messages.append({"role": "system", "content": system_prompt})
 
     # History is already in dict format: [{"role": "user", "content": "..."}, ...]
     for msg in (history or []):
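The second hunk ensures every conversation begins with a system message, falling back to a default Be.FM prompt when the field is left empty. A minimal sketch of the patched chat_fn end to end; the history loop body, the final user-turn append, and the call into generate_response are assumed from context, since the diff truncates after the for line:

    def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
        # Build conversation in Llama 3.1 chat format
        messages = []

        # Add system prompt (use default if not provided)
        if not system_prompt:
            system_prompt = ("You are Be.FM, a helpful and knowledgeable AI assistant. "
                             "Provide clear, accurate, and concise responses.")
        messages.append({"role": "system", "content": system_prompt})

        # History is already in dict format: [{"role": "user", "content": "..."}, ...]
        for msg in (history or []):
            messages.append(msg)  # assumed loop body; the diff cuts off here

        # Assumed continuation: append the new user turn and delegate to generate_response
        messages.append({"role": "user", "content": message})
        return generate_response(messages, max_new_tokens, temperature, top_p)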