Jn-Huang committed
Commit 0600d50 · 1 Parent(s): 89babab

Reduce vLLM GPU memory utilization to 0.7 to avoid OOM on T4 GPU

Files changed (1)
app.py +6 -3
app.py CHANGED
@@ -25,7 +25,7 @@ def load_model():
         enable_lora=True,
         max_lora_rank=64,
         dtype="float16",
-        gpu_memory_utilization=0.9,
+        gpu_memory_utilization=0.7,  # Reduced from 0.9 to avoid OOM on T4 GPU
         trust_remote_code=True,
     )
 
@@ -82,8 +82,11 @@ def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9)
 def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
     # Build conversation in Llama 3.1 chat format
     messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": system_prompt})
+
+    # Add system prompt (use default if not provided)
+    if not system_prompt:
+        system_prompt = "You are Be.FM, a helpful and knowledgeable AI assistant. Provide clear, accurate, and concise responses."
+    messages.append({"role": "system", "content": system_prompt})
 
     # History is already in dict format: [{"role": "user", "content": "..."}, ...]
     for msg in (history or []):
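
For context, a minimal sketch of the engine setup the first hunk tunes, using vLLM's LLM class. The model name below is a placeholder, since the diff does not show it; only the keyword arguments visible in the hunk are confirmed by this commit.

from vllm import LLM

def load_model():
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # assumption: actual model path not visible in this diff
        enable_lora=True,
        max_lora_rank=64,
        dtype="float16",
        # Fraction of GPU memory vLLM pre-allocates for weights and KV cache;
        # 0.7 leaves headroom for the CUDA context and activations on a 16 GB T4.
        gpu_memory_utilization=0.7,
        trust_remote_code=True,
    )
    return llm

gpu_memory_utilization caps how much of the card vLLM claims up front, so at 0.9 on a 16 GB T4 the remaining slack can be too small for the CUDA context and runtime overhead, which is the OOM this commit works around.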
 
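For completeness, a minimal sketch of how chat_fn plausibly assembles the full conversation after the second hunk. The history loop body and the final generate_response call are assumptions extrapolated from the context lines and the hunk header, not shown in full by this commit.

def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
    # Build conversation in Llama 3.1 chat format
    messages = []

    # Add system prompt (use default if not provided)
    if not system_prompt:
        system_prompt = ("You are Be.FM, a helpful and knowledgeable AI assistant. "
                         "Provide clear, accurate, and concise responses.")
    messages.append({"role": "system", "content": system_prompt})

    # History is already in dict format: [{"role": "user", "content": "..."}, ...]
    for msg in (history or []):
        # assumption: loop body not shown in the diff
        messages.append({"role": msg["role"], "content": msg["content"]})

    # Append the current user turn, then generate; call shape assumed from the
    # generate_response signature in the hunk header
    messages.append({"role": "user", "content": message})
    return generate_response(messages, max_new_tokens=max_new_tokens,
                             temperature=temperature, top_p=top_p)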