Jn-Huang committed
Commit 0600d50 · 1 Parent(s): 89babab
Reduce vLLM GPU memory utilization to 0.7 to avoid OOM on T4 GPU
app.py CHANGED
@@ -25,7 +25,7 @@ def load_model():
         enable_lora=True,
         max_lora_rank=64,
         dtype="float16",
-        gpu_memory_utilization=0.9,
+        gpu_memory_utilization=0.7,  # Reduced from 0.9 to avoid OOM on T4 GPU
         trust_remote_code=True,
     )
 
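In vLLM, gpu_memory_utilization is the fraction of total GPU memory the engine pre-allocates for model weights, activations, and KV cache, so 0.9 of a 16 GB T4 leaves little headroom for the CUDA context and LoRA adapters. Below is a minimal sketch of the surrounding load_model() call, assuming vLLM's LLM constructor; the model name, variable name, and return value are hypothetical, and only the keyword arguments come from the diff:

    from vllm import LLM

    def load_model():
        llm = LLM(
            model="meta-llama/Llama-3.1-8B-Instruct",  # assumed base model, not shown in the diff
            enable_lora=True,              # serve LoRA adapters at request time
            max_lora_rank=64,              # highest adapter rank the engine accepts
            dtype="float16",               # T4 (compute capability 7.5) has no bfloat16
            gpu_memory_utilization=0.7,    # fraction of GPU memory vLLM pre-allocates
            trust_remote_code=True,
        )
        return llm

Lowering the value trades KV-cache capacity (and thus maximum concurrent sequences) for headroom, which is usually the right trade on a 16 GB card.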
@@ -82,8 +82,11 @@ def generate_response(messages, max_new_tokens=512, temperature=0.7, top_p=0.9)
 def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
     # Build conversation in Llama 3.1 chat format
     messages = []
-
-
+
+    # Add system prompt (use default if not provided)
+    if not system_prompt:
+        system_prompt = "You are Be.FM, a helpful and knowledgeable AI assistant. Provide clear, accurate, and concise responses."
+    messages.append({"role": "system", "content": system_prompt})
 
     # History is already in dict format: [{"role": "user", "content": "..."}, ...]
     for msg in (history or []):
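The second hunk ensures every conversation begins with a system message, falling back to a default Be.FM prompt when the field is left empty. A minimal sketch of the patched chat_fn end to end; the history loop body, the final user-turn append, and the call into generate_response are assumed from context, since the diff truncates after the for line:

    def chat_fn(message, history, system_prompt, max_new_tokens, temperature, top_p):
        # Build conversation in Llama 3.1 chat format
        messages = []

        # Add system prompt (use default if not provided)
        if not system_prompt:
            system_prompt = ("You are Be.FM, a helpful and knowledgeable AI assistant. "
                             "Provide clear, accurate, and concise responses.")
        messages.append({"role": "system", "content": system_prompt})

        # History is already in dict format: [{"role": "user", "content": "..."}, ...]
        for msg in (history or []):
            messages.append(msg)  # assumed loop body; the diff cuts off here

        # Assumed continuation: append the new user turn and delegate to generate_response
        messages.append({"role": "user", "content": message})
        return generate_response(messages, max_new_tokens, temperature, top_p)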