massage command
Browse files
README.md
CHANGED
|
@@ -49,7 +49,7 @@ Final estimate: PPL = 5.4403 +/- 0.03421 (wiki.test.raw, compare to Q8_0 at 5.31
|
|
| 49 |
# This example for 24GB VRAM + 96 GB RAM + 16 physical core CPU
|
| 50 |
# Offload first ffn layers 0-9 on GPU VRAM.
|
| 51 |
# Leave remaining ffn layers on CPU RAM.
|
| 52 |
-
./build/bin/llama-server
|
| 53 |
--model ubergarm/Qwen3-235B-A22B-GGUF/Qwen3-235B-A22B-mix-IQ3_K-00001-of-00003.gguf \
|
| 54 |
--alias ubergarm/Qwen3-235B-A22B-mix-IQ3_K \
|
| 55 |
-fa -fmoe \
|
|
@@ -59,7 +59,7 @@ Final estimate: PPL = 5.4403 +/- 0.03421 (wiki.test.raw, compare to Q8_0 at 5.31
|
|
| 59 |
-ot blk\.[0-9]\.ffn.*=CUDA0 \
|
| 60 |
-ot "blk.*\.ffn.*=CPU" \
|
| 61 |
-ngl 99 \
|
| 62 |
-
--threads 16
|
| 63 |
-ub 4096 -b 4096 \
|
| 64 |
--host 127.0.0.1 \
|
| 65 |
--port 8080
|
|
|
|
| 49 |
# This example for 24GB VRAM + 96 GB RAM + 16 physical core CPU
|
| 50 |
# Offload first ffn layers 0-9 on GPU VRAM.
|
| 51 |
# Leave remaining ffn layers on CPU RAM.
|
| 52 |
+
./build/bin/llama-server \
|
| 53 |
--model ubergarm/Qwen3-235B-A22B-GGUF/Qwen3-235B-A22B-mix-IQ3_K-00001-of-00003.gguf \
|
| 54 |
--alias ubergarm/Qwen3-235B-A22B-mix-IQ3_K \
|
| 55 |
-fa -fmoe \
|
|
|
|
| 59 |
-ot blk\.[0-9]\.ffn.*=CUDA0 \
|
| 60 |
-ot "blk.*\.ffn.*=CPU" \
|
| 61 |
-ngl 99 \
|
| 62 |
+
--threads 16 \
|
| 63 |
-ub 4096 -b 4096 \
|
| 64 |
--host 127.0.0.1 \
|
| 65 |
--port 8080
|