Update eval configs
Browse files
nemo-evaluator-launcher-configs/{local_nvidia-nemotron-nano-3-30b-a3b-base.yaml → local_nvidia-nemotron-3-nano-30b-a3b-base.yaml}
RENAMED
|
@@ -12,19 +12,38 @@
|
|
| 12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
defaults:
|
| 16 |
- execution: local
|
| 17 |
-
- deployment:
|
| 18 |
- _self_
|
| 19 |
|
| 20 |
execution:
|
| 21 |
-
output_dir: NVIDIA-Nemotron-Nano-
|
| 22 |
# mode: sequential # enables sequential execution
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# specify the benchmarks to evaluate
|
| 30 |
evaluation:
|
|
@@ -35,29 +54,25 @@ evaluation:
|
|
| 35 |
params:
|
| 36 |
max_retries: 5 # number of retries for API requests
|
| 37 |
request_timeout: 360 # timeout for API requests in seconds
|
| 38 |
-
parallelism:
|
|
|
|
| 39 |
extra:
|
| 40 |
-
tokenizer: nvidia/NVIDIA-Nemotron-Nano-
|
| 41 |
tokenizer_backend: huggingface
|
| 42 |
tasks:
|
| 43 |
- name: adlr_mmlu_pro_5_shot_base
|
| 44 |
- name: adlr_mmlu
|
| 45 |
- name: adlr_agieval_en_cot
|
| 46 |
-
- name: adlr_gpqa_diamond_cot_5_shot
|
| 47 |
- name: adlr_humaneval_greedy
|
| 48 |
-
- name: adlr_humaneval_sampled
|
| 49 |
- name: adlr_mbpp_sanitized_3_shot_greedy
|
| 50 |
-
- name: adlr_mbpp_sanitized_3_shot_sampled
|
| 51 |
- name: adlr_gsm8k_cot_8_shot
|
| 52 |
- name: adlr_minerva_math_nemo_4_shot
|
| 53 |
- name: adlr_math_500_4_shot_sampled
|
| 54 |
-
- name: adlr_commonsense_qa_7_shot
|
| 55 |
- name: adlr_arc_challenge_llama_25_shot
|
| 56 |
- name: hellaswag
|
| 57 |
- name: openbookqa
|
| 58 |
- name: piqa
|
| 59 |
- name: adlr_race
|
| 60 |
- name: adlr_winogrande_5_shot
|
| 61 |
-
- name: social_iqa
|
| 62 |
- name: adlr_global_mmlu_lite_5_shot
|
| 63 |
- name: adlr_mgsm_native_cot_8_shot
|
|
|
|
| 12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
+
#
|
| 16 |
+
#
|
| 17 |
+
# How to use:
|
| 18 |
+
#
|
| 19 |
+
# 1. copy this file locally or clone the repository
|
| 20 |
+
# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
|
| 21 |
+
# 3. export your HF token in the terminal (some benchmark datasets might be gated)
|
| 22 |
+
# 4. run `nemo-evaluator-launcher run --config path/to/local_nvidia-nemotron-3-nano-30b-a3b-base.yaml`
|
| 23 |
+
#
|
| 24 |
+
# ⚠️ WARNING:
|
| 25 |
+
# Always run full evaluations (without limit_samples) for actual benchmark results.
|
| 26 |
+
# Using a subset of samples is solely for testing configuration and setup.
|
| 27 |
+
# Results from such test runs should NEVER be used to compare models or
|
| 28 |
+
# report benchmark performance.
|
| 29 |
defaults:
|
| 30 |
- execution: local
|
| 31 |
+
- deployment: vllm
|
| 32 |
- _self_
|
| 33 |
|
| 34 |
execution:
|
| 35 |
+
output_dir: NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
|
| 36 |
# mode: sequential # enables sequential execution
|
| 37 |
|
| 38 |
+
# specify deployment arguments
|
| 39 |
+
deployment:
|
| 40 |
+
image: vllm/vllm-openai:v0.12.0
|
| 41 |
+
checkpoint_path: null
|
| 42 |
+
hf_model_handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
|
| 43 |
+
served_model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
|
| 44 |
+
tensor_parallel_size: 1
|
| 45 |
+
data_parallel_size: 1
|
| 46 |
+
extra_args: "--max-model-len 262144 --mamba_ssm_cache_dtype float32 --no-enable-prefix-caching"
|
| 47 |
|
| 48 |
# specify the benchmarks to evaluate
|
| 49 |
evaluation:
|
|
|
|
| 54 |
params:
|
| 55 |
max_retries: 5 # number of retries for API requests
|
| 56 |
request_timeout: 360 # timeout for API requests in seconds
|
| 57 |
+
parallelism: 4 # number of parallel requests
|
| 58 |
+
# limit_samples: 10 # uncomment to limit number of samples for quick testing
|
| 59 |
extra:
|
| 60 |
+
tokenizer: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
|
| 61 |
tokenizer_backend: huggingface
|
| 62 |
tasks:
|
| 63 |
- name: adlr_mmlu_pro_5_shot_base
|
| 64 |
- name: adlr_mmlu
|
| 65 |
- name: adlr_agieval_en_cot
|
|
|
|
| 66 |
- name: adlr_humaneval_greedy
|
|
|
|
| 67 |
- name: adlr_mbpp_sanitized_3_shot_greedy
|
|
|
|
| 68 |
- name: adlr_gsm8k_cot_8_shot
|
| 69 |
- name: adlr_minerva_math_nemo_4_shot
|
| 70 |
- name: adlr_math_500_4_shot_sampled
|
|
|
|
| 71 |
- name: adlr_arc_challenge_llama_25_shot
|
| 72 |
- name: hellaswag
|
| 73 |
- name: openbookqa
|
| 74 |
- name: piqa
|
| 75 |
- name: adlr_race
|
| 76 |
- name: adlr_winogrande_5_shot
|
|
|
|
| 77 |
- name: adlr_global_mmlu_lite_5_shot
|
| 78 |
- name: adlr_mgsm_native_cot_8_shot
|
nemo-evaluator-launcher-configs/local_qwen3-30b-a3b-base.yaml
CHANGED
|
@@ -12,19 +12,37 @@
|
|
| 12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
defaults:
|
| 16 |
- execution: local
|
| 17 |
-
- deployment:
|
| 18 |
- _self_
|
| 19 |
|
| 20 |
execution:
|
| 21 |
-
output_dir:
|
| 22 |
# mode: sequential # enables sequential execution
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# specify the benchmarks to evaluate
|
| 30 |
evaluation:
|
|
@@ -35,7 +53,8 @@ evaluation:
|
|
| 35 |
params:
|
| 36 |
max_retries: 5 # number of retries for API requests
|
| 37 |
request_timeout: 360 # timeout for API requests in seconds
|
| 38 |
-
parallelism:
|
|
|
|
| 39 |
extra:
|
| 40 |
tokenizer: Qwen/Qwen3-30B-A3B-Base
|
| 41 |
tokenizer_backend: huggingface
|
|
@@ -43,21 +62,16 @@ evaluation:
|
|
| 43 |
- name: adlr_mmlu_pro_5_shot_base
|
| 44 |
- name: adlr_mmlu
|
| 45 |
- name: adlr_agieval_en_cot
|
| 46 |
-
- name: adlr_gpqa_diamond_cot_5_shot
|
| 47 |
- name: adlr_humaneval_greedy
|
| 48 |
-
- name: adlr_humaneval_sampled
|
| 49 |
- name: adlr_mbpp_sanitized_3_shot_greedy
|
| 50 |
-
- name: adlr_mbpp_sanitized_3_shot_sampled
|
| 51 |
- name: adlr_gsm8k_cot_8_shot
|
| 52 |
- name: adlr_minerva_math_nemo_4_shot
|
| 53 |
- name: adlr_math_500_4_shot_sampled
|
| 54 |
-
- name: adlr_commonsense_qa_7_shot
|
| 55 |
- name: adlr_arc_challenge_llama_25_shot
|
| 56 |
- name: hellaswag
|
| 57 |
- name: openbookqa
|
| 58 |
- name: piqa
|
| 59 |
- name: adlr_race
|
| 60 |
- name: adlr_winogrande_5_shot
|
| 61 |
-
- name: social_iqa
|
| 62 |
- name: adlr_global_mmlu_lite_5_shot
|
| 63 |
- name: adlr_mgsm_native_cot_8_shot
|
|
|
|
| 12 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
+
#
|
| 16 |
+
#
|
| 17 |
+
# How to use:
|
| 18 |
+
#
|
| 19 |
+
# 1. copy this file locally
|
| 20 |
+
# 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
|
| 21 |
+
# 3. export your HF token in the terminal (some benchmark datasets might be gated)
|
| 22 |
+
# 4. run `nemo-evaluator-launcher run --config path/to/local_qwen3-30b-a3b-base.yaml`
|
| 23 |
+
#
|
| 24 |
+
# ⚠️ WARNING:
|
| 25 |
+
# Always run full evaluations (without limit_samples) for actual benchmark results.
|
| 26 |
+
# Using a subset of samples is solely for testing configuration and setup.
|
| 27 |
+
# Results from such test runs should NEVER be used to compare models or
|
| 28 |
+
# report benchmark performance.
|
| 29 |
defaults:
|
| 30 |
- execution: local
|
| 31 |
+
- deployment: vllm
|
| 32 |
- _self_
|
| 33 |
|
| 34 |
execution:
|
| 35 |
+
output_dir: Qwen3-30B-A3B-Base
|
| 36 |
# mode: sequential # enables sequential execution
|
| 37 |
|
| 38 |
+
# specify deployment arguments
|
| 39 |
+
deployment:
|
| 40 |
+
image: vllm/vllm-openai:v0.11.0
|
| 41 |
+
checkpoint_path: null
|
| 42 |
+
hf_model_handle: Qwen/Qwen3-30B-A3B-Base
|
| 43 |
+
served_model_name: Qwen/Qwen3-30B-A3B-Base
|
| 44 |
+
tensor_parallel_size: 1
|
| 45 |
+
data_parallel_size: 1
|
| 46 |
|
| 47 |
# specify the benchmarks to evaluate
|
| 48 |
evaluation:
|
|
|
|
| 53 |
params:
|
| 54 |
max_retries: 5 # number of retries for API requests
|
| 55 |
request_timeout: 360 # timeout for API requests in seconds
|
| 56 |
+
parallelism: 4 # number of parallel requests
|
| 57 |
+
# limit_samples: 10 # uncomment to limit number of samples for quick testing
|
| 58 |
extra:
|
| 59 |
tokenizer: Qwen/Qwen3-30B-A3B-Base
|
| 60 |
tokenizer_backend: huggingface
|
|
|
|
| 62 |
- name: adlr_mmlu_pro_5_shot_base
|
| 63 |
- name: adlr_mmlu
|
| 64 |
- name: adlr_agieval_en_cot
|
|
|
|
| 65 |
- name: adlr_humaneval_greedy
|
|
|
|
| 66 |
- name: adlr_mbpp_sanitized_3_shot_greedy
|
|
|
|
| 67 |
- name: adlr_gsm8k_cot_8_shot
|
| 68 |
- name: adlr_minerva_math_nemo_4_shot
|
| 69 |
- name: adlr_math_500_4_shot_sampled
|
|
|
|
| 70 |
- name: adlr_arc_challenge_llama_25_shot
|
| 71 |
- name: hellaswag
|
| 72 |
- name: openbookqa
|
| 73 |
- name: piqa
|
| 74 |
- name: adlr_race
|
| 75 |
- name: adlr_winogrande_5_shot
|
|
|
|
| 76 |
- name: adlr_global_mmlu_lite_5_shot
|
| 77 |
- name: adlr_mgsm_native_cot_8_shot
|