sanjeevnv committed on
Commit
da8e0df
·
1 Parent(s): fd23d10

Update eval configs

Browse files
nemo-evaluator-launcher-configs/{local_nvidia-nemotron-nano-3-30b-a3b-base.yaml → local_nvidia-nemotron-3-nano-30b-a3b-base.yaml} RENAMED
@@ -12,19 +12,38 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  defaults:
16
  - execution: local
17
- - deployment: none
18
  - _self_
19
 
20
  execution:
21
- output_dir: NVIDIA-Nemotron-Nano-3-30B-A3B-Base-BF16
22
  # mode: sequential # enables sequential execution
23
 
24
- target:
25
- api_endpoint:
26
- model_id: nvidia/NVIDIA-Nemotron-Nano-3-30B-A3B-Base-BF16
27
- url: http://0.0.0.0:8000/v1/chat/completions # locally hosted endpoint
 
 
 
 
 
28
 
29
  # specify the benchmarks to evaluate
30
  evaluation:
@@ -35,29 +54,25 @@ evaluation:
35
  params:
36
  max_retries: 5 # number of retries for API requests
37
  request_timeout: 360 # timeout for API requests in seconds
38
- parallelism: 1 # number of parallel requests
 
39
  extra:
40
- tokenizer: nvidia/NVIDIA-Nemotron-Nano-3-30B-A3B-Base-BF16
41
  tokenizer_backend: huggingface
42
  tasks:
43
  - name: adlr_mmlu_pro_5_shot_base
44
  - name: adlr_mmlu
45
  - name: adlr_agieval_en_cot
46
- - name: adlr_gpqa_diamond_cot_5_shot
47
  - name: adlr_humaneval_greedy
48
- - name: adlr_humaneval_sampled
49
  - name: adlr_mbpp_sanitized_3_shot_greedy
50
- - name: adlr_mbpp_sanitized_3_shot_sampled
51
  - name: adlr_gsm8k_cot_8_shot
52
  - name: adlr_minerva_math_nemo_4_shot
53
  - name: adlr_math_500_4_shot_sampled
54
- - name: adlr_commonsense_qa_7_shot
55
  - name: adlr_arc_challenge_llama_25_shot
56
  - name: hellaswag
57
  - name: openbookqa
58
  - name: piqa
59
  - name: adlr_race
60
  - name: adlr_winogrande_5_shot
61
- - name: social_iqa
62
  - name: adlr_global_mmlu_lite_5_shot
63
  - name: adlr_mgsm_native_cot_8_shot
 
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
+ #
16
+ #
17
+ # How to use:
18
+ #
19
+ # 1. copy this file locally or clone the repository
20
+ # 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
21
+ # 3. export your HF token in the terminal (some benchmark datasets might be gated)
22
+ # 4. run `nemo-evaluator-launcher run --config path/to/local_nvidia-nemotron-3-nano-30b-a3b-base.yaml`
23
+ #
24
+ # ⚠️ WARNING:
25
+ # Always run full evaluations (without limit_samples) for actual benchmark results.
26
+ # Using a subset of samples is solely for testing configuration and setup.
27
+ # Results from such test runs should NEVER be used to compare models or
28
+ # report benchmark performance.
29
  defaults:
30
  - execution: local
31
+ - deployment: vllm
32
  - _self_
33
 
34
  execution:
35
+ output_dir: NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
36
  # mode: sequential # enables sequential execution
37
 
38
+ # specify deployment arguments
39
+ deployment:
40
+ image: vllm/vllm-openai:v0.12.0
41
+ checkpoint_path: null
42
+ hf_model_handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
43
+ served_model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
44
+ tensor_parallel_size: 1
45
+ data_parallel_size: 1
46
+ extra_args: "--max-model-len 262144 --mamba_ssm_cache_dtype float32 --no-enable-prefix-caching"
47
 
48
  # specify the benchmarks to evaluate
49
  evaluation:
 
54
  params:
55
  max_retries: 5 # number of retries for API requests
56
  request_timeout: 360 # timeout for API requests in seconds
57
+ parallelism: 4 # number of parallel requests
58
+ # limit_samples: 10 # uncomment to limit number of samples for quick testing
59
  extra:
60
+ tokenizer: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
61
  tokenizer_backend: huggingface
62
  tasks:
63
  - name: adlr_mmlu_pro_5_shot_base
64
  - name: adlr_mmlu
65
  - name: adlr_agieval_en_cot
 
66
  - name: adlr_humaneval_greedy
 
67
  - name: adlr_mbpp_sanitized_3_shot_greedy
 
68
  - name: adlr_gsm8k_cot_8_shot
69
  - name: adlr_minerva_math_nemo_4_shot
70
  - name: adlr_math_500_4_shot_sampled
 
71
  - name: adlr_arc_challenge_llama_25_shot
72
  - name: hellaswag
73
  - name: openbookqa
74
  - name: piqa
75
  - name: adlr_race
76
  - name: adlr_winogrande_5_shot
 
77
  - name: adlr_global_mmlu_lite_5_shot
78
  - name: adlr_mgsm_native_cot_8_shot
nemo-evaluator-launcher-configs/local_qwen3-30b-a3b-base.yaml CHANGED
@@ -12,19 +12,37 @@
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  defaults:
16
  - execution: local
17
- - deployment: none
18
  - _self_
19
 
20
  execution:
21
- output_dir: qwen3-30b-a3b-base
22
  # mode: sequential # enables sequential execution
23
 
24
- target:
25
- api_endpoint:
26
- model_id: Qwen/Qwen3-30B-A3B-Base
27
- url: http://0.0.0.0:8000/v1/chat/completions # locally hosted endpoint
 
 
 
 
28
 
29
  # specify the benchmarks to evaluate
30
  evaluation:
@@ -35,7 +53,8 @@ evaluation:
35
  params:
36
  max_retries: 5 # number of retries for API requests
37
  request_timeout: 360 # timeout for API requests in seconds
38
- parallelism: 1 # number of parallel requests
 
39
  extra:
40
  tokenizer: Qwen/Qwen3-30B-A3B-Base
41
  tokenizer_backend: huggingface
@@ -43,21 +62,16 @@ evaluation:
43
  - name: adlr_mmlu_pro_5_shot_base
44
  - name: adlr_mmlu
45
  - name: adlr_agieval_en_cot
46
- - name: adlr_gpqa_diamond_cot_5_shot
47
  - name: adlr_humaneval_greedy
48
- - name: adlr_humaneval_sampled
49
  - name: adlr_mbpp_sanitized_3_shot_greedy
50
- - name: adlr_mbpp_sanitized_3_shot_sampled
51
  - name: adlr_gsm8k_cot_8_shot
52
  - name: adlr_minerva_math_nemo_4_shot
53
  - name: adlr_math_500_4_shot_sampled
54
- - name: adlr_commonsense_qa_7_shot
55
  - name: adlr_arc_challenge_llama_25_shot
56
  - name: hellaswag
57
  - name: openbookqa
58
  - name: piqa
59
  - name: adlr_race
60
  - name: adlr_winogrande_5_shot
61
- - name: social_iqa
62
  - name: adlr_global_mmlu_lite_5_shot
63
  - name: adlr_mgsm_native_cot_8_shot
 
12
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
+ #
16
+ #
17
+ # How to use:
18
+ #
19
+ # 1. copy this file locally
20
+ # 2. (optional) uncomment limit_samples in the config file to run with 10 samples for quick testing
21
+ # 3. export your HF token in the terminal (some benchmark datasets might be gated)
22
+ # 4. run `nemo-evaluator-launcher run --config path/to/local_qwen3-30b-a3b-base.yaml`
23
+ #
24
+ # ⚠️ WARNING:
25
+ # Always run full evaluations (without limit_samples) for actual benchmark results.
26
+ # Using a subset of samples is solely for testing configuration and setup.
27
+ # Results from such test runs should NEVER be used to compare models or
28
+ # report benchmark performance.
29
  defaults:
30
  - execution: local
31
+ - deployment: vllm
32
  - _self_
33
 
34
  execution:
35
+ output_dir: Qwen3-30B-A3B-Base
36
  # mode: sequential # enables sequential execution
37
 
38
+ # specify deployment arguments
39
+ deployment:
40
+ image: vllm/vllm-openai:v0.11.0
41
+ checkpoint_path: null
42
+ hf_model_handle: Qwen/Qwen3-30B-A3B-Base
43
+ served_model_name: Qwen/Qwen3-30B-A3B-Base
44
+ tensor_parallel_size: 1
45
+ data_parallel_size: 1
46
 
47
  # specify the benchmarks to evaluate
48
  evaluation:
 
53
  params:
54
  max_retries: 5 # number of retries for API requests
55
  request_timeout: 360 # timeout for API requests in seconds
56
+ parallelism: 4 # number of parallel requests
57
+ # limit_samples: 10 # uncomment to limit number of samples for quick testing
58
  extra:
59
  tokenizer: Qwen/Qwen3-30B-A3B-Base
60
  tokenizer_backend: huggingface
 
62
  - name: adlr_mmlu_pro_5_shot_base
63
  - name: adlr_mmlu
64
  - name: adlr_agieval_en_cot
 
65
  - name: adlr_humaneval_greedy
 
66
  - name: adlr_mbpp_sanitized_3_shot_greedy
 
67
  - name: adlr_gsm8k_cot_8_shot
68
  - name: adlr_minerva_math_nemo_4_shot
69
  - name: adlr_math_500_4_shot_sampled
 
70
  - name: adlr_arc_challenge_llama_25_shot
71
  - name: hellaswag
72
  - name: openbookqa
73
  - name: piqa
74
  - name: adlr_race
75
  - name: adlr_winogrande_5_shot
 
76
  - name: adlr_global_mmlu_lite_5_shot
77
  - name: adlr_mgsm_native_cot_8_shot