{
  "core": {
    "model_name": "Qwen/Qwen2.5-3B-Instruct",
    "lora_rank": 16,
    "max_seq_length": 2048,
    "load_in_4bit": 0,
    "model_dir": "Final/Qwen2.5-3B-Instruct-calib-grpo-low",
    "dataset_name": "gsm8k",
    "dataset_split": "train",
    "test_dataset_split": "test",
    "trainer_type": "grpo_dpo",
    "calibration": true
  },
  "training": {
    "learning_rate": 5e-06,
    "weight_decay": 0.1,
    "max_grad_norm": 0.1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "max_steps": 1000,
    "seed": 0
  },
  "sched_optim": {
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "optim": "adamw_8bit",
    "adam_beta1": 0.9,
    "adam_beta2": 0.99
  },
  "generation": {
    "num_generations": 8,
    "max_prompt_length": 1024,
    "max_completion_length": 1024
  },
  "algorithm": {
    "loss_type": "grpo",
    "epsilon": 0.2,
    "epsilon_high": 0.2,
    "mask_truncated_completions": 0,
    "scale_rewards": "group",
    "importance_sampling_level": "token"
  },
  "dpo": {
    "lambda_pair": 0.01,
    "pair_threshold": 2.0,
    "beta_dpo": 0.2,
    "pair_mining": "all",
    "max_pairs_per_group": 6,
    "implicit_ref": true
  },
  "logging": {
    "logging_steps": 1,
    "save_steps": 50,
    "report_to": "wandb",
    "wandb_api_key": null
  }
}
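A minimal sketch of how a training script might consume this sectioned config, using only the Python standard library. The file path `config.json`, the `load_config` helper, and the flattening behavior are assumptions for illustration, not part of any library; keys are assumed to be unique across sections, as they are in the file above.

```python
import json


def load_config(path: str) -> dict:
    """Read the sectioned JSON config and flatten it into one flat dict.

    Section names ("core", "training", "sched_optim", ...) are dropped so
    downstream code can look up parameters directly, e.g. cfg["lora_rank"].
    """
    with open(path) as f:
        sections = json.load(f)
    flat = {}
    for section, params in sections.items():
        for key, value in params.items():
            # Guard against a key appearing in two sections, which would
            # otherwise be silently overwritten.
            if key in flat:
                raise ValueError(f"duplicate key {key!r} in section {section!r}")
            flat[key] = value
    return flat


if __name__ == "__main__":
    cfg = load_config("config.json")  # hypothetical location of the file above
    print(cfg["model_name"], cfg["learning_rate"], cfg["num_generations"])
```

Note that `load_in_4bit` and `mask_truncated_completions` are stored as `0`/`1` integers while `calibration` and `implicit_ref` are JSON booleans, so a consumer coercing flags should accept both forms.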