{
  "core": {
    "model_name": "Qwen/Qwen2.5-3B-Instruct",
    "lora_rank": 16,
    "max_seq_length": 2048,
    "load_in_4bit": false,
    "model_dir": "Final/Qwen2.5-3B-Instruct-calib-grpo-low",
    "dataset_name": "gsm8k",
    "dataset_split": "train",
    "test_dataset_split": "test",
    "trainer_type": "grpo_dpo",
    "calibration": true
  },
  "training": {
    "learning_rate": 5e-06,
    "weight_decay": 0.1,
    "max_grad_norm": 0.1,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "max_steps": 1000,
    "seed": 0
  },
  "sched_optim": {
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.1,
    "optim": "adamw_8bit",
    "adam_beta1": 0.9,
    "adam_beta2": 0.99
  },
  "generation": {
    "num_generations": 8,
    "max_prompt_length": 1024,
    "max_completion_length": 1024
  },
  "algorithm": {
    "loss_type": "grpo",
    "epsilon": 0.2,
    "epsilon_high": 0.2,
    "mask_truncated_completions": false,
    "scale_rewards": "group",
    "importance_sampling_level": "token"
  },
  "dpo": {
    "lambda_pair": 0.01,
    "pair_threshold": 2.0,
    "beta_dpo": 0.2,
    "pair_mining": "all",
    "max_pairs_per_group": 6,
    "implicit_ref": true
  },
  "logging": {
    "logging_steps": 1,
    "save_steps": 50,
    "report_to": "wandb",
    "wandb_api_key": null
  }
}