import argparse

from nemo_run.config import get_nemorun_home

from .utils import DEFAULT_NEMO_HOME


def parse_cli_args():
    """
    Command line arguments corresponding to the Slurm cluster and NeMo 2.0 settings used for
    running pre-training and fine-tuning experiments.
    """
    parser = argparse.ArgumentParser(description="NeMo2.0 Performance Pretraining and Fine-Tuning")

    parser.add_argument(
        "-a",
        "--account",
        type=str,
        help="Slurm account to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-p",
        "--partition",
        type=str,
        help="Slurm partition to use for experiment",
        required=True,
    )
    parser.add_argument(
        "-g",
        "--gpu",
        type=str,
        choices=["h100", "b200", "gb200"],
        help="Target gpu type.",
        required=True,
    )
    parser.add_argument(
        "-l",
        "--log_dir",
        type=str,
        help=f"Directory for logging experiment results. Defaults to {get_nemorun_home()}",
        required=False,
        default=get_nemorun_home(),
    )
    parser.add_argument(
        "-t",
        "--time_limit",
        type=str,
        help="Maximum time limit to run experiment for. Defaults to 30 minutes (format- 'HH:MM:SS')",
        required=False,
        default="00:30:00",
    )
    container_img_msg = [
        "NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'",
        "Make sure your NGC credentials are accessible in your environment.",
    ]
    parser.add_argument(
        "-i",
        "--container_image",
        type=str,
        help=" ".join(container_img_msg),
        required=False,
        default="nvcr.io/nvidia/nemo:dev",
    )
    parser.add_argument(
        "-c",
        "--compute_dtype",
        type=str,
        choices=["bf16", "fp8"],
        help="Compute precision. Options- bf16 or fp8. Defaults to bf16",
        required=False,
        default="bf16",
    )
    fp8_recipe_msg = (
        "FP8 recipe. Options- ds (per-tensor delayed scaling), cs (per-tensor current scaling), "
        "mxfp8, ss (subchannel scaling). Defaults to ds"
    )
    parser.add_argument(
        "-fr",
        "--fp8_recipe",
        type=str,
        choices=["ds", "cs", "mxfp8", "ss"],
        help=fp8_recipe_msg,
        required=False,
        default="ds",
    )
    parser.add_argument(
        "-en",
        "--enable_nsys",
        help="Enable Nsys profiling. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-em",
        "--enable_memory_profile",
        help="Enable memory usage profiling. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-mp",
        "--memory_profile_out_path",
        type=str,
        help="Path to the output file of memory profiling",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-tb",
        "--tensorboard",
        help="Enable tensorboard logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wd",
        "--wandb",
        help="Enable wandb logging. Disabled by default",
        action="store_true",
    )
    parser.add_argument(
        "-wdk",
        "--wandb_key",
        type=str,
        help="wandb API key. Needed for the wandb logger to log results to the server",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdp",
        "--wandb_prj_name",
        type=str,
        help="wandb project name",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-wdj",
        "--wandb_job_name",
        type=str,
        help="wandb job name",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-f",
        "--finetuning",
        choices=["sft", "lora"],
        help="Finetuning scheme to use. Defaults to 'lora'",
        default="lora",
    )
    parser.add_argument(
        "-hf",
        "--hf_token",
        type=str,
        help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
        default=None,
    )
    nemo_home_msg = [
        "Sets env var `NEMO_HOME` (on compute node using sbatch script)- directory where NeMo searches",
        "for models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints already",
        f"exist here. Missing files will be downloaded here from HuggingFace. Defaults to {DEFAULT_NEMO_HOME}",
    ]
    parser.add_argument(
        "-nh",
        "--nemo_home",
        type=str,
        help=" ".join(nemo_home_msg),
        default=DEFAULT_NEMO_HOME,
    )
    parser.add_argument(
        "-d",
        "--dryrun",
        help="If set, prints the sbatch script to the terminal without launching the experiment.",
        required=False,
        action="store_true",
    )
    parser.add_argument(
        "-tp",
        "--tensor_parallel_size",
        type=int,
        help="Intra-layer model parallelism. Splits tensors across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-pp",
        "--pipeline_parallel_size",
        type=int,
        help="Inter-layer model parallelism. Splits transformer layers across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cp",
        "--context_parallel_size",
        type=int,
        help="Splits network input along sequence dimension across GPU ranks.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-vp",
        "--virtual_pipeline_parallel_size",
        type=int,
        help="Virtual pipeline parallel size, i.e. the number of virtual blocks per pipeline model parallel rank.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ep",
        "--expert_parallel_size",
        type=int,
        help="Distributes MoE experts across the sub data parallel dimension.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-et",
        "--expert_tensor_parallel_size",
        type=lambda x: int(x) if x is not None else None,
        nargs="?",
        const=None,
        help="Intra-layer tensor model parallelism for the expert layer. Splits tensors across GPU ranks. "
        "Use -et/--expert_tensor_parallel_size <space> for None or -et/--expert_tensor_parallel_size <int>",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-mb",
        "--micro_batch_size",
        type=int,
        help="Micro batch size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gb",
        "--global_batch_size",
        type=int,
        help="Global batch size.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ng",
        "--num_gpus",
        type=int,
        help="Number of gpus.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-gn",
        "--gpus_per_node",
        type=int,
        help="Number of gpus per node. Defaults to 8",
        required=False,
        default=8,
    )
    parser.add_argument(
        "-ms",
        "--max_steps",
        type=int,
        help="Number of train steps. Defaults to 100",
        required=False,
        default=100,
    )

    def bool_arg(arg):
        if arg.lower() in ['true', '1', 't', 'yes', 'y']:
            return True
        elif arg.lower() in ['false', '0', 'f', 'no', 'n']:
            return False
        else:
            raise ValueError(f"Invalid value for boolean argument: {arg}")
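
    # Note: bool_arg lets boolean-style flags (e.g. --cuda_graphs, --use_mcore_fsdp)
    # accept explicit values such as "true", "1", "no", or "f" on the command line;
    # unrecognized strings raise ValueError.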
    parser.add_argument(
        "-cg",
        "--cuda_graphs",
        help="Enable CUDA graphs. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-fsdp",
        "--use_mcore_fsdp",
        help="Enable Megatron Core (Mcore) FSDP. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-fsdp_db",
        "--use_fsdp_double_buffer",
        help="Enable FSDP double buffer. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ubr",
        "--use_user_buffer_registration",
        help="Enable user buffer registration. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-sharp",
        "--use_sharp",
        help="Enable SHARP. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )
    parser.add_argument(
        "-rl",
        "--recompute_layers",
        type=int,
        help="Number of Transformer layers to recompute, where all the intermediate "
        "activations of a Transformer layer are recomputed. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-ol",
        "--activation_offload_layers",
        type=int,
        help="Number of Transformer layers whose activations are offloaded to CPU memory. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "--nccl_communicator_config_path",
        type=str,
        help="Path to NCCL communicator config yaml file",
        required=False,
        default=None,
    )

    def list_of_strings(arg):
        return arg.split(',')
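
    # Note: list_of_strings (used as the type for --custom_mounts) turns a
    # comma-separated value such as "/src:/dst,/data:/data" into
    # ['/src:/dst', '/data:/data'].
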
    parser.add_argument(
        "-rm",
        "--recompute_modules",
        nargs="*",
        const=None,
        type=str,
        help="List of modules to perform selective activation recompute on. "
        "Users can provide zero or more module names. Defaults to None",
        required=False,
        default=None,
    )
    parser.add_argument(
        "-cm",
        "--custom_mounts",
        type=list_of_strings,
        help="Comma separated string of mounts",
        required=False,
        default=[],
    )
    parser.add_argument(
        "--use_hf_tokenizer",
        help="Use HuggingFace tokenizer. Disabled by default; a null tokenizer is used if not enabled.",
        action="store_true",
        required=False,
    )
    parser.add_argument(
        "-dcdfr",
        "--dump_config_diff_from_base_recipe",
        help="Dump the config diff from the base recipe. Defaults to False",
        action="store_true",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--keep_fsdp_fp8_transpose_cache",
        help="Keep FSDP FP8 transpose cache. Disabled by default",
        type=bool_arg,
        required=False,
        default=None,
    )

    return parser
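

# Example usage (illustrative sketch, not part of this module): a launcher script
# would typically build the parser and read the parsed flags, e.g.
#
#     parser = parse_cli_args()
#     args = parser.parse_args()
#     if args.compute_dtype == "fp8":
#         recipe = args.fp8_recipe  # one of: ds, cs, mxfp8, ss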