Create Trainining_Config

Trainining_Config (ADDED, +244 -0)
# Configure these values.

# 'lora' or 'full'
# lora - train a small network for a character or style, or both. Quite versatile.
# full - requires lots of VRAM, trains very slowly, needs a lot of data and concepts.
export MODEL_TYPE='lora'

# SDXL is trained by default, but you will need to enable one of these options for anything else.

# Set this to 'true' if you are training a Stable Diffusion 3 checkpoint.
# Use MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
export STABLE_DIFFUSION_3=false
# Similarly, this is to train PixArt Sigma (1K or 2K) models.
# Use MODEL_NAME="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
export PIXART_SIGMA=false
# For old Stable Diffusion 1.x/2.x models, you'll enable this.
# Use MODEL_NAME="stabilityai/stable-diffusion-2-1"
export STABLE_DIFFUSION_LEGACY=false
# For Kwai-Kolors, enable KOLORS.
# Use MODEL_NAME="kwai-kolors/kolors-diffusers"
export KOLORS=false
# For Flux, if you have 8 GPUs and DeepSpeed configured.
# Use MODEL_NAME="black-forest-labs/FLUX.1-dev"
export FLUX=true

# ControlNet model training is only supported when MODEL_TYPE='full'.
# See this document for more information: https://github.com/bghira/SimpleTuner/blob/main/documentation/CONTROLNET.md
# DeepFloyd, PixArt, and SD3 do not currently support ControlNet model training.
export CONTROLNET=false

# DoRA enhances the training style of LoRA, but it will run more slowly at the same rank.
# See: https://arxiv.org/abs/2402.09353
# See: https://github.com/huggingface/peft/pull/1474
export USE_DORA=false

# The BitFit freeze strategy for the U-Net freezes everything except the biases.
# This may help retain the full model's underlying capabilities. BitFit with LoRA is currently untested and not known to work.
#if [[ "$MODEL_TYPE" == "full" ]]; then
#  # When training a full model, we will rely on BitFit to keep the U-Net intact.
#  export USE_BITFIT=true
#elif [[ "$MODEL_TYPE" == "lora" ]]; then
#  # LoRA can not use BitFit.
#  export USE_BITFIT=false
#elif [[ "$MODEL_TYPE" == "deepfloyd-full" ]]; then
#  export USE_BITFIT=true
#fi

# Restart where we left off. Change this to "checkpoint-1234" to resume from a specific checkpoint.
export RESUME_CHECKPOINT="latest"
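# For example, to resume from step 1234's checkpoint instead of the most recent one:
#export RESUME_CHECKPOINT="checkpoint-1234"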

# How often to checkpoint. Depending on your learning rate, you may wish to change this.
# For the default settings with 10 gradient accumulations, more frequent checkpoints might be preferable at first.
export CHECKPOINTING_STEPS=100
# This is how many checkpoints we will keep. Two is safe, but three is safer.
export CHECKPOINTING_LIMIT=3

# This is a relatively conservative 'constant' learning rate.
# Adjust higher or lower depending on how burnt your model becomes.
# export LEARNING_RATE=8e-7 #@param {type:"number"}
export LEARNING_RATE=0.0001 #@param {type:"number"}

# Using a Hugging Face Hub model:
export MODEL_NAME="black-forest-labs/FLUX.1-dev"
# Using a local path to a Hugging Face Hub model or saved checkpoint:
#export MODEL_NAME="/datasets/models/pipeline"

# Make DEBUG_EXTRA_ARGS empty to disable wandb.
export DEBUG_EXTRA_ARGS="--report_to=wandb"
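# For example, to turn wandb reporting off entirely:
#export DEBUG_EXTRA_ARGS=""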
export TRACKER_PROJECT_NAME="hvvshimFluxV1"
export TRACKER_RUN_NAME="flux-V1"

# Either a max number of steps or a number of epochs can be used, but not both.
export MAX_NUM_STEPS=3000
# Will likely overtrain, but that's fine.
export NUM_EPOCHS=0
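# To train by epoch count instead (the epoch value below is only illustrative),
# zero out the step limit and set the epochs:
#export MAX_NUM_STEPS=0
#export NUM_EPOCHS=25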

# A convenient prefix for all of your training paths.
# These may be absolute or relative paths. Here, we are using relative paths.
# The output will just be in a folder called "output/models" by default.
export DATALOADER_CONFIG="config/multidatabackend.json"
export OUTPUT_DIR="output/models"

# Set this to "true" to push your model to Hugging Face Hub.
export PUSH_TO_HUB="true"
# If PUSH_TO_HUB and PUSH_CHECKPOINTS are both enabled, every saved checkpoint will be pushed to Hugging Face Hub.
export PUSH_CHECKPOINTS="true"
# This will be the model name for your final hub upload, e.g. "yourusername/yourmodelname".
# It defaults to the wandb project name, but you can override this here.
export HUB_MODEL_NAME=$TRACKER_PROJECT_NAME
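# For example, to override the default with an explicit repository name:
#export HUB_MODEL_NAME="yourusername/yourmodelname"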

# By default, images will be resized so their SMALLER EDGE is 1024 pixels, maintaining aspect ratio.
# Setting this value to 768px might result in more reasonable training data sizes for SDXL.
export RESOLUTION=1024
# If you want to have the training data resized by pixel area (megapixels) rather than edge length,
# set this value to "area" instead of "pixel", and uncomment the next RESOLUTION declaration.
export RESOLUTION_TYPE="pixel"
#export RESOLUTION=1 # 1.0 megapixel training sizes
# If RESOLUTION_TYPE="pixel", the minimum resolution specifies the smaller edge length, measured in pixels. Recommended: 1024.
# If RESOLUTION_TYPE="area", the minimum resolution specifies the total image area, measured in megapixels. Recommended: 1.
export MINIMUM_RESOLUTION=$RESOLUTION
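# A minimal sketch of the area-based sizing described above, using the recommended values:
#export RESOLUTION_TYPE="area"
#export RESOLUTION=1           # 1.0 megapixels per training image
#export MINIMUM_RESOLUTION=1   # also measured in megapixels when RESOLUTION_TYPE="area"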

# How many decimals to round aspect buckets to.
#export ASPECT_BUCKET_ROUNDING=2

# Use this to append an instance prompt to each caption, used for adding trigger words.
# This has not been tested in SDXL.
#export INSTANCE_PROMPT="lotr style "
# If you also supply a user prompt library or `--use_prompt_library`, this will be added to those lists.
export VALIDATION_PROMPT="a portrait of hvvshim man on the moon"
export VALIDATION_GUIDANCE=3.5
# You'll want to set this to 0.7 if you are training a terminal SNR model.
export VALIDATION_GUIDANCE_RESCALE=0.0
# How frequently we will save and run a pipeline for validations.
export VALIDATION_STEPS=100
export VALIDATION_NUM_INFERENCE_STEPS=20
export VALIDATION_NEGATIVE_PROMPT="blurry, cropped, ugly, fat"
export VALIDATION_SEED=42
export VALIDATION_RESOLUTION=$RESOLUTION


# Adjust this for your GPU memory size. This, and resolution, are the biggest VRAM killers.
export TRAIN_BATCH_SIZE=1
# Accumulate your update gradient over many steps, to save VRAM while still having a higher effective batch size:
# effective batch size = ($TRAIN_BATCH_SIZE * $GRADIENT_ACCUMULATION_STEPS).
export GRADIENT_ACCUMULATION_STEPS=1
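# Worked example of the formula above: TRAIN_BATCH_SIZE=1 with GRADIENT_ACCUMULATION_STEPS=4
# gives an effective batch size of 1 * 4 = 4, while only one sample's activations sit in VRAM at a time.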
# How many images to encode at once with the VAE. Increasing this can raise VRAM use.
export VAE_BATCH_SIZE=1

# Use any standard scheduler type: constant, polynomial, constant_with_warmup.
export LR_SCHEDULE="constant_with_warmup"
# A warmup period allows the model, and more importantly the EMA weights, to familiarise themselves with the current quanta.
# For the cosine or sine type schedules, the warmup period defines the interval between peaks or valleys.
# Use a sine schedule to simulate a warmup period, or a cosine period to simulate a polynomial start.
export LR_WARMUP_STEPS=$((MAX_NUM_STEPS / 10))
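# With MAX_NUM_STEPS=3000 above, this arithmetic expansion evaluates to 3000 / 10 = 300 warmup steps.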
# export LR_WARMUP_STEPS=1000

# Caption dropout probability. Set to 0.1 for 10% of captions dropped out. Set to 0 to disable.
# You may wish to disable dropout if you want to limit your changes strictly to the prompts you show the model.
# You may wish to increase the rate of dropout if you want to more broadly adopt your changes across the model.
export CAPTION_DROPOUT_PROBABILITY=0.1

export METADATA_UPDATE_INTERVAL=65

# How many workers to use for VAE caching.
export MAX_WORKERS=32
# Read and write batch sizes for VAE caching.
export READ_BATCH_SIZE=25
export WRITE_BATCH_SIZE=64
# How many images to process at once (resize, crop, transform) during VAE caching.
export IMAGE_PROCESSING_BATCH_SIZE=32
# When using large batch sizes, you'll need to increase the pool connection limit.
export AWS_MAX_POOL_CONNECTIONS=128
# For very large systems, setting this can reduce the CPU overhead of torch spawning an unnecessarily large number of threads.
export TORCH_NUM_THREADS=8

# If this is set, any images that fail to open will be DELETED to avoid re-checking them every time.
export DELETE_ERRORED_IMAGES=0
# If this is set, any images that are too small for the minimum resolution size will be DELETED.
export DELETE_SMALL_IMAGES=0

# ByteDance recommends these be set to "trailing" so that inference and training behave in a more congruent manner.
# To follow the original SDXL training strategy, use "leading" instead, though results are generally worse.
export TRAINING_SCHEDULER_TIMESTEP_SPACING="trailing"
export INFERENCE_SCHEDULER_TIMESTEP_SPACING="trailing"

# Removing this option or unsetting it uses vanilla training. Setting it reweights the loss by the position of the timestep in the noise schedule.
# A value of "5" is recommended by the researchers. A value of "20" has the least impact, and "1" the most.
export MIN_SNR_GAMMA=5

# Set this to an explicit value of "false" to disable Xformers. Probably required for AMD users.
export USE_XFORMERS=false

# There's basically no reason to unset this. However, to disable it, use an explicit value of "false".
# This will save a lot of memory consumption when enabled.
export USE_GRADIENT_CHECKPOINTING=true

##
# Options below here may require a bit more complicated configuration, so they are not simple variables.
##

# TF32 is great on Ampere or Ada; earlier generations do not support it.
export ALLOW_TF32=true

# AdamW 8Bit is a robust and lightweight choice. Adafactor might reduce memory consumption, and Dadaptation is slow and experimental.
# AdamW is the default optimizer, but it uses a lot of memory and is slower than AdamW8Bit or Adafactor.
# NOTE: When training a quantised base model, you can't use adamw_bf16. Instead, try adafactor or adamw.
# Choices: adamw, adamw8bit, adafactor, dadaptation, adamw_bf16
export OPTIMIZER="adamw_bf16"


# EMA is a strong regularisation method that uses a lot of extra VRAM to hold two copies of the weights.
# This is worthwhile on large training runs, but not so much for smaller training runs.
# NOTE: EMA is not currently applied to LoRA.
export USE_EMA=false
export EMA_DECAY=0.999

export TRAINER_EXTRA_ARGS="--base_model_precision=int8-quanto"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --keep_vae_loaded"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --lora_rank=32"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --lora_alpha=32"
export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --text_encoder_1_precision=no_change --text_encoder_2_precision=no_change"

## For offset noise training, append the offset-noise flags here.
# Not recommended for terminal SNR models.

## For terminal SNR training:
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --prediction_type=v_prediction --rescale_betas_zero_snr"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --training_scheduler_timestep_spacing=trailing --inference_scheduler_timestep_spacing=trailing"
## You may benefit from directing training toward a specific weighted subset of timesteps.
# In this example, we train the final 25% of the timestep schedule with a 3x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=later --timestep_bias_portion=0.25 --timestep_bias_multiplier=3"
# In this example, we train the earliest 25% of the timestep schedule with a 5x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=earlier --timestep_bias_portion=0.25 --timestep_bias_multiplier=5"
# Here, we designate that timesteps 200 to 500 specifically should be prioritised.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=range --timestep_bias_begin=200 --timestep_bias_end=500 --timestep_bias_multiplier=3"

## For experimental min-SNR weighted loss training (5 is the value suggested by the original researchers):
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --snr_gamma=5.0"

# For the Wasabi S3 filesystem backend (experimental):
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --data_backend=aws --aws_bucket_name=test123"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_endpoint_url=https://s3.wasabisys.com"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_access_key=1234567890"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_secret_access_key=0987654321"


# Reproducible training. Set to -1 to disable.
export TRAINING_SEED=42

# Mixed precision is the best. You honestly might need to YOLO it in fp16 mode for Google Colab type setups.
export MIXED_PRECISION="bf16" # Might not be supported on all GPUs. fp32 will be needed for others.
export PURE_BF16=true

# This has to be changed if you're training with multiple GPUs. A sketch follows below.
export TRAINING_NUM_PROCESSES=1
export TRAINING_NUM_MACHINES=1
export ACCELERATE_EXTRA_ARGS="" # --multi_gpu or other similar flags for huggingface accelerate
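# A sketch for a single machine with 8 GPUs (values are illustrative; adjust to your hardware):
#export TRAINING_NUM_PROCESSES=8
#export TRAINING_NUM_MACHINES=1
#export ACCELERATE_EXTRA_ARGS="--multi_gpu"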

# With PyTorch 2.1, you might have pretty good luck here.
# If you're using aspect bucketing however, each resolution change will recompile. Seriously, just don't do it.
# Well, then again... PyTorch 2.2 has support for dynamic shapes. Why not?
export TRAINING_DYNAMO_BACKEND='no' # set to a backend such as 'inductor' to enable torch compile; keep 'no' in case of performance issues or lack of support (e.g. AMD)

export TOKENIZERS_PARALLELISM=false