# Configure these values.

# 'lora' or 'full'
# lora - train a small network for a character or style, or both. quite versatile.
# full - requires lots of vram, trains very slowly, needs a lot of data and concepts.
export MODEL_TYPE='lora'

# Set this to 'true' if you are training a Stable Diffusion 3 checkpoint.
# Use MODEL_NAME="stabilityai/stable-diffusion-3-medium-diffusers"
export STABLE_DIFFUSION_3=false
# Similarly, this is to train PixArt Sigma (1K or 2K) models.
# Use MODEL_NAME="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS"
export PIXART_SIGMA=false
# For older Stable Diffusion 1.x/2.x models, enable this.
# Use MODEL_NAME="stabilityai/stable-diffusion-2-1"
export STABLE_DIFFUSION_LEGACY=false
# For Kwai-Kolors, enable KOLORS.
# Use MODEL_NAME="kwai-kolors/kolors-diffusers"
export KOLORS=false
# For Flux, if you have 8 GPUs and DeepSpeed configured.
# Use MODEL_NAME="black-forest-labs/FLUX.1-dev"
export FLUX=true

# ControlNet model training is only supported when MODEL_TYPE='full'
# See this document for more information: https://github.com/bghira/SimpleTuner/blob/main/documentation/CONTROLNET.md
# DeepFloyd, PixArt, and SD3 do not currently support ControlNet model training.
export CONTROLNET=false

# DoRA enhances the training style of LoRA, but it will run more slowly at the same rank.
# See: https://arxiv.org/abs/2402.09353
# See: https://github.com/huggingface/peft/pull/1474
export USE_DORA=false

# The BitFit freeze strategy freezes everything in the U-Net except the biases.
# This may help retain the full model's underlying capabilities.
# BitFit with LoRA is currently untested and not known to work.
#if [[ "$MODEL_TYPE" == "full" ]]; then
#    # When training a full model, we will rely on BitFit to keep the u-net intact.
#    export USE_BITFIT=true
#elif [[ "$MODEL_TYPE" == "lora" ]]; then
#    # LoRA can not use BitFit.
#    export USE_BITFIT=false
#elif [[ "$MODEL_TYPE" == "deepfloyd-full" ]]; then
#    export USE_BITFIT=true
#fi

# Restart where we left off. Change this to "checkpoint-1234" to start from a specific checkpoint.
export RESUME_CHECKPOINT="latest"

# How often to checkpoint. Depending on your learning rate, you may wish to change this.
# For the default settings with 10 gradient accumulations, more frequent checkpoints might be preferable at first.
export CHECKPOINTING_STEPS=150
# This is how many checkpoints we will keep. Two is safe, but three is safer.
export CHECKPOINTING_LIMIT=2

# This is a relatively conservative constant learning rate.
# Adjust higher or lower depending on how burnt your model becomes.
export LEARNING_RATE=8e-7 #@param {type:"number"}
# Prodigy adapts the learning rate on its own, so it should start at 1.0.
# This overrides the value above; it takes effect because OPTIMIZER="prodigy" is set further down.
export LEARNING_RATE=1.0 #@param {type:"number"}

# Using a Hugging Face Hub model:
export MODEL_NAME="black-forest-labs/FLUX.1-dev"
# Using a local path to a huggingface hub model or saved checkpoint:
#export MODEL_NAME="/datasets/models/pipeline"

# Make DEBUG_EXTRA_ARGS empty to disable wandb.
#export DEBUG_EXTRA_ARGS="--report_to=wandb"
export DEBUG_EXTRA_ARGS="--report_to=tensorboard"
export TRACKER_PROJECT_NAME="sdxl-training"
export TRACKER_RUN_NAME="simpletuner-sdxl"

# Either a max number of steps or a number of epochs can be used, but not both.
export MAX_NUM_STEPS=6000
# Will likely overtrain, but that's fine.
export NUM_EPOCHS=0
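# A minimal sanity check you could add yourself (not part of SimpleTuner, which
# performs its own validation): since steps and epochs are mutually exclusive,
# bail out early if both are non-zero.
if [[ "${MAX_NUM_STEPS:-0}" -gt 0 && "${NUM_EPOCHS:-0}" -gt 0 ]]; then
    echo "Set either MAX_NUM_STEPS or NUM_EPOCHS, not both." >&2
    return 1 2>/dev/null || exit 1
fi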
# A convenient prefix for all of your training paths.
# These may be absolute or relative paths. Here, an absolute path is used,
# so the models will be written to "${BASE_DIR}/models".
export BASE_DIR="/nvme/home/mikaelh/Stable_Diffusion/bghira/output"
export DATALOADER_CONFIG="${BASE_DIR}/multidatabackend_sanna_marin_flux.json"
export OUTPUT_DIR="${BASE_DIR}/models"

# Set this to "true" to push your model to Hugging Face Hub.
export PUSH_TO_HUB="false"
# If PUSH_TO_HUB and PUSH_CHECKPOINTS are both enabled, every saved checkpoint will be pushed to Hugging Face Hub.
export PUSH_CHECKPOINTS="true"
# This will be the model name for your final hub upload, e.g. "yourusername/yourmodelname".
# It defaults to the wandb project name, but you can override this here.
export HUB_MODEL_NAME=$TRACKER_PROJECT_NAME

# By default, images will be resized so their SMALLER EDGE is 1024 pixels, maintaining aspect ratio.
# Setting this value to 768px might result in more reasonable training data sizes for SDXL.
export RESOLUTION=1024
# If you want to have the training data resized by pixel area (megapixels) rather than edge length,
# set this value to "area" instead of "pixel", and uncomment the next RESOLUTION declaration.
export RESOLUTION_TYPE="pixel"
#export RESOLUTION=1          # 1.0 megapixel training sizes
# If RESOLUTION_TYPE="pixel", the minimum resolution specifies the smaller edge length, measured in pixels. Recommended: 1024.
# If RESOLUTION_TYPE="area", the minimum resolution specifies the total image area, measured in megapixels. Recommended: 1.
export MINIMUM_RESOLUTION=$RESOLUTION
# How many decimals to round aspect buckets to.
#export ASPECT_BUCKET_ROUNDING=2

# Use this to append an instance prompt to each caption, used for adding trigger words.
# This has not been tested in SDXL.
#export INSTANCE_PROMPT="lotr style "
# If you also supply a user prompt library or `--use_prompt_library`, this will be added to those lists.
export VALIDATION_PROMPT="closeup of sanna marin"
export VALIDATION_GUIDANCE=7.5
# You'll want to set this to 0.7 if you are training a terminal SNR model.
export VALIDATION_GUIDANCE_RESCALE=0.0
# How frequently we will save and run a pipeline for validations.
export VALIDATION_STEPS=100
export VALIDATION_NUM_INFERENCE_STEPS=20
export VALIDATION_NEGATIVE_PROMPT="blurry, cropped, ugly"
export VALIDATION_SEED=42
export VALIDATION_RESOLUTION="1024x1024"

# Adjust this for your GPU memory size. This, and resolution, are the biggest VRAM killers.
export TRAIN_BATCH_SIZE=1
# Accumulate your update gradient over many steps, to save VRAM while still having a higher effective batch size:
# effective batch size = ($TRAIN_BATCH_SIZE * $GRADIENT_ACCUMULATION_STEPS).
export GRADIENT_ACCUMULATION_STEPS=1

# Use any standard scheduler type: constant, polynomial, constant_with_warmup.
export LR_SCHEDULE="constant_with_warmup"
# A warmup period allows the model, and more importantly the EMA weights, to familiarise
# themselves with the data before the full learning rate is applied.
# For the cosine or sine type schedules, the warmup period defines the interval between peaks or valleys.
# Use a sine schedule to simulate a warmup period, or a cosine period to simulate a polynomial start.
export LR_WARMUP_STEPS=$((MAX_NUM_STEPS / 10))
#export LR_WARMUP_STEPS=1000
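# Purely diagnostic, and safe to delete: echo the derived values so the arithmetic
# above is visible at a glance. With the settings in this file, the effective batch
# size is 1 * 1 = 1 and the warmup runs for 6000 / 10 = 600 steps.
echo "effective batch size: $((TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS))"
echo "LR warmup steps:      ${LR_WARMUP_STEPS}"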
# Caption dropout probability. Set to 0.1 for 10% of captions dropped out. Set to 0 to disable.
# You may wish to disable dropout if you want to limit your changes strictly to the prompts you show the model.
# You may wish to increase the rate of dropout if you want to more broadly adopt your changes across the model.
export CAPTION_DROPOUT_PROBABILITY=0.1

export METADATA_UPDATE_INTERVAL=65

# How many workers to use for VAE caching.
export MAX_WORKERS=32
# Read and write batch sizes for VAE caching.
export READ_BATCH_SIZE=25
export WRITE_BATCH_SIZE=64
# How many images to encode at once with the VAE. Can increase VRAM use.
export VAE_BATCH_SIZE=12
# How many images to process at once (resize, crop, transform) during VAE caching.
export IMAGE_PROCESSING_BATCH_SIZE=32
# When using large batch sizes, you'll need to increase the pool connection limit.
export AWS_MAX_POOL_CONNECTIONS=128
# For very large systems, setting this can reduce the CPU overhead of torch spawning an unnecessarily large number of threads.
export TORCH_NUM_THREADS=8

# If this is set, any images that fail to open will be DELETED to avoid re-checking them every time.
export DELETE_ERRORED_IMAGES=0
# If this is set, any images that are too small for the minimum resolution size will be DELETED.
export DELETE_SMALL_IMAGES=0

# ByteDance recommends these be set to "trailing" so that inference and training behave in a more congruent manner.
# To follow the original SDXL training strategy, use "leading" instead, though results are generally worse.
export TRAINING_SCHEDULER_TIMESTEP_SPACING="trailing"
export INFERENCE_SCHEDULER_TIMESTEP_SPACING="trailing"

# Removing this option or unsetting it uses vanilla training. Setting it reweights the loss by the position of the timestep in the noise schedule.
# A value of 5 is recommended by the researchers; 20 has the least impact and 1 the most.
export MIN_SNR_GAMMA=5

# Set this to an explicit value of "false" to disable Xformers. Probably required for AMD users.
export USE_XFORMERS=false

# There's basically no reason to unset this. However, to disable it, use an explicit value of "false".
# This will save a lot of memory consumption when enabled.
export USE_GRADIENT_CHECKPOINTING=true

##
# Options below here may require a bit more complicated configuration, so they are not simple variables.
##

# TF32 is great on Ampere or Ada; earlier generations do not support it.
export ALLOW_TF32=true

# AdamW 8Bit is a robust and lightweight choice. Adafactor might reduce memory consumption, and Dadaptation is slow and experimental.
# AdamW is the default optimizer, but it uses a lot of memory and is slower than AdamW8Bit or Adafactor.
# Choices: adamw, adamw8bit, adafactor, dadaptation, prodigy
#export OPTIMIZER="adamw_bf16"
export OPTIMIZER="prodigy"

# EMA is a strong regularisation method that uses a lot of extra VRAM to hold a second copy of the weights.
# This is worthwhile on large training runs, but not so much for smaller training runs.
export USE_EMA=false
export EMA_DECAY=0.999

export TRAINER_EXTRA_ARGS="--base_model_precision=fp8-quanto --base_model_default_dtype=fp32"

## For offset noise training:
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --offset_noise --noise_offset=0.02"

## For terminal SNR training:
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --prediction_type=v_prediction --rescale_betas_zero_snr"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --training_scheduler_timestep_spacing=trailing --inference_scheduler_timestep_spacing=trailing"
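# Note on the pattern above: each uncommented "${TRAINER_EXTRA_ARGS} --flag" line
# appends to whatever came before it, so the optional blocks here can be mixed and
# matched. For example (illustrative only, both lines left disabled here):
#   export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --offset_noise --noise_offset=0.02"
#   export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --snr_gamma=5.0"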
## You may benefit from directing training toward a specific weighted subset of timesteps.
# In this example, we train the final 25% of the timestep schedule with a 3x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=later --timestep_bias_portion=0.25 --timestep_bias_multiplier=3"
# In this example, we train the earliest 25% of the timestep schedule with a 5x bias.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=earlier --timestep_bias_portion=0.25 --timestep_bias_multiplier=5"
# Here, we designate that timesteps 200 to 500 specifically should be prioritised.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --timestep_bias_strategy=range --timestep_bias_begin=200 --timestep_bias_end=500 --timestep_bias_multiplier=3"

## For experimental min-SNR weighted loss training (5 is the value suggested by the original researchers):
# Not recommended for terminal SNR models.
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --snr_gamma=5.0"

# For the Wasabi S3 filesystem backend (experimental):
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --data_backend=aws --aws_bucket_name=test123"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_endpoint_url=https://s3.wasabisys.com"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_access_key=1234567890"
#export TRAINER_EXTRA_ARGS="${TRAINER_EXTRA_ARGS} --aws_secret_access_key=0987654321"

# Reproducible training. Set to -1 to disable.
export TRAINING_SEED=42

# Mixed precision is the best. You honestly might need to YOLO it in fp16 mode for Google Colab type setups.
export MIXED_PRECISION="bf16"
# Pure bf16 might not be supported on all GPUs; fp32 will be needed for those.
export PURE_BF16=true

# This has to be changed if you're training with multiple GPUs.
export TRAINING_NUM_PROCESSES=1
export TRAINING_NUM_MACHINES=1
export ACCELERATE_EXTRA_ARGS=""      # --multi_gpu or other similar flags for huggingface accelerate

# With PyTorch 2.1, you might have pretty good luck here.
# If you're using aspect bucketing however, each resolution change will recompile. Seriously, just don't do it.
# Well, then again... PyTorch 2.2 has support for dynamic shapes. Why not?
export TRAINING_DYNAMO_BACKEND='no'  # 'inductor' enables torch compile; 'no' disables it in case of performance issues or lack of support (eg. AMD)
export TOKENIZERS_PARALLELISM=false
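# Typical usage, assuming the upstream SimpleTuner layout where train.sh sources
# an environment file like this one before launching accelerate (the exact file
# name and entry point depend on your SimpleTuner version):
#   source sdxl-env.sh
#   bash train.sh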