{ "architectures": [ "VASTDiT3" ], "audio_hidden_size": 1152, "audio_in_channels": 8, "audio_input_size": [ null, null ], "audio_patch_size": [ 4, 1 ], "caption_channels": 4096, "class_dropout_prob": 0.1, "depth": 28, "drop_path": 0.0, "enable_flash_attn": true, "enable_layernorm_kernel": false, "enable_sequence_parallelism": false, "freeze_audio_branch": false, "freeze_video_branch": true, "freeze_y_embedder": false, "hidden_size": 1152, "in_channels": 4, "input_size": [ null, null, null ], "input_sq_size": 512, "mlp_ratio": 4.0, "model_max_length": 300, "model_type": "VASTDiT3", "num_heads": 16, "only_infer_audio": false, "only_train_audio": true, "only_train_temporal": false, "patch_size": [ 1, 2, 2 ], "pred_sigma": true, "qk_norm": true, "skip_y_embedder": false, "spatial_prior_len": 16, "st_prior_channel": 64, "temporal_prior_len": 16, "torch_dtype": "bfloat16", "train_st_prior_attn": false, "train_va_cross_attn": false, "transformers_version": "4.39.3", "weight_init_from": [ "./checkpoints/OpenSora-STDiT-v3/model.safetensors" ] }