Commit ·
bb847d7
1
Parent(s): 6e83b2a
add-missing-configs-and-vae-safetensors (#1)
Browse files- Add model_index, submodule configs, VAE diffusion safetensors (b0951622340ac0f56ad28b30af5950e97351df01)
Co-authored-by: Wang Zhipeng <princepride@users.noreply.huggingface.co>
- audio_vae/config.json +4 -0
- model_index.json +51 -0
- sr/config.json +19 -0
- text_encoder/.gitattributes +36 -0
- text_encoder/config.json +8 -7
- transformer/config.json +15 -0
- vae/config.json +125 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
audio_vae/config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "SAAudioFeatureExtractor",
|
| 3 |
+
"model_type": "sa_audio_feature_extractor"
|
| 4 |
+
}
|
model_index.json
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "MagiHumanPipeline",
|
| 3 |
+
"_diffusers_version": "0.37.0.dev0",
|
| 4 |
+
"transformer": [
|
| 5 |
+
"magi_human",
|
| 6 |
+
"DiTModel"
|
| 7 |
+
],
|
| 8 |
+
"sr": [
|
| 9 |
+
"magi_human",
|
| 10 |
+
"DiTModel"
|
| 11 |
+
],
|
| 12 |
+
"audio_vae": [
|
| 13 |
+
"magi_human",
|
| 14 |
+
"SAAudioFeatureExtractor"
|
| 15 |
+
],
|
| 16 |
+
"text_encoder": [
|
| 17 |
+
"transformers",
|
| 18 |
+
"T5GemmaForCausalLM"
|
| 19 |
+
],
|
| 20 |
+
"vae": [
|
| 21 |
+
"diffusers",
|
| 22 |
+
"AutoencoderKLWan"
|
| 23 |
+
],
|
| 24 |
+
"fps": 25,
|
| 25 |
+
"num_inference_steps": 8,
|
| 26 |
+
"video_txt_guidance_scale": 5.0,
|
| 27 |
+
"audio_txt_guidance_scale": 5.0,
|
| 28 |
+
"sr_video_txt_guidance_scale": 3.5,
|
| 29 |
+
"shift": 5.0,
|
| 30 |
+
"cfg_number": 1,
|
| 31 |
+
"sr_cfg_number": 2,
|
| 32 |
+
"noise_value": 220,
|
| 33 |
+
"use_cfg_trick": true,
|
| 34 |
+
"cfg_trick_start_frame": 13,
|
| 35 |
+
"cfg_trick_value": 2.0,
|
| 36 |
+
"using_sde_flag": false,
|
| 37 |
+
"sr_audio_noise_scale": 0.7,
|
| 38 |
+
"t5_gemma_target_length": 640,
|
| 39 |
+
"vae_stride": [4, 16, 16],
|
| 40 |
+
"z_dim": 48,
|
| 41 |
+
"patch_size": [1, 2, 2],
|
| 42 |
+
"data_proxy": {
|
| 43 |
+
"t_patch_size": 1,
|
| 44 |
+
"patch_size": 2,
|
| 45 |
+
"frame_receptive_field": 11,
|
| 46 |
+
"spatial_rope_interpolation": "extra",
|
| 47 |
+
"ref_audio_offset": 1000,
|
| 48 |
+
"text_offset": 0,
|
| 49 |
+
"coords_style": "v2"
|
| 50 |
+
}
|
| 51 |
+
}
|
sr/config.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_layers": 40,
|
| 3 |
+
"hidden_size": 5120,
|
| 4 |
+
"head_dim": 128,
|
| 5 |
+
"num_query_groups": 8,
|
| 6 |
+
"video_in_channels": 192,
|
| 7 |
+
"audio_in_channels": 64,
|
| 8 |
+
"text_in_channels": 3584,
|
| 9 |
+
"checkpoint_qk_layernorm_rope": false,
|
| 10 |
+
"mm_layers": [0, 1, 2, 3, 36, 37, 38, 39],
|
| 11 |
+
"local_attn_layers": [
|
| 12 |
+
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
|
| 13 |
+
16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30,
|
| 14 |
+
32, 33, 34, 35, 36, 37, 38, 39
|
| 15 |
+
],
|
| 16 |
+
"enable_attn_gating": true,
|
| 17 |
+
"gelu7_layers": [0, 1, 2, 3],
|
| 18 |
+
"post_norm_layers": []
|
| 19 |
+
}
|
text_encoder/.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
text_encoder/config.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
{
|
| 4 |
"architectures": [
|
| 5 |
"T5GemmaForConditionalGeneration"
|
| 6 |
],
|
|
|
|
|
|
|
| 7 |
"classifier_dropout_rate": 0.0,
|
| 8 |
"decoder": {
|
| 9 |
"attention_bias": false,
|
|
@@ -12,6 +12,7 @@
|
|
| 12 |
"classifier_dropout_rate": 0.0,
|
| 13 |
"cross_attention_hidden_size": 3584,
|
| 14 |
"dropout_rate": 0.0,
|
|
|
|
| 15 |
"final_logit_softcapping": 30.0,
|
| 16 |
"head_dim": 256,
|
| 17 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
@@ -72,17 +73,18 @@
|
|
| 72 |
"rms_norm_eps": 1e-06,
|
| 73 |
"rope_theta": 10000.0,
|
| 74 |
"sliding_window": 4096,
|
| 75 |
-
"torch_dtype": "bfloat16",
|
| 76 |
"use_cache": true,
|
| 77 |
"vocab_size": 256000
|
| 78 |
},
|
| 79 |
"dropout_rate": 0.0,
|
|
|
|
| 80 |
"encoder": {
|
| 81 |
"attention_bias": false,
|
| 82 |
"attention_dropout": 0.0,
|
| 83 |
"attn_logit_softcapping": 50.0,
|
| 84 |
"classifier_dropout_rate": 0.0,
|
| 85 |
"dropout_rate": 0.0,
|
|
|
|
| 86 |
"final_logit_softcapping": 30.0,
|
| 87 |
"head_dim": 256,
|
| 88 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
@@ -142,7 +144,6 @@
|
|
| 142 |
"rms_norm_eps": 1e-06,
|
| 143 |
"rope_theta": 10000.0,
|
| 144 |
"sliding_window": 4096,
|
| 145 |
-
"torch_dtype": "bfloat16",
|
| 146 |
"use_cache": true,
|
| 147 |
"vocab_size": 256000
|
| 148 |
},
|
|
@@ -154,7 +155,7 @@
|
|
| 154 |
"is_encoder_decoder": true,
|
| 155 |
"model_type": "t5gemma",
|
| 156 |
"pad_token_id": 0,
|
| 157 |
-
"
|
| 158 |
-
"
|
| 159 |
-
"
|
| 160 |
}
|
|
|
|
|
|
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
"T5GemmaForConditionalGeneration"
|
| 4 |
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 2,
|
| 7 |
"classifier_dropout_rate": 0.0,
|
| 8 |
"decoder": {
|
| 9 |
"attention_bias": false,
|
|
|
|
| 12 |
"classifier_dropout_rate": 0.0,
|
| 13 |
"cross_attention_hidden_size": 3584,
|
| 14 |
"dropout_rate": 0.0,
|
| 15 |
+
"dtype": "bfloat16",
|
| 16 |
"final_logit_softcapping": 30.0,
|
| 17 |
"head_dim": 256,
|
| 18 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
|
|
| 73 |
"rms_norm_eps": 1e-06,
|
| 74 |
"rope_theta": 10000.0,
|
| 75 |
"sliding_window": 4096,
|
|
|
|
| 76 |
"use_cache": true,
|
| 77 |
"vocab_size": 256000
|
| 78 |
},
|
| 79 |
"dropout_rate": 0.0,
|
| 80 |
+
"dtype": "bfloat16",
|
| 81 |
"encoder": {
|
| 82 |
"attention_bias": false,
|
| 83 |
"attention_dropout": 0.0,
|
| 84 |
"attn_logit_softcapping": 50.0,
|
| 85 |
"classifier_dropout_rate": 0.0,
|
| 86 |
"dropout_rate": 0.0,
|
| 87 |
+
"dtype": "bfloat16",
|
| 88 |
"final_logit_softcapping": 30.0,
|
| 89 |
"head_dim": 256,
|
| 90 |
"hidden_activation": "gelu_pytorch_tanh",
|
|
|
|
| 144 |
"rms_norm_eps": 1e-06,
|
| 145 |
"rope_theta": 10000.0,
|
| 146 |
"sliding_window": 4096,
|
|
|
|
| 147 |
"use_cache": true,
|
| 148 |
"vocab_size": 256000
|
| 149 |
},
|
|
|
|
| 155 |
"is_encoder_decoder": true,
|
| 156 |
"model_type": "t5gemma",
|
| 157 |
"pad_token_id": 0,
|
| 158 |
+
"transformers_version": "4.57.6",
|
| 159 |
+
"use_cache": true,
|
| 160 |
+
"vocab_size": 256000
|
| 161 |
}
|
transformer/config.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_layers": 40,
|
| 3 |
+
"hidden_size": 5120,
|
| 4 |
+
"head_dim": 128,
|
| 5 |
+
"num_query_groups": 8,
|
| 6 |
+
"video_in_channels": 192,
|
| 7 |
+
"audio_in_channels": 64,
|
| 8 |
+
"text_in_channels": 3584,
|
| 9 |
+
"checkpoint_qk_layernorm_rope": false,
|
| 10 |
+
"mm_layers": [0, 1, 2, 3, 36, 37, 38, 39],
|
| 11 |
+
"local_attn_layers": [],
|
| 12 |
+
"enable_attn_gating": true,
|
| 13 |
+
"gelu7_layers": [0, 1, 2, 3],
|
| 14 |
+
"post_norm_layers": []
|
| 15 |
+
}
|
vae/config.json
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"base_dim": 160,
|
| 3 |
+
"decoder_base_dim": 256,
|
| 4 |
+
"z_dim": 48,
|
| 5 |
+
"dim_mult": [
|
| 6 |
+
1,
|
| 7 |
+
2,
|
| 8 |
+
4,
|
| 9 |
+
4
|
| 10 |
+
],
|
| 11 |
+
"num_res_blocks": 2,
|
| 12 |
+
"attn_scales": [],
|
| 13 |
+
"temperal_downsample": [
|
| 14 |
+
false,
|
| 15 |
+
true,
|
| 16 |
+
true
|
| 17 |
+
],
|
| 18 |
+
"dropout": 0.0,
|
| 19 |
+
"latents_mean": [
|
| 20 |
+
-0.2289,
|
| 21 |
+
-0.0052,
|
| 22 |
+
-0.1323,
|
| 23 |
+
-0.2339,
|
| 24 |
+
-0.2799,
|
| 25 |
+
0.0174,
|
| 26 |
+
0.1838,
|
| 27 |
+
0.1557,
|
| 28 |
+
-0.1382,
|
| 29 |
+
0.0542,
|
| 30 |
+
0.2813,
|
| 31 |
+
0.0891,
|
| 32 |
+
0.157,
|
| 33 |
+
-0.0098,
|
| 34 |
+
0.0375,
|
| 35 |
+
-0.1825,
|
| 36 |
+
-0.2246,
|
| 37 |
+
-0.1207,
|
| 38 |
+
-0.0698,
|
| 39 |
+
0.5109,
|
| 40 |
+
0.2665,
|
| 41 |
+
-0.2108,
|
| 42 |
+
-0.2158,
|
| 43 |
+
0.2502,
|
| 44 |
+
-0.2055,
|
| 45 |
+
-0.0322,
|
| 46 |
+
0.1109,
|
| 47 |
+
0.1567,
|
| 48 |
+
-0.0729,
|
| 49 |
+
0.0899,
|
| 50 |
+
-0.2799,
|
| 51 |
+
-0.123,
|
| 52 |
+
-0.0313,
|
| 53 |
+
-0.1649,
|
| 54 |
+
0.0117,
|
| 55 |
+
0.0723,
|
| 56 |
+
-0.2839,
|
| 57 |
+
-0.2083,
|
| 58 |
+
-0.052,
|
| 59 |
+
0.3748,
|
| 60 |
+
0.0152,
|
| 61 |
+
0.1957,
|
| 62 |
+
0.1433,
|
| 63 |
+
-0.2944,
|
| 64 |
+
0.3573,
|
| 65 |
+
-0.0548,
|
| 66 |
+
-0.1681,
|
| 67 |
+
-0.0667
|
| 68 |
+
],
|
| 69 |
+
"latents_std": [
|
| 70 |
+
0.4765,
|
| 71 |
+
1.0364,
|
| 72 |
+
0.4514,
|
| 73 |
+
1.1677,
|
| 74 |
+
0.5313,
|
| 75 |
+
0.499,
|
| 76 |
+
0.4818,
|
| 77 |
+
0.5013,
|
| 78 |
+
0.8158,
|
| 79 |
+
1.0344,
|
| 80 |
+
0.5894,
|
| 81 |
+
1.0901,
|
| 82 |
+
0.6885,
|
| 83 |
+
0.6165,
|
| 84 |
+
0.8454,
|
| 85 |
+
0.4978,
|
| 86 |
+
0.5759,
|
| 87 |
+
0.3523,
|
| 88 |
+
0.7135,
|
| 89 |
+
0.6804,
|
| 90 |
+
0.5833,
|
| 91 |
+
1.4146,
|
| 92 |
+
0.8986,
|
| 93 |
+
0.5659,
|
| 94 |
+
0.7069,
|
| 95 |
+
0.5338,
|
| 96 |
+
0.4889,
|
| 97 |
+
0.4917,
|
| 98 |
+
0.4069,
|
| 99 |
+
0.4999,
|
| 100 |
+
0.6866,
|
| 101 |
+
0.4093,
|
| 102 |
+
0.5709,
|
| 103 |
+
0.6065,
|
| 104 |
+
0.6415,
|
| 105 |
+
0.4944,
|
| 106 |
+
0.5726,
|
| 107 |
+
1.2042,
|
| 108 |
+
0.5458,
|
| 109 |
+
1.6887,
|
| 110 |
+
0.3971,
|
| 111 |
+
1.06,
|
| 112 |
+
0.3943,
|
| 113 |
+
0.5537,
|
| 114 |
+
0.5444,
|
| 115 |
+
0.4089,
|
| 116 |
+
0.7468,
|
| 117 |
+
0.7744
|
| 118 |
+
],
|
| 119 |
+
"is_residual": true,
|
| 120 |
+
"in_channels": 12,
|
| 121 |
+
"out_channels": 12,
|
| 122 |
+
"patch_size": 2,
|
| 123 |
+
"scale_factor_temporal": 4,
|
| 124 |
+
"scale_factor_spatial": 16
|
| 125 |
+
}
|
vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62cd18f19438e35b32ac63020e2852f566e9b02f46b6cdbd87972a356e3c6f4b
|
| 3 |
+
size 2818777808
|