JoyBoy-Su princepride committed on
Commit
bb847d7
·
1 Parent(s): 6e83b2a

add-missing-configs-and-vae-safetensors (#1)

Browse files

- Add model_index, submodule configs, VAE diffusion safetensors (b0951622340ac0f56ad28b30af5950e97351df01)


Co-authored-by: Wang Zhipeng <princepride@users.noreply.huggingface.co>

audio_vae/config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_class_name": "SAAudioFeatureExtractor",
3
+ "model_type": "sa_audio_feature_extractor"
4
+ }
model_index.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "MagiHumanPipeline",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "transformer": [
5
+ "magi_human",
6
+ "DiTModel"
7
+ ],
8
+ "sr": [
9
+ "magi_human",
10
+ "DiTModel"
11
+ ],
12
+ "audio_vae": [
13
+ "magi_human",
14
+ "SAAudioFeatureExtractor"
15
+ ],
16
+ "text_encoder": [
17
+ "transformers",
18
+ "T5GemmaForCausalLM"
19
+ ],
20
+ "vae": [
21
+ "diffusers",
22
+ "AutoencoderKLWan"
23
+ ],
24
+ "fps": 25,
25
+ "num_inference_steps": 8,
26
+ "video_txt_guidance_scale": 5.0,
27
+ "audio_txt_guidance_scale": 5.0,
28
+ "sr_video_txt_guidance_scale": 3.5,
29
+ "shift": 5.0,
30
+ "cfg_number": 1,
31
+ "sr_cfg_number": 2,
32
+ "noise_value": 220,
33
+ "use_cfg_trick": true,
34
+ "cfg_trick_start_frame": 13,
35
+ "cfg_trick_value": 2.0,
36
+ "using_sde_flag": false,
37
+ "sr_audio_noise_scale": 0.7,
38
+ "t5_gemma_target_length": 640,
39
+ "vae_stride": [4, 16, 16],
40
+ "z_dim": 48,
41
+ "patch_size": [1, 2, 2],
42
+ "data_proxy": {
43
+ "t_patch_size": 1,
44
+ "patch_size": 2,
45
+ "frame_receptive_field": 11,
46
+ "spatial_rope_interpolation": "extra",
47
+ "ref_audio_offset": 1000,
48
+ "text_offset": 0,
49
+ "coords_style": "v2"
50
+ }
51
+ }
sr/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_layers": 40,
3
+ "hidden_size": 5120,
4
+ "head_dim": 128,
5
+ "num_query_groups": 8,
6
+ "video_in_channels": 192,
7
+ "audio_in_channels": 64,
8
+ "text_in_channels": 3584,
9
+ "checkpoint_qk_layernorm_rope": false,
10
+ "mm_layers": [0, 1, 2, 3, 36, 37, 38, 39],
11
+ "local_attn_layers": [
12
+ 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
13
+ 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30,
14
+ 32, 33, 34, 35, 36, 37, 38, 39
15
+ ],
16
+ "enable_attn_gating": true,
17
+ "gelu7_layers": [0, 1, 2, 3],
18
+ "post_norm_layers": []
19
+ }
text_encoder/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
text_encoder/config.json CHANGED
@@ -1,9 +1,9 @@
1
-
2
-
3
  {
4
  "architectures": [
5
  "T5GemmaForConditionalGeneration"
6
  ],
 
 
7
  "classifier_dropout_rate": 0.0,
8
  "decoder": {
9
  "attention_bias": false,
@@ -12,6 +12,7 @@
12
  "classifier_dropout_rate": 0.0,
13
  "cross_attention_hidden_size": 3584,
14
  "dropout_rate": 0.0,
 
15
  "final_logit_softcapping": 30.0,
16
  "head_dim": 256,
17
  "hidden_activation": "gelu_pytorch_tanh",
@@ -72,17 +73,18 @@
72
  "rms_norm_eps": 1e-06,
73
  "rope_theta": 10000.0,
74
  "sliding_window": 4096,
75
- "torch_dtype": "bfloat16",
76
  "use_cache": true,
77
  "vocab_size": 256000
78
  },
79
  "dropout_rate": 0.0,
 
80
  "encoder": {
81
  "attention_bias": false,
82
  "attention_dropout": 0.0,
83
  "attn_logit_softcapping": 50.0,
84
  "classifier_dropout_rate": 0.0,
85
  "dropout_rate": 0.0,
 
86
  "final_logit_softcapping": 30.0,
87
  "head_dim": 256,
88
  "hidden_activation": "gelu_pytorch_tanh",
@@ -142,7 +144,6 @@
142
  "rms_norm_eps": 1e-06,
143
  "rope_theta": 10000.0,
144
  "sliding_window": 4096,
145
- "torch_dtype": "bfloat16",
146
  "use_cache": true,
147
  "vocab_size": 256000
148
  },
@@ -154,7 +155,7 @@
154
  "is_encoder_decoder": true,
155
  "model_type": "t5gemma",
156
  "pad_token_id": 0,
157
- "torch_dtype": "bfloat16",
158
- "transformers_version": "4.53.0.dev0",
159
- "use_cache": true
160
  }
 
 
 
1
  {
2
  "architectures": [
3
  "T5GemmaForConditionalGeneration"
4
  ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 2,
7
  "classifier_dropout_rate": 0.0,
8
  "decoder": {
9
  "attention_bias": false,
 
12
  "classifier_dropout_rate": 0.0,
13
  "cross_attention_hidden_size": 3584,
14
  "dropout_rate": 0.0,
15
+ "dtype": "bfloat16",
16
  "final_logit_softcapping": 30.0,
17
  "head_dim": 256,
18
  "hidden_activation": "gelu_pytorch_tanh",
 
73
  "rms_norm_eps": 1e-06,
74
  "rope_theta": 10000.0,
75
  "sliding_window": 4096,
 
76
  "use_cache": true,
77
  "vocab_size": 256000
78
  },
79
  "dropout_rate": 0.0,
80
+ "dtype": "bfloat16",
81
  "encoder": {
82
  "attention_bias": false,
83
  "attention_dropout": 0.0,
84
  "attn_logit_softcapping": 50.0,
85
  "classifier_dropout_rate": 0.0,
86
  "dropout_rate": 0.0,
87
+ "dtype": "bfloat16",
88
  "final_logit_softcapping": 30.0,
89
  "head_dim": 256,
90
  "hidden_activation": "gelu_pytorch_tanh",
 
144
  "rms_norm_eps": 1e-06,
145
  "rope_theta": 10000.0,
146
  "sliding_window": 4096,
 
147
  "use_cache": true,
148
  "vocab_size": 256000
149
  },
 
155
  "is_encoder_decoder": true,
156
  "model_type": "t5gemma",
157
  "pad_token_id": 0,
158
+ "transformers_version": "4.57.6",
159
+ "use_cache": true,
160
+ "vocab_size": 256000
161
  }
transformer/config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_layers": 40,
3
+ "hidden_size": 5120,
4
+ "head_dim": 128,
5
+ "num_query_groups": 8,
6
+ "video_in_channels": 192,
7
+ "audio_in_channels": 64,
8
+ "text_in_channels": 3584,
9
+ "checkpoint_qk_layernorm_rope": false,
10
+ "mm_layers": [0, 1, 2, 3, 36, 37, 38, 39],
11
+ "local_attn_layers": [],
12
+ "enable_attn_gating": true,
13
+ "gelu7_layers": [0, 1, 2, 3],
14
+ "post_norm_layers": []
15
+ }
vae/config.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "base_dim": 160,
3
+ "decoder_base_dim": 256,
4
+ "z_dim": 48,
5
+ "dim_mult": [
6
+ 1,
7
+ 2,
8
+ 4,
9
+ 4
10
+ ],
11
+ "num_res_blocks": 2,
12
+ "attn_scales": [],
13
+ "temperal_downsample": [
14
+ false,
15
+ true,
16
+ true
17
+ ],
18
+ "dropout": 0.0,
19
+ "latents_mean": [
20
+ -0.2289,
21
+ -0.0052,
22
+ -0.1323,
23
+ -0.2339,
24
+ -0.2799,
25
+ 0.0174,
26
+ 0.1838,
27
+ 0.1557,
28
+ -0.1382,
29
+ 0.0542,
30
+ 0.2813,
31
+ 0.0891,
32
+ 0.157,
33
+ -0.0098,
34
+ 0.0375,
35
+ -0.1825,
36
+ -0.2246,
37
+ -0.1207,
38
+ -0.0698,
39
+ 0.5109,
40
+ 0.2665,
41
+ -0.2108,
42
+ -0.2158,
43
+ 0.2502,
44
+ -0.2055,
45
+ -0.0322,
46
+ 0.1109,
47
+ 0.1567,
48
+ -0.0729,
49
+ 0.0899,
50
+ -0.2799,
51
+ -0.123,
52
+ -0.0313,
53
+ -0.1649,
54
+ 0.0117,
55
+ 0.0723,
56
+ -0.2839,
57
+ -0.2083,
58
+ -0.052,
59
+ 0.3748,
60
+ 0.0152,
61
+ 0.1957,
62
+ 0.1433,
63
+ -0.2944,
64
+ 0.3573,
65
+ -0.0548,
66
+ -0.1681,
67
+ -0.0667
68
+ ],
69
+ "latents_std": [
70
+ 0.4765,
71
+ 1.0364,
72
+ 0.4514,
73
+ 1.1677,
74
+ 0.5313,
75
+ 0.499,
76
+ 0.4818,
77
+ 0.5013,
78
+ 0.8158,
79
+ 1.0344,
80
+ 0.5894,
81
+ 1.0901,
82
+ 0.6885,
83
+ 0.6165,
84
+ 0.8454,
85
+ 0.4978,
86
+ 0.5759,
87
+ 0.3523,
88
+ 0.7135,
89
+ 0.6804,
90
+ 0.5833,
91
+ 1.4146,
92
+ 0.8986,
93
+ 0.5659,
94
+ 0.7069,
95
+ 0.5338,
96
+ 0.4889,
97
+ 0.4917,
98
+ 0.4069,
99
+ 0.4999,
100
+ 0.6866,
101
+ 0.4093,
102
+ 0.5709,
103
+ 0.6065,
104
+ 0.6415,
105
+ 0.4944,
106
+ 0.5726,
107
+ 1.2042,
108
+ 0.5458,
109
+ 1.6887,
110
+ 0.3971,
111
+ 1.06,
112
+ 0.3943,
113
+ 0.5537,
114
+ 0.5444,
115
+ 0.4089,
116
+ 0.7468,
117
+ 0.7744
118
+ ],
119
+ "is_residual": true,
120
+ "in_channels": 12,
121
+ "out_channels": 12,
122
+ "patch_size": 2,
123
+ "scale_factor_temporal": 4,
124
+ "scale_factor_spatial": 16
125
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62cd18f19438e35b32ac63020e2852f566e9b02f46b6cdbd87972a356e3c6f4b
3
+ size 2818777808