euijinrnd commited on
Commit
69bcc9f
·
verified ·
1 Parent(s): d15f675

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "jellyho/TwinVLA-FT-50k",
3
+ "action_dim": 10,
4
+ "action_head": "DiT",
5
+ "action_len": 20,
6
+ "architectures": [
7
+ "Eagle2_1BTwinVLA"
8
+ ],
9
+ "denoiser": "FM",
10
+ "global_normalization": false,
11
+ "hz_interpolate": 20,
12
+ "interpolate_gripper": false,
13
+ "knowledge_insulation": false,
14
+ "model_path": null,
15
+ "model_type": "Eagle2_1BTwinVLA",
16
+ "modeling": "denoising",
17
+ "normalization": "quantile",
18
+ "num_readouts": 1,
19
+ "readout_token_as_eos": false,
20
+ "share_decoder": true,
21
+ "share_embed_tokens": true,
22
+ "share_vision": true,
23
+ "singlevla_config": {
24
+ "_attn_implementation_autoset": false,
25
+ "_attn_implementation_internal": null,
26
+ "_commit_hash": null,
27
+ "_name_or_path": "/home/jellyho/.cache/huggingface/hub/models--jellyho--TwinVLA-FT-50k/snapshots/85c86f3205c61063327aae4ef6dd5e6bce8fa093/singlevla_config/config.json",
28
+ "action_dim": 10,
29
+ "action_head": "DiT",
30
+ "action_head_hidden_dim": 1024,
31
+ "action_len": 20,
32
+ "add_cross_attention": false,
33
+ "aggregation": "false",
34
+ "architectures": [
35
+ "Eagle2_1BVLA"
36
+ ],
37
+ "auto_map": {},
38
+ "bad_words_ids": null,
39
+ "begin_suppress_tokens": null,
40
+ "bos_token_id": null,
41
+ "chunk_size_feed_forward": 0,
42
+ "cross_attention_hidden_size": null,
43
+ "dataset_statistics_path": null,
44
+ "decoder_start_token_id": null,
45
+ "denoiser": "FM",
46
+ "diffusion_batch": 32,
47
+ "dit_size": "DiT-B",
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "downsample_ratio": 0.5,
51
+ "dynamic_image_size": true,
52
+ "early_stopping": false,
53
+ "efficient_loss": true,
54
+ "enable_cfg": true,
55
+ "encoder_no_repeat_ngram_size": 0,
56
+ "eos_token_id": null,
57
+ "exponential_decay_length_penalty": null,
58
+ "finetuning_task": null,
59
+ "force_image_size": 448,
60
+ "forced_bos_token_id": null,
61
+ "forced_eos_token_id": null,
62
+ "global_normalization": true,
63
+ "hz_interpolate": 20,
64
+ "id2label": {
65
+ "0": "LABEL_0",
66
+ "1": "LABEL_1"
67
+ },
68
+ "image_size": 224,
69
+ "interpolate_gripper": false,
70
+ "is_decoder": false,
71
+ "is_encoder_decoder": false,
72
+ "keep_aspect_ratio": false,
73
+ "knowledge_insulation": false,
74
+ "label2id": {
75
+ "LABEL_0": 0,
76
+ "LABEL_1": 1
77
+ },
78
+ "length_penalty": 1.0,
79
+ "llm_config": {
80
+ "_attn_implementation_autoset": true,
81
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
82
+ "add_cross_attention": false,
83
+ "architectures": [
84
+ "Qwen2ForCausalLM"
85
+ ],
86
+ "attention_dropout": 0.0,
87
+ "auto_map": {
88
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
89
+ "AutoModel": "modeling_qwen2.Qwen2Model",
90
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
91
+ },
92
+ "bad_words_ids": null,
93
+ "begin_suppress_tokens": null,
94
+ "bos_token_id": 151643,
95
+ "chunk_size_feed_forward": 0,
96
+ "cross_attention_hidden_size": null,
97
+ "decoder_start_token_id": null,
98
+ "diversity_penalty": 0.0,
99
+ "do_sample": false,
100
+ "early_stopping": false,
101
+ "encoder_no_repeat_ngram_size": 0,
102
+ "eos_token_id": 151645,
103
+ "exponential_decay_length_penalty": null,
104
+ "finetuning_task": null,
105
+ "forced_bos_token_id": null,
106
+ "forced_eos_token_id": null,
107
+ "hidden_act": "silu",
108
+ "hidden_size": 896,
109
+ "id2label": {
110
+ "0": "LABEL_0",
111
+ "1": "LABEL_1"
112
+ },
113
+ "initializer_range": 0.02,
114
+ "intermediate_size": 4864,
115
+ "is_decoder": false,
116
+ "is_encoder_decoder": false,
117
+ "label2id": {
118
+ "LABEL_0": 0,
119
+ "LABEL_1": 1
120
+ },
121
+ "length_penalty": 1.0,
122
+ "max_length": 20,
123
+ "max_position_embeddings": 32768,
124
+ "max_window_layers": 21,
125
+ "min_length": 0,
126
+ "model_type": "qwen2",
127
+ "no_repeat_ngram_size": 0,
128
+ "num_attention_heads": 14,
129
+ "num_beam_groups": 1,
130
+ "num_beams": 1,
131
+ "num_hidden_layers": 24,
132
+ "num_key_value_heads": 2,
133
+ "num_return_sequences": 1,
134
+ "output_attentions": false,
135
+ "output_hidden_states": false,
136
+ "output_scores": false,
137
+ "pad_token_id": null,
138
+ "prefix": null,
139
+ "problem_type": null,
140
+ "pruned_heads": {},
141
+ "remove_invalid_values": false,
142
+ "repetition_penalty": 1.0,
143
+ "return_dict": true,
144
+ "return_dict_in_generate": false,
145
+ "rms_norm_eps": 1e-06,
146
+ "rope_scaling": null,
147
+ "rope_theta": 1000000.0,
148
+ "sep_token_id": null,
149
+ "sliding_window": 32768,
150
+ "suppress_tokens": null,
151
+ "task_specific_params": null,
152
+ "temperature": 1.0,
153
+ "tf_legacy_loss": false,
154
+ "tie_encoder_decoder": false,
155
+ "tie_word_embeddings": true,
156
+ "tokenizer_class": null,
157
+ "top_k": 50,
158
+ "top_p": 1.0,
159
+ "torch_dtype": "bfloat16",
160
+ "torchscript": false,
161
+ "transformers_version": "4.50.0.dev0",
162
+ "typical_p": 1.0,
163
+ "use_bfloat16": false,
164
+ "use_cache": false,
165
+ "use_sliding_window": false,
166
+ "vocab_size": 151674
167
+ },
168
+ "loss_version": "v4",
169
+ "max_dynamic_patch": 12,
170
+ "max_length": 20,
171
+ "min_dynamic_patch": 1,
172
+ "min_length": 0,
173
+ "mlp_checkpoint": true,
174
+ "model_path": "nvidia/Eagle2-1B",
175
+ "model_type": "Eagle2_1BVLA",
176
+ "modeling": "denoising",
177
+ "no_repeat_ngram_size": 0,
178
+ "normalization": "quantile",
179
+ "num_beam_groups": 1,
180
+ "num_beams": 1,
181
+ "num_readouts": 1,
182
+ "num_return_sequences": 1,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "output_scores": false,
186
+ "pad2square": false,
187
+ "pad_token_id": null,
188
+ "pre_feature_reduction": false,
189
+ "prefix": null,
190
+ "problem_type": null,
191
+ "pruned_heads": {},
192
+ "ps_version": "v2",
193
+ "readout_token_as_eos": false,
194
+ "remove_invalid_values": false,
195
+ "repetition_penalty": 1.0,
196
+ "return_dict": true,
197
+ "return_dict_in_generate": false,
198
+ "return_text": null,
199
+ "select_layer": -1,
200
+ "sep_token_id": null,
201
+ "state_dim": 10,
202
+ "stopping_token": "|",
203
+ "suppress_tokens": null,
204
+ "task_specific_params": null,
205
+ "temperature": 1.0,
206
+ "template": "qwen2-chat",
207
+ "test_denoising_steps": 10,
208
+ "tf_legacy_loss": false,
209
+ "tie_encoder_decoder": false,
210
+ "tie_word_embeddings": true,
211
+ "tokenizer_class": null,
212
+ "top_k": 50,
213
+ "top_p": 1.0,
214
+ "torch_dtype": "bfloat16",
215
+ "torchscript": false,
216
+ "train_denoising_steps": 100,
217
+ "typical_p": 1.0,
218
+ "use_backbone_lora": 0,
219
+ "use_bfloat16": false,
220
+ "use_llm_lora": 0,
221
+ "use_thumbnail": true,
222
+ "vision_config": {
223
+ "_attn_implementation_autoset": true,
224
+ "_name_or_path": "",
225
+ "add_cross_attention": false,
226
+ "architectures": [
227
+ "SiglipVisionModel"
228
+ ],
229
+ "attention_dropout": 0.0,
230
+ "auto_map": {
231
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
232
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
233
+ },
234
+ "bad_words_ids": null,
235
+ "begin_suppress_tokens": null,
236
+ "bos_token_id": null,
237
+ "chunk_size_feed_forward": 0,
238
+ "cross_attention_hidden_size": null,
239
+ "decoder_start_token_id": null,
240
+ "diversity_penalty": 0.0,
241
+ "do_sample": false,
242
+ "drop_path_rate": 0.1,
243
+ "early_stopping": false,
244
+ "encoder_no_repeat_ngram_size": 0,
245
+ "eos_token_id": null,
246
+ "exponential_decay_length_penalty": null,
247
+ "finetuning_task": null,
248
+ "forced_bos_token_id": null,
249
+ "forced_eos_token_id": null,
250
+ "hidden_act": "gelu_pytorch_tanh",
251
+ "hidden_size": 1152,
252
+ "id2label": {
253
+ "0": "LABEL_0",
254
+ "1": "LABEL_1"
255
+ },
256
+ "image_size": 448,
257
+ "intermediate_size": 4304,
258
+ "is_decoder": false,
259
+ "is_encoder_decoder": false,
260
+ "label2id": {
261
+ "LABEL_0": 0,
262
+ "LABEL_1": 1
263
+ },
264
+ "layer_norm_eps": 1e-06,
265
+ "length_penalty": 1.0,
266
+ "max_length": 20,
267
+ "min_length": 0,
268
+ "model_type": "siglip_vision_model",
269
+ "no_repeat_ngram_size": 0,
270
+ "num_attention_heads": 16,
271
+ "num_beam_groups": 1,
272
+ "num_beams": 1,
273
+ "num_channels": 3,
274
+ "num_hidden_layers": 27,
275
+ "num_image_tokens": 1024,
276
+ "num_return_sequences": 1,
277
+ "output_attentions": false,
278
+ "output_hidden_states": false,
279
+ "output_scores": false,
280
+ "pad_token_id": null,
281
+ "patch_size": 14,
282
+ "prefix": null,
283
+ "problem_type": null,
284
+ "projection_dim": 2048,
285
+ "projector_hidden_act": "gelu_fast",
286
+ "pruned_heads": {},
287
+ "remove_invalid_values": false,
288
+ "repetition_penalty": 1.0,
289
+ "return_dict": true,
290
+ "return_dict_in_generate": false,
291
+ "sep_token_id": null,
292
+ "suppress_tokens": null,
293
+ "task_specific_params": null,
294
+ "temperature": 1.0,
295
+ "tf_legacy_loss": false,
296
+ "tie_encoder_decoder": false,
297
+ "tie_word_embeddings": true,
298
+ "tokenizer_class": null,
299
+ "top_k": 50,
300
+ "top_p": 1.0,
301
+ "torch_dtype": "bfloat16",
302
+ "torchscript": false,
303
+ "transformers_version": "4.50.0.dev0",
304
+ "typical_p": 1.0,
305
+ "use_bfloat16": false,
306
+ "vision_use_head": false
307
+ },
308
+ "vocab_size": 151674,
309
+ "vocab_start": null
310
+ },
311
+ "singlevla_config_path": "/home/jellyho/.cache/huggingface/hub/models--jellyho--TwinVLA-FT-50k/snapshots/85c86f3205c61063327aae4ef6dd5e6bce8fa093/singlevla_config/config.json",
312
+ "singlevla_pretrained_path": null,
313
+ "state_dim": 10,
314
+ "torch_dtype": "bfloat16",
315
+ "transformers_version": "4.50.0.dev0"
316
+ }
dataset_statistics.json ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aloha_handover_box": {
3
+ "action": {
4
+ "mean": [
5
+ 0.322445809841156,
6
+ -0.07136808335781097,
7
+ 0.16350804269313812,
8
+ 0.6155836582183838,
9
+ 0.025826361030340195,
10
+ -0.6291796565055847,
11
+ -0.12873482704162598,
12
+ 0.9684553742408752,
13
+ -0.05324767157435417,
14
+ 0.32048356533050537,
15
+ 0.3553526699542999,
16
+ -0.017269087955355644,
17
+ 0.25354716181755066,
18
+ 0.9467635154724121,
19
+ -0.10934817045927048,
20
+ -0.16364239156246185,
21
+ 0.10808151960372925,
22
+ 0.9652227759361267,
23
+ -0.08225584030151367,
24
+ 0.6809535622596741
25
+ ],
26
+ "std": [
27
+ 0.07454723119735718,
28
+ 0.08869021385908127,
29
+ 0.07996607571840286,
30
+ 0.33460575342178345,
31
+ 0.19985812902450562,
32
+ 0.26947152614593506,
33
+ 0.12514586746692657,
34
+ 0.030899910256266594,
35
+ 0.16146336495876312,
36
+ 0.9473028779029846,
37
+ 0.06487792730331421,
38
+ 0.03894990682601929,
39
+ 0.027652453631162643,
40
+ 0.10490529984235764,
41
+ 0.1838434785604477,
42
+ 0.14178059995174408,
43
+ 0.2033252716064453,
44
+ 0.06656672805547714,
45
+ 0.064219169318676,
46
+ 0.732288122177124
47
+ ],
48
+ "max": [
49
+ 0.48683926463127136,
50
+ 0.0484432689845562,
51
+ 0.31490612030029297,
52
+ 0.99891197681427,
53
+ 0.4277522563934326,
54
+ 0.06322141736745834,
55
+ 0.4004654884338379,
56
+ 0.9999857544898987,
57
+ 0.3100079298019409,
58
+ 1.0,
59
+ 0.5334027409553528,
60
+ 0.08494444936513901,
61
+ 0.36568865180015564,
62
+ 0.9999882578849792,
63
+ 0.2546274662017822,
64
+ 0.1172015443444252,
65
+ 0.7982608079910278,
66
+ 0.9999992251396179,
67
+ 0.20094169676303864,
68
+ 1.0
69
+ ],
70
+ "min": [
71
+ 0.1422317922115326,
72
+ -0.2763901352882385,
73
+ -0.0600760243833065,
74
+ -0.14848311245441437,
75
+ -0.6282482743263245,
76
+ -0.9999129176139832,
77
+ -0.42181891202926636,
78
+ 0.7404066324234009,
79
+ -0.6676974296569824,
80
+ -1.0,
81
+ 0.1786160171031952,
82
+ -0.1845615804195404,
83
+ 0.1687021553516388,
84
+ 0.2762398421764374,
85
+ -0.7479667067527771,
86
+ -0.8485982418060303,
87
+ -0.2597721517086029,
88
+ 0.6015138626098633,
89
+ -0.3933228552341461,
90
+ -1.0
91
+ ],
92
+ "q01": [
93
+ 0.1950138956308365,
94
+ -0.24691226959228516,
95
+ -0.015285035967826844,
96
+ -0.04555398792028427,
97
+ -0.4452396559715271,
98
+ -0.996303243637085,
99
+ -0.3760478734970093,
100
+ 0.8516808867454528,
101
+ -0.46342918753623963,
102
+ -1.0,
103
+ 0.21926841557025908,
104
+ -0.1317625629901886,
105
+ 0.1978745412826538,
106
+ 0.5117229986190795,
107
+ -0.6376786828041077,
108
+ -0.6609986042976379,
109
+ -0.19099083304405212,
110
+ 0.6930621123313904,
111
+ -0.2356126993894577,
112
+ -1.0
113
+ ],
114
+ "q99": [
115
+ 0.47150796771049497,
116
+ 0.038070930540561675,
117
+ 0.28182336688041676,
118
+ 0.9817836880683899,
119
+ 0.3871919810771942,
120
+ -0.1345064049959186,
121
+ 0.20285944879054985,
122
+ 0.9992118668556214,
123
+ 0.2293877118825912,
124
+ 1.0,
125
+ 0.49810330152511595,
126
+ 0.0599309906363487,
127
+ 0.3309180569648742,
128
+ 0.9995350050926208,
129
+ 0.1829529863595952,
130
+ 0.03216676786541939,
131
+ 0.7132800936698909,
132
+ 0.9997488117218017,
133
+ 0.08941484957933345,
134
+ 1.0
135
+ ],
136
+ "mask": [
137
+ true,
138
+ true,
139
+ true,
140
+ false,
141
+ false,
142
+ false,
143
+ false,
144
+ false,
145
+ false,
146
+ false,
147
+ true,
148
+ true,
149
+ true,
150
+ false,
151
+ false,
152
+ false,
153
+ false,
154
+ false,
155
+ false,
156
+ false
157
+ ]
158
+ },
159
+ "proprio": {
160
+ "mean": [
161
+ 0.3199884295463562,
162
+ -0.06322039663791656,
163
+ 0.15364451706409454,
164
+ 0.6062000393867493,
165
+ 0.06611454486846924,
166
+ -0.6415385603904724,
167
+ -0.09341777116060257,
168
+ 0.9627954959869385,
169
+ 0.031210733577609062,
170
+ 0.3975388705730438,
171
+ 0.3515152037143707,
172
+ -0.018268277868628502,
173
+ 0.2389151155948639,
174
+ 0.9404945969581604,
175
+ -0.11243173480033875,
176
+ -0.20223592221736908,
177
+ 0.10952325910329819,
178
+ 0.9571213722229004,
179
+ -0.07971788197755814,
180
+ 0.5135871171951294
181
+ ],
182
+ "std": [
183
+ 0.07216393202543259,
184
+ 0.07979436963796616,
185
+ 0.08181631565093994,
186
+ 0.3255034387111664,
187
+ 0.15539786219596863,
188
+ 0.2940728962421417,
189
+ 0.1470193713903427,
190
+ 0.0702429711818695,
191
+ 0.19176805019378662,
192
+ 0.34666576981544495,
193
+ 0.06400732696056366,
194
+ 0.03745417669415474,
195
+ 0.032076891511678696,
196
+ 0.10509590804576874,
197
+ 0.17774498462677002,
198
+ 0.1389302909374237,
199
+ 0.20042844116687775,
200
+ 0.08065077662467957,
201
+ 0.13744565844535828,
202
+ 0.2912951111793518
203
+ ],
204
+ "max": [
205
+ 0.4756927788257599,
206
+ 0.08933446556329727,
207
+ 0.374162495136261,
208
+ 0.9999960064888,
209
+ 0.6316872835159302,
210
+ 0.4776941239833832,
211
+ 0.5646718740463257,
212
+ 1.0,
213
+ 0.9942742586135864,
214
+ 0.9009114503860474,
215
+ 0.525163471698761,
216
+ 0.07913362979888916,
217
+ 0.41776642203330994,
218
+ 0.9999624490737915,
219
+ 0.22881703078746796,
220
+ 0.25566166639328003,
221
+ 0.7931073904037476,
222
+ 1.0,
223
+ 0.8460071086883545,
224
+ 0.7628055810928345
225
+ ],
226
+ "min": [
227
+ 0.15654076635837555,
228
+ -0.24968452751636505,
229
+ -0.005626573693007231,
230
+ -0.12249734997749329,
231
+ -0.7090452313423157,
232
+ -1.0,
233
+ -0.8474201560020447,
234
+ -0.10166153311729431,
235
+ -0.7077121138572693,
236
+ -0.8731642365455627,
237
+ 0.18484462797641754,
238
+ -0.16924390196800232,
239
+ 0.1543029397726059,
240
+ 0.2589649260044098,
241
+ -0.7204840183258057,
242
+ -0.8606683015823364,
243
+ -0.24437052011489868,
244
+ 0.35231029987335205,
245
+ -0.9356772899627686,
246
+ -0.8731642365455627
247
+ ],
248
+ "q01": [
249
+ 0.19023896217346192,
250
+ -0.22257453262805937,
251
+ -0.0035876664519309998,
252
+ -0.01533471692353487,
253
+ -0.2564712131023407,
254
+ -0.9997995805740356,
255
+ -0.35406238079071045,
256
+ 0.6182524561882019,
257
+ -0.3273082673549652,
258
+ -0.11072122812271118,
259
+ 0.2202826726436615,
260
+ -0.12777820765972137,
261
+ 0.17903109550476073,
262
+ 0.48722490668296814,
263
+ -0.6251857328414917,
264
+ -0.6860081076622009,
265
+ -0.17821017920970916,
266
+ 0.6367788648605347,
267
+ -0.562646164894104,
268
+ -0.22518292188644407
269
+ ],
270
+ "q99": [
271
+ 0.4643557775020599,
272
+ 0.0528511093556879,
273
+ 0.3235322630405426,
274
+ 0.9950041770935059,
275
+ 0.42506823778152464,
276
+ 0.17489826560020202,
277
+ 0.36311201095581025,
278
+ 0.9999364614486694,
279
+ 0.7507713937759389,
280
+ 0.7889349985122672,
281
+ 0.49394992589950554,
282
+ 0.051381032466888375,
283
+ 0.33517178773879985,
284
+ 0.999138171672821,
285
+ 0.16684073746204348,
286
+ 0.055357125997543326,
287
+ 0.7065780282020567,
288
+ 0.999832546710968,
289
+ 0.513761196136471,
290
+ 0.6879761123657224
291
+ ],
292
+ "mask": [
293
+ true,
294
+ true,
295
+ true,
296
+ false,
297
+ false,
298
+ false,
299
+ false,
300
+ false,
301
+ false,
302
+ false,
303
+ true,
304
+ true,
305
+ true,
306
+ false,
307
+ false,
308
+ false,
309
+ false,
310
+ false,
311
+ false,
312
+ false
313
+ ]
314
+ },
315
+ "num_transitions": 11829,
316
+ "num_trajectories": 50
317
+ }
318
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec89f703d0b817e56b17386fc63e774c105c317c3d45a93552d54bc348468407
3
+ size 2889539864
singlevla_config/config.json ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "/home/jellyho/.cache/huggingface/hub/models--jellyho--TwinVLA-FT-50k/snapshots/85c86f3205c61063327aae4ef6dd5e6bce8fa093/singlevla_config/config.json",
4
+ "action_dim": 10,
5
+ "action_head": "DiT",
6
+ "action_head_hidden_dim": 1024,
7
+ "action_len": 20,
8
+ "aggregation": "false",
9
+ "architectures": [
10
+ "Eagle2_1BVLA"
11
+ ],
12
+ "auto_map": {},
13
+ "dataset_statistics_path": null,
14
+ "denoiser": "FM",
15
+ "diffusion_batch": 32,
16
+ "dit_size": "DiT-B",
17
+ "downsample_ratio": 0.5,
18
+ "dynamic_image_size": true,
19
+ "efficient_loss": true,
20
+ "enable_cfg": true,
21
+ "force_image_size": 448,
22
+ "global_normalization": true,
23
+ "hz_interpolate": 20,
24
+ "image_size": 224,
25
+ "interpolate_gripper": false,
26
+ "keep_aspect_ratio": false,
27
+ "knowledge_insulation": false,
28
+ "llm_config": {
29
+ "_attn_implementation_autoset": true,
30
+ "_name_or_path": "./pretrained/Qwen2_5-0_5B-Instruct",
31
+ "add_cross_attention": false,
32
+ "architectures": [
33
+ "Qwen2ForCausalLM"
34
+ ],
35
+ "attention_dropout": 0.0,
36
+ "auto_map": {
37
+ "AutoConfig": "configuration_qwen2.Qwen2Config",
38
+ "AutoModel": "modeling_qwen2.Qwen2Model",
39
+ "AutoModelForCausalLM": "modeling_qwen2.Qwen2ForCausalLM"
40
+ },
41
+ "bad_words_ids": null,
42
+ "begin_suppress_tokens": null,
43
+ "bos_token_id": 151643,
44
+ "chunk_size_feed_forward": 0,
45
+ "cross_attention_hidden_size": null,
46
+ "decoder_start_token_id": null,
47
+ "diversity_penalty": 0.0,
48
+ "do_sample": false,
49
+ "early_stopping": false,
50
+ "encoder_no_repeat_ngram_size": 0,
51
+ "eos_token_id": 151645,
52
+ "exponential_decay_length_penalty": null,
53
+ "finetuning_task": null,
54
+ "forced_bos_token_id": null,
55
+ "forced_eos_token_id": null,
56
+ "hidden_act": "silu",
57
+ "hidden_size": 896,
58
+ "id2label": {
59
+ "0": "LABEL_0",
60
+ "1": "LABEL_1"
61
+ },
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 4864,
64
+ "is_decoder": false,
65
+ "is_encoder_decoder": false,
66
+ "label2id": {
67
+ "LABEL_0": 0,
68
+ "LABEL_1": 1
69
+ },
70
+ "length_penalty": 1.0,
71
+ "max_length": 20,
72
+ "max_position_embeddings": 32768,
73
+ "max_window_layers": 21,
74
+ "min_length": 0,
75
+ "model_type": "qwen2",
76
+ "no_repeat_ngram_size": 0,
77
+ "num_attention_heads": 14,
78
+ "num_beam_groups": 1,
79
+ "num_beams": 1,
80
+ "num_hidden_layers": 24,
81
+ "num_key_value_heads": 2,
82
+ "num_return_sequences": 1,
83
+ "output_attentions": false,
84
+ "output_hidden_states": false,
85
+ "output_scores": false,
86
+ "pad_token_id": null,
87
+ "prefix": null,
88
+ "problem_type": null,
89
+ "pruned_heads": {},
90
+ "remove_invalid_values": false,
91
+ "repetition_penalty": 1.0,
92
+ "return_dict": true,
93
+ "return_dict_in_generate": false,
94
+ "rms_norm_eps": 1e-06,
95
+ "rope_scaling": null,
96
+ "rope_theta": 1000000.0,
97
+ "sep_token_id": null,
98
+ "sliding_window": 32768,
99
+ "suppress_tokens": null,
100
+ "task_specific_params": null,
101
+ "temperature": 1.0,
102
+ "tf_legacy_loss": false,
103
+ "tie_encoder_decoder": false,
104
+ "tie_word_embeddings": true,
105
+ "tokenizer_class": null,
106
+ "top_k": 50,
107
+ "top_p": 1.0,
108
+ "torch_dtype": "bfloat16",
109
+ "torchscript": false,
110
+ "transformers_version": "4.50.0.dev0",
111
+ "typical_p": 1.0,
112
+ "use_bfloat16": false,
113
+ "use_cache": false,
114
+ "use_sliding_window": false,
115
+ "vocab_size": 151674
116
+ },
117
+ "loss_version": "v4",
118
+ "max_dynamic_patch": 12,
119
+ "min_dynamic_patch": 1,
120
+ "mlp_checkpoint": true,
121
+ "model_path": "nvidia/Eagle2-1B",
122
+ "model_type": "Eagle2_1BVLA",
123
+ "modeling": "denoising",
124
+ "normalization": "quantile",
125
+ "num_readouts": 1,
126
+ "pad2square": false,
127
+ "pre_feature_reduction": false,
128
+ "ps_version": "v2",
129
+ "readout_token_as_eos": false,
130
+ "return_text": null,
131
+ "select_layer": -1,
132
+ "state_dim": 10,
133
+ "stopping_token": "|",
134
+ "template": "qwen2-chat",
135
+ "test_denoising_steps": 10,
136
+ "torch_dtype": "bfloat16",
137
+ "train_denoising_steps": 100,
138
+ "transformers_version": null,
139
+ "use_backbone_lora": 0,
140
+ "use_llm_lora": 0,
141
+ "use_thumbnail": true,
142
+ "vision_config": {
143
+ "_attn_implementation_autoset": true,
144
+ "_name_or_path": "",
145
+ "add_cross_attention": false,
146
+ "architectures": [
147
+ "SiglipVisionModel"
148
+ ],
149
+ "attention_dropout": 0.0,
150
+ "auto_map": {
151
+ "AutoConfig": "configuration_siglip.SiglipVisionConfig",
152
+ "AutoModel": "modeling_siglip.SiglipVisionModel"
153
+ },
154
+ "bad_words_ids": null,
155
+ "begin_suppress_tokens": null,
156
+ "bos_token_id": null,
157
+ "chunk_size_feed_forward": 0,
158
+ "cross_attention_hidden_size": null,
159
+ "decoder_start_token_id": null,
160
+ "diversity_penalty": 0.0,
161
+ "do_sample": false,
162
+ "drop_path_rate": 0.1,
163
+ "early_stopping": false,
164
+ "encoder_no_repeat_ngram_size": 0,
165
+ "eos_token_id": null,
166
+ "exponential_decay_length_penalty": null,
167
+ "finetuning_task": null,
168
+ "forced_bos_token_id": null,
169
+ "forced_eos_token_id": null,
170
+ "hidden_act": "gelu_pytorch_tanh",
171
+ "hidden_size": 1152,
172
+ "id2label": {
173
+ "0": "LABEL_0",
174
+ "1": "LABEL_1"
175
+ },
176
+ "image_size": 448,
177
+ "intermediate_size": 4304,
178
+ "is_decoder": false,
179
+ "is_encoder_decoder": false,
180
+ "label2id": {
181
+ "LABEL_0": 0,
182
+ "LABEL_1": 1
183
+ },
184
+ "layer_norm_eps": 1e-06,
185
+ "length_penalty": 1.0,
186
+ "max_length": 20,
187
+ "min_length": 0,
188
+ "model_type": "siglip_vision_model",
189
+ "no_repeat_ngram_size": 0,
190
+ "num_attention_heads": 16,
191
+ "num_beam_groups": 1,
192
+ "num_beams": 1,
193
+ "num_channels": 3,
194
+ "num_hidden_layers": 27,
195
+ "num_image_tokens": 1024,
196
+ "num_return_sequences": 1,
197
+ "output_attentions": false,
198
+ "output_hidden_states": false,
199
+ "output_scores": false,
200
+ "pad_token_id": null,
201
+ "patch_size": 14,
202
+ "prefix": null,
203
+ "problem_type": null,
204
+ "projection_dim": 2048,
205
+ "projector_hidden_act": "gelu_fast",
206
+ "pruned_heads": {},
207
+ "remove_invalid_values": false,
208
+ "repetition_penalty": 1.0,
209
+ "return_dict": true,
210
+ "return_dict_in_generate": false,
211
+ "sep_token_id": null,
212
+ "suppress_tokens": null,
213
+ "task_specific_params": null,
214
+ "temperature": 1.0,
215
+ "tf_legacy_loss": false,
216
+ "tie_encoder_decoder": false,
217
+ "tie_word_embeddings": true,
218
+ "tokenizer_class": null,
219
+ "top_k": 50,
220
+ "top_p": 1.0,
221
+ "torch_dtype": "bfloat16",
222
+ "torchscript": false,
223
+ "transformers_version": "4.50.0.dev0",
224
+ "typical_p": 1.0,
225
+ "use_bfloat16": false,
226
+ "vision_use_head": false
227
+ },
228
+ "vocab_size": 151674,
229
+ "vocab_start": null
230
+ }