Duplicate from granite-vision-dev/granite-4.1-3b-vision

5c8cba6 11 days ago

3.34 kB

	{
		"architectures": ["Granite4VisionForConditionalGeneration"],
		"auto_map": {
			"AutoConfig": "configuration.Granite4VisionConfig",
			"AutoModel": "modeling.Granite4VisionForConditionalGeneration",
			"AutoModelForVision2Seq": "modeling.Granite4VisionForConditionalGeneration",
			"AutoModelForImageTextToText": "modeling.Granite4VisionForConditionalGeneration",
			"AutoProcessor": "processing.Granite4VisionProcessor"
		},
		"spatial_target_layers": [12, 15, 18, 21],
		"spatial_stride": 2,
		"spatial_vision_layer": -1,
		"downsample_rate": "4/8",
		"dtype": "bfloat16",
		"image_grid_pinpoints": [
			[384, 384],
			[384, 768],
			[384, 1152],
			[384, 1536],
			[384, 1920],
			[384, 2304],
			[384, 2688],
			[384, 3072],
			[384, 3456],
			[384, 3840],
			[768, 384],
			[768, 768],
			[768, 1152],
			[768, 1536],
			[768, 1920],
			[1152, 384],
			[1152, 768],
			[1152, 1152],
			[1536, 384],
			[1536, 768],
			[1920, 384],
			[1920, 768],
			[2304, 384],
			[2688, 384],
			[3072, 384],
			[3456, 384],
			[3840, 384]
		],
		"image_seq_length": 576,
		"image_token_index": 100352,
		"initializer_range": 0.02,
		"model_type": "granite4_vision",
		"projector_dropout": 0.1,
		"projector_hidden_act": "gelu",
		"text_config": {
			"architectures": ["GraniteForCausalLM"],
			"attention_bias": false,
			"attention_dropout": 0.0,
			"attention_multiplier": 0.015625,
			"bos_token_id": 100257,
			"embedding_multiplier": 12.0,
			"eos_token_id": 100257,
			"hidden_act": "silu",
			"hidden_size": 2560,
			"initializer_range": 0.1,
			"intermediate_size": 8192,
			"logits_scaling": 10.0,
			"max_position_embeddings": 131072,
			"mlp_bias": false,
			"model_type": "granite",
			"num_attention_heads": 40,
			"num_hidden_layers": 40,
			"num_key_value_heads": 8,
			"pad_token_id": 100256,
			"residual_multiplier": 0.22,
			"rms_norm_eps": 1e-05,
			"rope_scaling": null,
			"rope_theta": 10000000,
			"tie_word_embeddings": true,
			"torch_dtype": "bfloat16",
			"transformers_version": "4.53.3",
			"use_cache": true,
			"vocab_size": 100353
		},
		"tie_word_embeddings": true,
		"transformers_version": "4.57.3",
		"use_spatial_sampling": true,
		"use_image_newline_parameter": true,
		"vision_config": {
			"attention_dropout": 0.0,
			"hidden_act": "gelu_pytorch_tanh",
			"hidden_size": 1152,
			"image_size": 384,
			"intermediate_size": 4304,
			"layer_norm_eps": 1e-06,
			"model_type": "siglip_vision_model",
			"num_attention_heads": 16,
			"num_channels": 3,
			"num_hidden_layers": 27,
			"patch_size": 16
		},
		"vision_feature_select_strategy": "full",
		"deepstack_layer_map": [
			[-19, 9],
			[-13, 6],
			[-7, 3],
			[-1, 0]
		]
	}