junnei
/

gemma-3-4b-it-speech

Automatic Speech Recognition

feature-extraction

Model card | Files and versions

gemma-3-4b-it-speech / config.json

junnei's picture

Update Speech Tower Training

6d2ea2d 8 months ago

3.16 kB

	{
	"architectures": [
	"Gemma3MMForConditionalGeneration"
	],
	"audio_config": {
	"activation": "swish",
	"activation_checkpointing": {
	"interval": 1,
	"module": "transformer",
	"offload": false
	},
	"attention_dim": 1024,
	"attention_heads": 16,
	"batch_norm": false,
	"bias_in_glu": true,
	"causal": true,
	"chunk_size": -1,
	"cnn_layer_norm": true,
	"conv_activation": "swish",
	"conv_glu_type": "swish",
	"depthwise_multiplier": 1,
	"depthwise_seperable_out_channel": 1024,
	"dropout_rate": 0.0,
	"encoder_embedding_config": {
	"input_size": 80
	},
	"ext_pw_kernel_size": 1,
	"ext_pw_out_channel": 1024,
	"input_layer": "nemo_conv",
	"input_size": 80,
	"kernel_size": 3,
	"left_chunk": 18,
	"linear_units": 1536,
	"model_type": "gemma3_audio",
	"nemo_conv_settings": {
	"conv_channels": 1024
	},
	"num_blocks": 24,
	"relative_attention_bias_args": {
	"t5_bias_max_distance": 500,
	"type": "t5"
	},
	"time_reduction": 8,
	"torch_dtype": "bfloat16"
	},
	"audio_token_index": 262143,
	"auto_map": {
	"AutoConfig": "junnei/gemma-3-4b-it-speech--configuration_gemma3mm.Gemma3MMConfig",
	"AutoModel": "junnei/gemma-3-4b-it-speech--modeling_gemma3mm.Gemma3MMForConditionalGeneration"
	},
	"boa_token_index": 256001,
	"boi_token_index": 255999,
	"eoa_token_index": 256002,
	"eoi_token_index": 256000,
	"eos_token_id": [
	1,
	106
	],
	"image_token_index": 262144,
	"initializer_range": 0.02,
	"mm_tokens_per_image": 256,
	"model_type": "gemma3mm",
	"speech_lora": {
	"dp": 0.01,
	"layer": "((layers.self_attn\\.(q\|k\|v\|o)_proj)\|(layers.mlp\\.(gate\|up\|down)_proj))",
	"lora_alpha": 320,
	"r": 320,
	"use_rslora": true
	},
	"text_config": {
	"attention_bias": false,
	"attention_dropout": 0.0,
	"attn_logit_softcapping": null,
	"cache_implementation": "hybrid",
	"final_logit_softcapping": null,
	"head_dim": 256,
	"hidden_activation": "gelu_pytorch_tanh",
	"hidden_size": 2560,
	"initializer_range": 0.02,
	"intermediate_size": 10240,
	"max_position_embeddings": 131072,
	"model_type": "gemma3_text",
	"num_attention_heads": 8,
	"num_hidden_layers": 34,
	"num_key_value_heads": 4,
	"query_pre_attn_scalar": 256,
	"rms_norm_eps": 1e-06,
	"rope_local_base_freq": 10000.0,
	"rope_scaling": {
	"factor": 8.0,
	"rope_type": "linear"
	},
	"rope_theta": 1000000.0,
	"sliding_window": 1024,
	"sliding_window_pattern": 6,
	"torch_dtype": "bfloat16",
	"use_cache": true,
	"vocab_size": 262208
	},
	"torch_dtype": "bfloat16",
	"transformers_version": "4.51.0.dev0",
	"use_cache": false,
	"vision_config": {
	"attention_dropout": 0.0,
	"hidden_act": "gelu_pytorch_tanh",
	"hidden_size": 1152,
	"image_size": 896,
	"intermediate_size": 4304,
	"layer_norm_eps": 1e-06,
	"model_type": "siglip_vision_model",
	"num_attention_heads": 16,
	"num_channels": 3,
	"num_hidden_layers": 27,
	"patch_size": 14,
	"torch_dtype": "bfloat16",
	"vision_use_head": false
	}
	}