shethjenil
/

Indic-STT

Automatic Speech Recognition

wav2vec2-conformer

Model card Files Files and versions

shethjenil committed on Feb 23

Commit

e63e063

·

verified ·

1 Parent(s): d87b0f5

Update modeling_conformer.py

Files changed (1) hide show

modeling_conformer.py +5 -5

modeling_conformer.py CHANGED Viewed

@@ -95,7 +95,7 @@ class Wav2Vec2ConformerRNNT(Wav2Vec2ConformerModel):
         tokens  = torch.full((B, max_len), pad,  dtype=torch.long,  device=enc_out.device)
         starts  = torch.full((B, max_len), -1.0, dtype=enc_out.dtype, device=enc_out.device)
         lengths = torch.zeros(B, dtype=torch.long, device=enc_out.device)
-        hx      = torch.zeros(1, B, H, dtype=enc_out.dtype, device=enc_out.device)
         cx      = torch.zeros_like(hx)
         last    = torch.full((B, 1), blank, dtype=torch.long, device=enc_out.device)
@@ -173,11 +173,11 @@ encoder.pre_encode.conv_module.0,feature_extractor.conv_layers.0.conv
 encoder.pre_encode.out,feature_projection.projection
 """
         if not model.config.multilingual:
-            changes += "encoder.pre_encode.conv_module.{n},feature_extractor.conv_layers.{(n/2)}.conv"
-            changes += f"lang_joint_net.{model.config.language},joint"
         else:
-            changes += "encoder.pre_encode.conv_module.{n},encoder.pre_encode.conv_module.{(n-2)}"
-            changes += "encoder.pre_encode.conv_module.{n},feature_extractor.conv_layers.{(n//3+1)}.conv.{(n%3)}"
         state_dict = state_bridge(state_dict, changes)
         if not model.config.multilingual:
             state_dict = {k: v for k, v in state_dict.items() if "lang_joint_net" not in k}

         tokens  = torch.full((B, max_len), pad,  dtype=torch.long,  device=enc_out.device)
         starts  = torch.full((B, max_len), -1.0, dtype=enc_out.dtype, device=enc_out.device)
         lengths = torch.zeros(B, dtype=torch.long, device=enc_out.device)
+        hx      = torch.zeros(self.config.lstm_layer, B, H, dtype=enc_out.dtype, device=enc_out.device)
         cx      = torch.zeros_like(hx)
         last    = torch.full((B, 1), blank, dtype=torch.long, device=enc_out.device)
 encoder.pre_encode.out,feature_projection.projection
 """
         if not model.config.multilingual:
+            changes += "encoder.pre_encode.conv_module.{n},feature_extractor.conv_layers.{(n/2)}.conv\n"
+            changes += f"lang_joint_net.{model.config.language},joint\n"
         else:
+            changes += "encoder.pre_encode.conv_module.{n},encoder.pre_encode.conv_module.{(n-2)}\n"
+            changes += "encoder.pre_encode.conv_module.{n},feature_extractor.conv_layers.{(n//3+1)}.conv.{(n%3)}\n"
         state_dict = state_bridge(state_dict, changes)
         if not model.config.multilingual:
             state_dict = {k: v for k, v in state_dict.items() if "lang_joint_net" not in k}