Update modeling_conformer.py
modeling_conformer.py CHANGED (+13 −151)
@@ -1,96 +1,13 @@
-from datetime import timedelta
-import json
 from huggingface_hub import hf_hub_download
+from torch import nn
+from transformers import Wav2Vec2ConformerModel
+from safetensors.torch import load_file
+from torch_state_bridge import state_bridge
+import json
 import torch
 import torch.nn.functional as F
 import torchaudio
 import librosa
-from torch import nn
-from transformers import Wav2Vec2ConformerModel
-from torch_state_bridge import state_bridge
-from torch.nn.utils.rnn import pad_sequence
-from safetensors.torch import load_file
-import webrtcvad
-from torch.utils.data import Dataset, DataLoader
-import srt
-
-class ChunkedData(Dataset):
-    def __init__(self, wav, sr):
-        if sr != 16000:
-            wav = torchaudio.functional.resample(wav, sr, 16000)
-
-        self.wav = wav.mean(0, keepdim=True)
-        self.sr = 16000
-
-        # Store only the timestamps, not the actual chunks
-        self.ts = self.make_chunk_timestamps(self.wav)
-
-    def __len__(self):
-        return len(self.ts)
-
-    def __getitem__(self, i):
-        st, ed = self.ts[i]
-        st_i = int(st * self.sr)
-        ed_i = int(ed * self.sr)
-        chunk = self.wav[:, st_i:ed_i].squeeze()
-        return chunk, self.ts[i]
-
-    def make_chunk_timestamps(self, wav, sr=16000, ag=2, min_s=10, max_s=15, ms=30):
-
-        wav_int16 = (wav * 32768).clamp(-32768, 32767).short().squeeze(0)
-
-        frame_len = int(sr * ms / 1000)
-        num_frames = len(wav_int16) // frame_len
-        wav_int16 = wav_int16[: num_frames * frame_len]
-
-        frames = wav_int16.view(num_frames, frame_len)
-
-        vad = webrtcvad.Vad(ag)
-        speech = torch.tensor(
-            [vad.is_speech(frame.numpy().tobytes(), sr) for frame in frames],
-            dtype=torch.bool
-        )
-
-        timestamps = []
-        total_samples = len(wav_int16)
-
-        min_len = int(min_s * sr)
-        max_len = int(max_s * sr)
-
-        st = 0
-
-        while st < total_samples:
-            ed = min(st + max_len, total_samples)
-
-            if ed - st < min_len and ed < total_samples:
-                ed = min(st + min_len, total_samples)
-
-            timestamps.append((
-                round(st / sr, 2),
-                round(ed / sr, 2)
-            ))
-
-            st = ed
-
-        return timestamps
-
-
-
-def padding_audio(batch):
-    audios, times = zip(*batch)
-
-    lengths = torch.tensor([audio.numel() for audio in audios])
-    times = torch.tensor(times, dtype=torch.float32)
-
-    padded = pad_sequence(audios, batch_first=True)
-
-    return padded, lengths, times
-
-def calc_length(lengths, all_paddings=2, kernel_size=3, stride=2, repeat_num=1):
-    add_pad = all_paddings - kernel_size
-    for _ in range(repeat_num):
-        lengths = torch.floor((lengths.float() + add_pad) / stride + 1)
-    return lengths
 
 class Op(nn.Module):
     def __init__(self, func, allow_self=False):
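Two details of the removed chunker are worth noting: the VAD frame labels (`speech`) were computed but never consulted, and the loop cuts purely by sample count, capping chunks at `max_s` seconds. A quick standalone re-run of the loop's arithmetic (the 27-second duration is an illustrative value, not from the commit):

# Standalone re-run of the removed chunking loop (illustrative values only).
sr, min_s, max_s = 16000, 10, 15
total_samples = 27 * sr                      # hypothetical 27 s of mono 16 kHz audio
min_len, max_len = min_s * sr, max_s * sr

timestamps, st = [], 0
while st < total_samples:
    ed = min(st + max_len, total_samples)
    if ed - st < min_len and ed < total_samples:
        ed = min(st + min_len, total_samples)
    timestamps.append((round(st / sr, 2), round(ed / sr, 2)))
    st = ed

print(timestamps)                            # [(0.0, 15.0), (15.0, 27.0)]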
@@ -155,7 +72,7 @@ class Wav2Vec2ConformerRNNT(Wav2Vec2ConformerModel):
             l.conv = nn.Sequential(nn.Conv2d(l.conv.in_channels, l.conv.out_channels, l.conv.kernel_size[0], l.conv.stride, 1, groups=l.conv.out_channels), nn.Conv2d(l.conv.in_channels, l.conv.out_channels, 1))
 
         self.feature_extractor.conv_layers.append(Op(lambda x: x.transpose(1, 2)))
-        self.feature_projection.projection = nn.Linear(config.conv_dim[-1] * int(calc_length(torch.tensor(80.), repeat_num=self.config.num_feat_extract_layers)), config.hidden_size)
+        self.feature_projection.projection = nn.Linear(config.conv_dim[-1] * int(self.calc_length(torch.tensor(80.), repeat_num=self.config.num_feat_extract_layers)), config.hidden_size)
         self.feature_projection.layer_norm = Op(lambda x: x.permute(0, 2, 1, 3).flatten(2))
         for l in self.encoder.layers:
            l.conv_module.glu = nn.Sequential(l.conv_module.glu, self.mask_layer)
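The `l.conv = nn.Sequential(...)` context line above replaces each subsampling convolution with a depthwise-then-pointwise pair. A minimal self-contained sketch of that factorization (channel count and input size are hypothetical, not read from the model config):

import torch
from torch import nn

c, k = 64, 3                                            # hypothetical channels / kernel
dense = nn.Conv2d(c, c, k, stride=2, padding=1)         # c*c*k*k weights
separable = nn.Sequential(
    nn.Conv2d(c, c, k, stride=2, padding=1, groups=c),  # depthwise: c*k*k weights
    nn.Conv2d(c, c, 1),                                 # pointwise: c*c weights
)
x = torch.randn(1, c, 80, 80)
assert dense(x).shape == separable(x).shape             # same output geometry, far fewer weights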
@@ -172,6 +89,12 @@ class Wav2Vec2ConformerRNNT(Wav2Vec2ConformerModel):
         self.mask_layer.cache_pad_mask = (torch.arange(hidden_states.size(1), device=hidden_states.device).unsqueeze(0) >= self.cache_length.unsqueeze(1))
         return super()._mask_hidden_states(hidden_states, mask_time_indices, attention_mask)
 
+    def calc_length(self, lengths, all_paddings=2, kernel_size=3, stride=2, repeat_num=1):
+        add_pad = all_paddings - kernel_size
+        for _ in range(repeat_num):
+            lengths = torch.floor((lengths.float() + add_pad) / stride + 1)
+        return lengths
+
     def preprocessing(self, x):
         x, l = x
         l = (l // self.hop + 1).long()
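Moving `calc_length` onto the class keeps the conv-length arithmetic next to its two call sites. With the defaults (`kernel_size=3`, `stride=2`, total padding 2) each pass maps a length L to ⌊(L − 1)/2 + 1⌋, so 80 mel bins become 40, then 20. A quick standalone check (`repeat_num=2` is an illustrative stand-in for `num_feat_extract_layers`):

import torch

def calc_length(lengths, all_paddings=2, kernel_size=3, stride=2, repeat_num=1):
    # Output length after `repeat_num` stacked strided convolutions.
    add_pad = all_paddings - kernel_size
    for _ in range(repeat_num):
        lengths = torch.floor((lengths.float() + add_pad) / stride + 1)
    return lengths

# 80 -> 40 -> 20: the factor that sizes feature_projection.projection above.
print(calc_length(torch.tensor(80.), repeat_num=2))  # tensor(20.)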
@@ -184,25 +107,12 @@ class Wav2Vec2ConformerRNNT(Wav2Vec2ConformerModel):
         denom = torch.clamp(l[:, None] - 1, min=1)
         σ = (((x - μ[..., None])**2).sum(-1) / denom + 1e-5).sqrt()
         x = ((x - μ[..., None]) / σ[..., None]).masked_fill(m[:, None], 0)
-        self.cache_length = calc_length(l, repeat_num=self.config.num_feat_extract_layers).long()
+        self.cache_length = self.calc_length(l, repeat_num=self.config.num_feat_extract_layers).long()
         return F.pad(x, (0, (-T) % self.pad_to)).transpose(1, 2)
 
     def forward(self, input_values):
         return self._greedy_decode(super().forward(self.preprocessing(input_values)).last_hidden_state)
 
-    @torch.inference_mode()
-    def transcribe(self, wav, sr, batch_size):
-        device = next(self.parameters()).device
-        subtitles = []
-        for batch, lengths, timestamp in DataLoader(ChunkedData(wav, sr), batch_size, collate_fn=padding_audio):
-            batch = batch.to(device)
-            lengths = lengths.to(device)
-            timestamp = timestamp.to(device)
-            subtitles.extend(self.make_srt(self.forward((batch, lengths)), timestamp))
-            yield srt.compose(subtitles)
-            del batch
-            del lengths
-
     def load_state_dict(self, state_dict, strict=True, assign=False):
         state_dict.pop('ctc_decoder.decoder_layers.0.bias', None)
         state_dict.pop('ctc_decoder.decoder_layers.0.weight', None)
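With `transcribe` removed along with `ChunkedData` and `padding_audio`, chunking and batching become the caller's responsibility; `forward` still expects a `(padded_waveforms, lengths)` tuple. A hedged sketch of the external call, assuming a loaded `Wav2Vec2ConformerRNNT` instance named `model` (unpacking the result as `tokens, starts` mirrors the pair `_greedy_decode` appears to return, per the `return tokens, starts` context line below):

import torch
from torch.nn.utils.rnn import pad_sequence

# Hypothetical mono 16 kHz chunks of unequal length.
chunks = [torch.randn(16000 * 12), torch.randn(16000 * 15)]

lengths = torch.tensor([c.numel() for c in chunks])
batch = pad_sequence(chunks, batch_first=True)  # (2, max_len), zero-padded

with torch.inference_mode():                    # replaces the removed @torch.inference_mode()
    tokens, starts = model((batch, lengths))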
@@ -320,60 +230,12 @@ encoder.pre_encode.conv_module.{n},feature_extractor.conv_layers.{(n//3+1)}.conv
 
         return tokens, starts
 
-    def make_srt(self, decoded, ts):
-
-        tokens_list, starts_list = decoded
-
-        start_token_segment = (
-            self.config.languages.index(self.language)
-            * self.joint.out_features
-        )
-
-        all_tokens = []
-        all_starts = []
-        all_ends = []
-
-        device = tokens_list[0].device
-
-        for tokens, starts, (seg_start, seg_end) in zip(
-                tokens_list, starts_list, ts):
-
-            tokens = tokens + start_token_segment
-            starts = starts + seg_start
-
-            all_tokens.append(tokens)
-            all_starts.append(starts)
-            all_ends.append(torch.cat([starts[1:], seg_end[None]]))
-
-            # newline marker
-            all_tokens.append(torch.tensor([-1], device=device))
-            all_starts.append(torch.tensor([seg_end], device=device))
-            all_ends.append(torch.tensor([seg_end + 0.005], device=device))
-
-        return [
-            srt.Subtitle(
-                i,
-                timedelta(seconds=float(st)),
-                timedelta(seconds=float(en)),
-                "<line>" if tok == -1 else self.config.vocab[int(tok)]
-            )
-            for i, (tok, st, en) in enumerate(
-                zip(
-                    torch.cat(all_tokens),
-                    torch.cat(all_starts),
-                    torch.cat(all_ends)
-                ), 1
-            )
-        ]
-
-
     @classmethod
     def from_pretrained(
             cls,
             pretrained_model_name_or_path,
             config=None,
             language=None,
-            use_jit=False,
             use_quantization=False):
 
         if config is None:
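SRT assembly leaves the model class along with `make_srt`, so callers that still want subtitle files must build them from the decoded tokens and timings themselves. For reference, the removed code used the `srt` package's two-step pattern, sketched here with placeholder contents and timings:

from datetime import timedelta
import srt

subs = [
    srt.Subtitle(1, timedelta(seconds=0.0), timedelta(seconds=2.5), "hello"),
    srt.Subtitle(2, timedelta(seconds=2.5), timedelta(seconds=5.0), "world"),
]
print(srt.compose(subs))  # renders a standard .srt document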