import torch
import torch.nn.functional as F
import torchaudio
import gradio as gr
from torch import nn
from ctcdecode import CTCBeamDecoder

# Character vocabulary: Arabic letters and diacritics, plus sos/eos/blank.
vocab_dict = {' ': 0, 'ء': 1, 'أ': 2, 'ؤ': 3, 'إ': 4, 'ئ': 5, 'ا': 6, 'ب': 7,
              'ة': 8, 'ت': 9, 'ث': 10, 'ج': 11, 'ح': 12, 'خ': 13, 'د': 14,
              'ذ': 15, 'ر': 16, 'ز': 17, 'س': 18, 'ش': 19, 'ص': 20, 'ض': 21,
              'ط': 22, 'ظ': 23, 'ع': 24, 'غ': 25, 'ـ': 26, 'ف': 27, 'ق': 28,
              'ك': 29, 'ل': 30, 'م': 31, 'ن': 32, 'ه': 33, 'و': 34, 'ى': 35,
              'ي': 36, 'ً': 37, 'ٌ': 38, 'ٍ': 39, 'َ': 40, 'ُ': 41, 'ِ': 42,
              'ّ': 43, 'ْ': 44, 'ٓ': 45, 'ٔ': 46, 'ۜ': 47, '۟': 48, '۠': 49,
              'ۢ': 50, 'ۣ': 51, 'ۥ': 52, 'ۦ': 53, 'ۨ': 54, 'ٰ': 55, '۪': 56,
              'ٱ': 57, '۫': 58, '۬': 59, 'ۭ': 60, 'sos': 61, 'eos': 62,
              'blank': 63}

SOS = 61
EOS = 62
BLANK = 63
NUM_CLASSES = len(vocab_dict)  # 64 output classes, including sos/eos/blank

char_to_int = vocab_dict
int_to_char = {v: k for k, v in char_to_int.items()}


def int_to_text(labels):
    """Convert a sequence of integer labels back into a text string."""
    return "".join(int_to_char[i] for i in labels)


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")


def resample_if_necessary(signal, sr):
    """Resample to the 16 kHz rate the model was trained on."""
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        signal = resampler(signal)
    return signal


def mix_down_if_necessary(signal):
    """Average multi-channel audio down to mono."""
    if signal.shape[0] > 1:
        signal = torch.mean(signal, dim=0, keepdim=True)
    return signal


def load_wav(wav_path):
    """Load a wav file as a mono 16 kHz tensor of shape (1, time)."""
    signal, sr = torchaudio.load(wav_path)
    signal = resample_if_necessary(signal, sr)
    signal = mix_down_if_necessary(signal)
    return signal


class CNNLayerNorm(nn.Module):
    """Layer normalization built for CNN input."""

    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x: (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()  # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()  # (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf,
    except with layer norm instead of batch norm."""

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x  # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x
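
# Illustrative sanity check (not part of the original pipeline): a ResidualCNN
# block should preserve its (batch, channel, feature, time) input shape, since
# both convolutions use stride 1 and "same" padding. The sizes below are
# assumptions chosen purely for the demonstration.
_block = ResidualCNN(32, 32, kernel=3, stride=1, dropout=0.1, n_feats=64)
_x = torch.randn(2, 32, 64, 50)  # (batch=2, channel=32, feature=64, time=50)
assert _block(_x).shape == _x.shape
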
class SpeechRecognitionModel(nn.Module):

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats,
                 stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats // 2  # the stride-2 CNN below halves the feature dimension
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)  # CNN for extracting hierarchical features
        # n residual CNN layers with a filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout,
                             batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),  # the bidirectional GRU returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x


hparams = {
    "n_cnn_layers": 5,
    "n_rnn_layers": 5,
    "rnn_dim": 512,
    "n_class": NUM_CLASSES,  # 64 (index 63 is the CTC blank)
    "n_feats": 128,
    "stride": 2,
    "dropout": 0.35,
    "learning_rate": 4e-5,
    "batch_size": 64,
    "epochs": 20,
}

model = SpeechRecognitionModel(
    hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
    hparams['n_class'], hparams['n_feats'], hparams['stride'],
    hparams['dropout']
).to(device)

# The model was originally trained on 2x GPUs, so the checkpoint's keys carry
# DataParallel's "module." prefix; wrap before loading to match them.
model = nn.DataParallel(model)
model.load_state_dict(torch.load("1_bigger26werTest.pkl", map_location="cpu").state_dict())
model.eval()

mel_spectrogram = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)
).to(device)

# Beam-search label list; the order must match vocab_dict (the single
# characters 's', 'e' and 'b' stand in for the multi-character 'sos', 'eos'
# and 'blank' tokens).
labels = list(" ءأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىيًٌٍَُِّْۣٓٔۜ۟۠ۢۥۦٰ۪ۨٱۭ۫۬seb")

decoder = CTCBeamDecoder(
    labels,
    model_path="lm (1).binary",  # KenLM language model used for beam rescoring
    beam_width=50,
    num_processes=4,
    blank_id=BLANK,
)


def get_recitation(audio):
    """Transcribe a (sample_rate, waveform) tuple as produced by Gradio's
    microphone input."""
    sr, signal = audio
    # Gradio records 16-bit PCM as a numpy array; scale to [-1, 1] so the
    # input matches what torchaudio.load produces.
    signal = torch.from_numpy(signal).float() / 32768.0
    if signal.ndim == 1:
        signal = signal[None]  # (1, time)
    else:
        signal = signal.T      # (channels, time)
    signal = mix_down_if_necessary(signal)
    signal = resample_if_necessary(signal, sr)
    with torch.no_grad():
        output = mel_spectrogram(signal.to(device))
        output = model(output[None])  # (batch, time, n_class)
        softmax_out = output.softmax(2)
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(softmax_out.cpu())
    # Take the top beam of the first (only) batch item, trimmed to its length.
    return "".join(labels[n] for n in beam_results[0][0][:out_lens[0][0]])


audio_input = gr.inputs.Audio(source="microphone")
output_text = gr.outputs.Textbox(label="Output Text")
gr.Interface(fn=get_recitation, inputs=audio_input, outputs=output_text,
             title="Speech Recognition", live=True).launch()
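
# Offline usage sketch (assumption: "example.wav" is a hypothetical local
# recording; the filename is not part of the original script). Run this
# instead of the Gradio app to transcribe a file from disk:
#
#     signal = load_wav("example.wav")
#     with torch.no_grad():
#         probs = model(mel_spectrogram(signal.to(device))[None]).softmax(2)
#     beam_results, _, _, out_lens = decoder.decode(probs.cpu())
#     print("".join(labels[n] for n in beam_results[0][0][:out_lens[0][0]]))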