devasheeshG
/

whisper_medium_fp16_transformers

@@ -266,7 +266,7 @@ language:
 | Original_Model (54 min) | 52.02 | 47.86 | 66.82 | 33.17 | 23.76 |
 | This_Model (38 min)     | 54.97 | 47.86 | 66.83 | 33.16 | 30.23 |
-### Hindi to English (test.tsv) [Common Voice 14.0](https://commonvoice.mozilla.org/en/datasets)
 **Test done on RTX 3060 on 1000 Samples**

 | Original_Model (54 min) | 52.02 | 47.86 | 66.82 | 33.17 | 23.76 |
 | This_Model (38 min)     | 54.97 | 47.86 | 66.83 | 33.16 | 30.23 |
+### Hindi to English (test.tsv) [Custom Dataset](https://huggingface.co/datasets/devasheeshG/common_voices_14_0_hi2en_hi2hi)
 **Test done on RTX 3060 on 1000 Samples**

__init__.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from transformers import (
-    WhisperForConditionalGeneration, WhisperProcessor, WhisperConfig,
 )
 import torch
 import ffmpeg
@@ -13,6 +15,7 @@ SAMPLE_RATE = 16000
 CHUNK_LENGTH = 30  # 30-second chunks
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
 # audio = whisper.load_audio('test.wav')
 def load_audio(file: str, sr: int = SAMPLE_RATE, start_time: int = 0, dtype=np.float16):
     """
@@ -59,55 +62,64 @@ def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
     return array
 class Model:
-    def __init__(self,
-                 model_name_or_path: str,
-                 cuda_visible_device: str = "0",
-                 device: str = 'cuda'   # torch.device("cuda" if torch.cuda.is_available() else "cpu")
-                 ):
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_device
         self.DEVICE = device
         self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
         self.tokenizer = self.processor.tokenizer
         self.config = WhisperConfig.from_pretrained(model_name_or_path)
         self.model = WhisperForConditionalGeneration(
-                config=self.config
-            ).from_pretrained(
-                            pretrained_model_name_or_path = model_name_or_path,
-                            torch_dtype = self.config.torch_dtype,
-                            # device_map=DEVICE,      # 'balanced', 'balanced_low_0', 'sequential', 'cuda', 'cpu'
-                            low_cpu_mem_usage = True,
-                        )
         # Move model to GPU
         if self.model.device.type != self.DEVICE:
-            print(f'Moving model to {self.DEVICE}')
             self.model = self.model.to(self.DEVICE)
             self.model.eval()
         else:
-            print(f'Model is already on {self.DEVICE}')
             self.model.eval()
-        print('dtype of model acc to config: ', self.config.torch_dtype)
-        print('dtype of loaded model: ', self.model.dtype)
-    def transcribe(self, audio, language: str = "english", skip_special_tokens: bool = True) -> str:
-        input_features = self.processor(audio, sampling_rate=SAMPLE_RATE, return_tensors="pt").input_features.half().to(self.DEVICE)
         with torch.no_grad():
             predicted_ids = self.model.generate(
                 input_features,
-                num_beams = 1,
                 language=language,
                 task="transcribe",
                 use_cache=True,
                 is_multilingual=True,
                 return_timestamps=True,
             )
-        transcription = self.tokenizer.batch_decode(predicted_ids, skip_special_tokens=skip_special_tokens)[0]
-        return transcription.strip()

 from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    WhisperConfig,
 )
 import torch
 import ffmpeg
 CHUNK_LENGTH = 30  # 30-second chunks
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
 # audio = whisper.load_audio('test.wav')
 def load_audio(file: str, sr: int = SAMPLE_RATE, start_time: int = 0, dtype=np.float16):
     """
     return array
 class Model:
     """Thin wrapper that loads a Whisper checkpoint and transcribes audio.

     Attributes:
         DEVICE: The ``device`` argument exactly as passed to ``__init__``.
         processor: ``WhisperProcessor`` loaded from the checkpoint.
         tokenizer: The processor's tokenizer, used to decode generated ids.
         config: ``WhisperConfig`` loaded from the checkpoint.
         model: ``WhisperForConditionalGeneration`` in eval mode on ``DEVICE``.
     """

     def __init__(
         self,
         model_name_or_path: str,
         cuda_visible_device: str = "0",
         device: str = "cuda",
     ):
         """Load processor, config and model weights, then move the model.

         Args:
             model_name_or_path: Checkpoint identifier or local path that is
                 forwarded to every ``from_pretrained`` call below.
             cuda_visible_device: Value written to ``CUDA_VISIBLE_DEVICES``.
             device: Target device for the model and the input features,
                 e.g. ``"cuda"``, ``"cuda:0"`` or ``"cpu"``.
         """
         # NOTE: this only restricts the visible GPUs if CUDA has not been
         # initialised in this process yet; set it before any CUDA call.
         os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_device
         self.DEVICE = device

         self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
         self.tokenizer = self.processor.tokenizer
         self.config = WhisperConfig.from_pretrained(model_name_or_path)

         # from_pretrained is a classmethod: call it on the class directly
         # instead of first building a throwaway, randomly initialised model
         # from the config (which costs a full extra allocation of the weights).
         self.model = WhisperForConditionalGeneration.from_pretrained(
             pretrained_model_name_or_path=model_name_or_path,
             torch_dtype=self.config.torch_dtype,
             low_cpu_mem_usage=True,
         )

         # Compare devices structurally so that "cuda", "cuda:0" and
         # torch.device objects are all handled; a plain string comparison
         # against ``device.type`` misreports e.g. "cuda:0" as a mismatch.
         target = torch.device(self.DEVICE)
         current = self.model.device
         already_there = current.type == target.type and (
             target.index is None or current.index == target.index
         )
         if already_there:
             print(f"Model is already on {self.DEVICE}")
         else:
             print(f"Moving model to {self.DEVICE}")
             self.model = self.model.to(target)
         self.model.eval()

         print("dtype of model acc to config: ", self.config.torch_dtype)
         print("dtype of loaded model: ", self.model.dtype)

     def transcribe(
         self, audio, language: str = "english", skip_special_tokens: bool = True
     ) -> str:
         """Transcribe one audio clip and return the stripped text.

         Args:
             audio: Waveform handed to the processor together with
                 ``sampling_rate=SAMPLE_RATE``.
                 NOTE(review): presumably the array produced by this module's
                 ``load_audio`` / ``pad_or_trim`` helpers - confirm.
             language: Language passed through to ``model.generate``.
             skip_special_tokens: Forwarded to ``tokenizer.batch_decode``.

         Returns:
             The first decoded sequence with surrounding whitespace removed.
         """
         features = self.processor(
             audio, sampling_rate=SAMPLE_RATE, return_tensors="pt"
         ).input_features
         # Follow the dtype the model was actually loaded in rather than
         # forcing float16, so float32 checkpoints / CPU runs do not fail
         # with an input/weight dtype mismatch.
         input_features = features.to(device=self.DEVICE, dtype=self.model.dtype)

         with torch.no_grad():
             predicted_ids = self.model.generate(
                 input_features,
                 num_beams=1,
                 language=language,
                 task="transcribe",
                 use_cache=True,
                 is_multilingual=True,
                 return_timestamps=True,
             )

         transcription = self.tokenizer.batch_decode(
             predicted_ids, skip_special_tokens=skip_special_tokens
         )[0]
         return transcription.strip()