pengyizhou committed
Commit 73a3144 · 1 Parent(s): 3ba76d4

update inference

README.md CHANGED
@@ -140,10 +140,17 @@ Key dependencies:
140
  - librosa (for audio processing)
141
  - evaluate (for metrics)
142
143
  ## Evaluation Results
144
  | Language | Metric | Error Rate |
145
  |-------------|:------:|-----------:|
146
- | Khmer | CER | 33.18% |
 
147
 
148
 
149
 
 
140
  - librosa (for audio processing)
141
  - evaluate (for metrics)
142
 
143
+ ## Zero-shot Results
144
+ | LID | Metric | Error Rate |
145
+ |-------------|:------:|-----------:|
146
+ | Khmer | CER | 86.77% |
147
+ | Auto | CER | 86.39% |
148
+
149
  ## Evaluation Results
150
  | Language | Metric | Error Rate |
151
  |-------------|:------:|-----------:|
152
+ | Khmer | CER | 55.66% |
153
+ | Auto | CER | 55.77% |
154
 
155
 
156
 
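For reference, the fine-tuned rows above come from the inference scripts added in this commit. Below is a minimal sketch of that evaluation path, assuming the local checkpoint directory named in `config.yaml` (`./ft-lid-whisper-fleurs-km_kh-small`) and the FLEURS `km_kh` test split; see `inference/inference-finetune-lid.py` for the full script.

```python
import torch
from datasets import load_dataset, Audio
from transformers import AutoModelForSpeechSeq2Seq, WhisperProcessor, pipeline
from jiwer import cer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Fine-tuned checkpoint directory from config.yaml (assumed to exist locally)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "./ft-lid-whisper-fleurs-km_kh-small",
    torch_dtype=dtype, low_cpu_mem_usage=True, use_safetensors=True,
).to(device)
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=dtype,
    chunk_length_s=30,
    batch_size=16,        # adjust to available GPU memory
    max_new_tokens=225,
    device=device,
)

ds = load_dataset("google/fleurs", "km_kh", split="test", trust_remote_code=True)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

def transcribe_batch(batch):
    # Force Khmer decoding; drop the "language" entry to reproduce the "Auto" rows
    outputs = asr([ex["array"] for ex in batch["audio"]],
                  generate_kwargs={"language": "khmer"})
    return {"prediction": [out["text"].lower().strip() for out in outputs]}

result = ds.map(transcribe_batch, batched=True, batch_size=16,
                remove_columns=ds.column_names)
refs = [t.lower().strip() for t in ds["transcription"]]
print(f"CER on FLEURS km_kh: {cer(refs, result['prediction']) * 100:.2f}%")
```

Pointing `from_pretrained` at `openai/whisper-large-v3` instead of the local checkpoint reproduces the zero-shot rows in the same way.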
config.yaml CHANGED
@@ -4,11 +4,11 @@
4
  # Model Configuration
5
  model:
6
  checkpoint: "openai/whisper-large-v3"
7
- max_target_length: 448
8
 
9
  # Output Configuration
10
  output:
11
- output_dir: "./whisper-fleurs-km_kh-small"
12
 
13
  # Environment Configuration
14
  environment:
@@ -60,6 +60,10 @@ training:
60
  per_device_eval_batch_size: 16
61
  gradient_accumulation_steps: 1
62
 
 
 
 
 
63
 
64
  # Optimization settings
65
  gradient_checkpointing: true
@@ -86,7 +90,8 @@ training:
86
  - "tensorboard"
87
 
88
  # Hub settings
89
- push_to_hub: false
 
90
 
91
  # Multi-GPU specific settings
92
  dataloader_drop_last: true
 
4
  # Model Configuration
5
  model:
6
  checkpoint: "openai/whisper-large-v3"
7
+ max_target_length: 446
8
 
9
  # Output Configuration
10
  output:
11
+ output_dir: "./ft-lid-whisper-fleurs-km_kh-small"
12
 
13
  # Environment Configuration
14
  environment:
 
60
  per_device_eval_batch_size: 16
61
  gradient_accumulation_steps: 1
62
 
63
+ multi_gpu:
64
+ per_device_train_batch_size: 4
65
+ per_device_eval_batch_size: 4
66
+ gradient_accumulation_steps: 1
67
 
68
  # Optimization settings
69
  gradient_checkpointing: true
 
90
  - "tensorboard"
91
 
92
  # Hub settings
93
+ push_to_hub: true
94
+ hub_private_repo: false # Not pushing to a private repo for Khmer
95
 
96
  # Multi-GPU specific settings
97
  dataloader_drop_last: true
finetune.py CHANGED
@@ -47,6 +47,7 @@ import io
47
  import yaml
48
  import argparse
49
  from itertools import chain
 
50
 
51
  # Load configuration from YAML file
52
  def load_config(config_path):
@@ -132,6 +133,19 @@ class WhisperOnTheFlyDataset(TorchDataset):
132
  else: # english, chinese
133
  text = item["text"]
134
 
135
  # Tokenize with appropriate processor
136
  if lang == "cebuano":
137
  labels = self.processors["cebuano"].tokenizer(
@@ -177,7 +191,8 @@ class WhisperOnTheFlyDataset(TorchDataset):
177
  return {
178
  "input_features": inputs.input_features.squeeze(0),
179
  "labels": labels.input_ids.squeeze(0),
180
- "language": lang
 
181
  }
182
 
183
  def _process_audio(self, audio_sample):
@@ -216,16 +231,53 @@ class DataCollatorSpeechSeq2SeqWithPadding:
216
  label_features = [{"input_ids": feature["labels"]} for feature in features]
217
  # pad the labels to max length
218
  labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
 
 
 
219
 
220
- # replace padding with -100 to ignore loss correctly
221
- labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
222
 
223
  # if bos token is appended in previous tokenization step,
224
  # cut bos token here as it's appended later anyway
225
- if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
226
- labels = labels[:, 1:]
227
 
228
- batch["labels"] = labels
 
229
 
230
  return batch
231
 
@@ -300,9 +352,9 @@ def load_chinese_dataset(dataset_config):
300
  """Load Chinese dataset with multiple test splits"""
301
  print("Loading Chinese...")
302
  wenet_train = load_dataset(dataset_config['train_dataset'], streaming=dataset_config['streaming'])
303
- wenet_valid = load_dataset(dataset_config['validation_dataset'], dataset_config['validation_config'], split="validation", streaming=dataset_config['streaming'])
304
- wenet_testnet = load_dataset(dataset_config['test_net_dataset'], dataset_config['test_net_config'], split="test", streaming=dataset_config['streaming'])
305
- wenet_testmeeting = load_dataset(dataset_config['test_meeting_dataset'], dataset_config['test_meeting_config'], split="test", streaming=dataset_config['streaming'])
306
  return {
307
  "train": wenet_train["train"],
308
  "validation": wenet_valid,
@@ -352,12 +404,16 @@ else:
352
  model = WhisperForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
353
 
354
  # Multi-GPU handling
 
 
 
 
355
  if torch.cuda.device_count() > 1:
356
  print(f"Using {torch.cuda.device_count()} GPUs for training")
357
  # The model will be automatically distributed by the Trainer
358
- model.to(device)
359
  else:
360
- model.to(device)
361
 
362
 
363
 
@@ -519,10 +575,13 @@ def compute_metrics(pred):
519
  Compute WER and CER metrics for predictions
520
  """
521
  pred_ids = pred.predictions
 
522
  pred_str = main_processor.batch_decode(pred_ids, skip_special_tokens=True)
523
 
524
  label_ids = pred.label_ids
 
525
  label_ids[label_ids == -100] = main_processor.tokenizer.pad_token_id
 
526
  ref_str = main_processor.batch_decode(label_ids, skip_special_tokens=True)
527
 
528
  # lowercase & strip
@@ -531,7 +590,19 @@ def compute_metrics(pred):
531
 
532
  wer_score = wer_metric.compute(predictions=pred_str, references=ref_str)
533
  cer_score = cer_metric.compute(predictions=pred_str, references=ref_str)
534
- return {"wer": wer_score, "cer": cer_score}
535
 
536
  # Check for multi-GPU setup
537
  num_gpus = torch.cuda.device_count()
@@ -578,6 +649,7 @@ training_args = Seq2SeqTrainingArguments(
578
  metric_for_best_model=training_config['metric_for_best_model'],
579
  greater_is_better=training_config['greater_is_better'],
580
  push_to_hub=training_config['push_to_hub'],
 
581
  save_total_limit=training_config['save_total_limit'],
582
  # Multi-GPU specific settings
583
  dataloader_drop_last=training_config['dataloader_drop_last'],
@@ -603,22 +675,50 @@ def evaluate_on_test_sets():
603
 
604
  results = {}
605
 
606
  for lang in enabled_languages:
607
  if lang in processed_datasets:
608
  lang_results = {}
609
 
610
  if lang == "chinese":
611
  # Chinese has multiple test splits
612
  if "test_net" in processed_datasets[lang]:
613
  print(f"\n***** Evaluating on WenetSpeech Chinese TEST_NET *****")
614
- chi_testnet_metrics = trainer.predict(processed_datasets[lang]["test_net"], metric_key_prefix=f"test_{lang}_net")
 
 
 
 
615
  print(f"Chinese TEST_NET WER: {chi_testnet_metrics.metrics[f'test_{lang}_net_wer']*100:.2f}%")
616
  print(f"Chinese TEST_NET CER: {chi_testnet_metrics.metrics[f'test_{lang}_net_cer']*100:.2f}%")
617
  lang_results["test_net"] = chi_testnet_metrics.metrics
618
 
619
  if "test_meeting" in processed_datasets[lang]:
620
  print(f"\n***** Evaluating on WenetSpeech Chinese TEST_MEETING *****")
621
- chi_testmeet_metrics = trainer.predict(processed_datasets[lang]["test_meeting"], metric_key_prefix=f"test_{lang}_meeting")
 
 
 
 
622
  print(f"Chinese TEST_MEETING WER: {chi_testmeet_metrics.metrics[f'test_{lang}_meeting_wer']*100:.2f}%")
623
  print(f"Chinese TEST_MEETING CER: {chi_testmeet_metrics.metrics[f'test_{lang}_meeting_cer']*100:.2f}%")
624
  lang_results["test_meeting"] = chi_testmeet_metrics.metrics
@@ -626,7 +726,11 @@ def evaluate_on_test_sets():
626
  # Standard test split
627
  if "test" in processed_datasets[lang]:
628
  print(f"\n***** Evaluating on {lang.title()} test set *****")
629
- test_metrics = trainer.predict(processed_datasets[lang]["test"], metric_key_prefix=f"test_{lang}")
 
 
 
 
630
  print(f"{lang.title()} Test WER: {test_metrics.metrics[f'test_{lang}_wer']*100:.2f}%")
631
  print(f"{lang.title()} Test CER: {test_metrics.metrics[f'test_{lang}_cer']*100:.2f}%")
632
  lang_results["test"] = test_metrics.metrics
@@ -666,7 +770,7 @@ if __name__ == "__main__":
666
  trainer.train()
667
 
668
  # Evaluate on all test sets
669
- evaluate_on_test_sets()
670
 
671
 
672
 
 
47
  import yaml
48
  import argparse
49
  from itertools import chain
50
+ import torch.distributed as dist
51
 
52
  # Load configuration from YAML file
53
  def load_config(config_path):
 
133
  else: # english, chinese
134
  text = item["text"]
135
 
136
+ # Map language to Whisper language token ID
137
+ lang_id_map = {
138
+ "english": 50259, # <|en|>
139
+ "chinese": 50260, # <|zh|>
140
+ "indonesian": 50275, # <|id|>
141
+ "malay": 50282, # <|ms|>
142
+ "khmer": 50323, # <|km|>
143
+ "cebuano": 50348, # <|tl|> (using Tagalog as fallback for Cebuano)
144
+ }
145
+
146
+ # Get language token ID
147
+ lang_token_id = lang_id_map.get(lang)
148
+
149
  # Tokenize with appropriate processor
150
  if lang == "cebuano":
151
  labels = self.processors["cebuano"].tokenizer(
 
191
  return {
192
  "input_features": inputs.input_features.squeeze(0),
193
  "labels": labels.input_ids.squeeze(0),
194
+ "language": lang,
195
+ "language_token_id": lang_token_id
196
  }
197
 
198
  def _process_audio(self, audio_sample):
 
231
  label_features = [{"input_ids": feature["labels"]} for feature in features]
232
  # pad the labels to max length
233
  labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
234
+
235
+ # Get original labels before modification
236
+ labels = labels_batch["input_ids"]
237
 
238
+ # Task ID is fixed for transcription (50360)
239
+ task_token_id = 50360 # transcribe task
240
+
241
+ # Create a tensor to store new labels with language and task tokens prepended
242
+ batch_size = labels.size(0)
243
+ seq_length = labels.size(1)
244
+ # Add 2 tokens (lang token and task token) at the beginning
245
+ new_labels = torch.full((batch_size, seq_length + 2), self.processor.tokenizer.pad_token_id, dtype=labels.dtype, device=labels.device)
246
+
247
+ # Add the language token and task token at the beginning for each sample
248
+ for i, feature in enumerate(features):
249
+ # The SOT token (50258) is not prepended here; it is kept only as the commented-out line below
250
+ # new_labels[i, 0] = 50258 # SOT token
251
+
252
+ # Add language token as the first token if available
253
+ if "language_token_id" in feature and feature["language_token_id"] is not None:
254
+ new_labels[i, 0] = feature["language_token_id"]
255
+
256
+ # Add task token as the second token
257
+ new_labels[i, 1] = task_token_id
258
+
259
+ # Copy the original label tokens after the special tokens
260
+ token_length = min(seq_length, labels.size(1))
261
+ new_labels[i, 2:2+token_length] = labels[i, :token_length]
262
+
263
+ # Create new attention mask for padded sequences
264
+ new_attention_mask = torch.zeros_like(new_labels, dtype=torch.long)
265
+ for i in range(batch_size):
266
+ # Find the last non-padding token in the original sequence
267
+ orig_seq_len = (labels[i] != self.processor.tokenizer.pad_token_id).sum().item()
268
+ # Set attention mask to 1 for all tokens up to the end of the sequence + 2 special tokens
269
+ new_attention_mask[i, :orig_seq_len+2] = 1
270
+
271
+ # Replace padding with -100 to ignore loss correctly
272
+ new_labels = new_labels.masked_fill(new_attention_mask.ne(1), -100)
273
 
274
  # if bos token is appended in previous tokenization step,
275
  # cut bos token here as it's append later anyways
276
+ if (new_labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
277
+ new_labels = new_labels[:, 1:]
278
 
279
+ batch["labels"] = new_labels
280
+ batch["attention_mask"] = new_attention_mask
281
 
282
  return batch
283
 
 
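To make the collator change concrete: after padding, each label row now starts with the Whisper language token followed by the transcribe task token (50360), and every padding position is replaced with -100 so it is ignored by the loss. The toy sketch below illustrates that layout with plain torch; the token IDs are the hardcoded values from `lang_id_map` above, and the pad ID 50257 is an assumption for the illustration (it can be checked against `processor.tokenizer.pad_token_id`).

```python
import torch

pad_id = 50257   # assumed Whisper pad/eot id, for illustration only
task_id = 50360  # <|transcribe|>
lang_id = 50323  # <|km|>

# Toy padded labels for a batch of two Khmer utterances
labels = torch.tensor([[101, 102, 103, pad_id],
                       [201, 202, pad_id, pad_id]])

bsz, seq = labels.shape
new_labels = torch.full((bsz, seq + 2), pad_id, dtype=labels.dtype)
new_labels[:, 0] = lang_id   # language token first
new_labels[:, 1] = task_id   # task token second
new_labels[:, 2:] = labels   # original tokens follow the two special tokens

# Attention is valid up to (original length + 2 prepended tokens); the rest becomes -100
mask = torch.zeros_like(new_labels)
for i in range(bsz):
    orig_len = int((labels[i] != pad_id).sum())
    mask[i, :orig_len + 2] = 1
new_labels = new_labels.masked_fill(mask.ne(1), -100)

print(new_labels)
# tensor([[50323, 50360,   101,   102,   103,  -100],
#         [50323, 50360,   201,   202,  -100,  -100]])
```

As in the collator, the SOT token is not prepended here; the trailing check against `decoder_start_token_id` removes a leading SOT only if the tokenizer already inserted one.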
352
  """Load Chinese dataset with multiple test splits"""
353
  print("Loading Chinese...")
354
  wenet_train = load_dataset(dataset_config['train_dataset'], streaming=dataset_config['streaming'])
355
+ wenet_valid = load_dataset(dataset_config['validation_dataset'], dataset_config['validation_config'], split="validation", streaming=dataset_config['streaming'], trust_remote_code=dataset_config['trust_remote_code'])
356
+ wenet_testnet = load_dataset(dataset_config['test_net_dataset'], dataset_config['test_net_config'], split="test", streaming=dataset_config['streaming'], trust_remote_code=dataset_config['trust_remote_code'])
357
+ wenet_testmeeting = load_dataset(dataset_config['test_meeting_dataset'], dataset_config['test_meeting_config'], split="test", streaming=dataset_config['streaming'], trust_remote_code=dataset_config['trust_remote_code'])
358
  return {
359
  "train": wenet_train["train"],
360
  "validation": wenet_valid,
 
404
  model = WhisperForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)
405
 
406
  # Multi-GPU handling
407
+ local_rank = int(os.environ["LOCAL_RANK"])
408
+ torch.cuda.set_device(local_rank)
409
+ print(f"Using GPU {local_rank} for training")
410
+ dist.init_process_group(backend="nccl")
411
  if torch.cuda.device_count() > 1:
412
  print(f"Using {torch.cuda.device_count()} GPUs for training")
413
  # The model will be automatically distributed by the Trainer
414
+ model.to(torch.device("cuda", local_rank))
415
  else:
416
+ model.to(torch.device("cuda", local_rank))
417
 
418
 
419
 
 
575
  Compute WER and CER metrics for predictions
576
  """
577
  pred_ids = pred.predictions
578
+ # Decode predictions, skipping special tokens
579
  pred_str = main_processor.batch_decode(pred_ids, skip_special_tokens=True)
580
 
581
  label_ids = pred.label_ids
582
+ # Replace -100 with pad token ID for decoding
583
  label_ids[label_ids == -100] = main_processor.tokenizer.pad_token_id
584
+ # Decode reference texts, also skipping special tokens
585
  ref_str = main_processor.batch_decode(label_ids, skip_special_tokens=True)
586
 
587
  # lowercase & strip
 
590
 
591
  wer_score = wer_metric.compute(predictions=pred_str, references=ref_str)
592
  cer_score = cer_metric.compute(predictions=pred_str, references=ref_str)
593
+
594
+ # Combine metrics
595
+ metrics = {"wer": wer_score, "cer": cer_score}
596
+
597
+ # Log example predictions
598
+ if len(pred_str) > 0:
599
+ num_examples = min(3, len(pred_str))
600
+ for i in range(num_examples):
601
+ print(f"Example {i}:")
602
+ print(f" Reference: {ref_str[i]}")
603
+ print(f" Prediction: {pred_str[i]}")
604
+
605
+ return metrics
606
 
607
  # Check for multi-GPU setup
608
  num_gpus = torch.cuda.device_count()
 
649
  metric_for_best_model=training_config['metric_for_best_model'],
650
  greater_is_better=training_config['greater_is_better'],
651
  push_to_hub=training_config['push_to_hub'],
652
+ hub_private_repo=training_config['hub_private_repo'], # Whether the Hub repo is private (false here, so the Khmer model is public)
653
  save_total_limit=training_config['save_total_limit'],
654
  # Multi-GPU specific settings
655
  dataloader_drop_last=training_config['dataloader_drop_last'],
 
675
 
676
  results = {}
677
 
678
+ # Define language-specific generation parameters
679
+ lang_id_map = {
680
+ "english": 50259, # <|en|>
681
+ "chinese": 50260, # <|zh|>
682
+ "indonesian": 50275, # <|id|>
683
+ "malay": 50282, # <|ms|>
684
+ "khmer": 50323, # <|km|>
685
+ "cebuano": 50348, # <|tl|> (using Tagalog as fallback for Cebuano)
686
+ }
687
+
688
  for lang in enabled_languages:
689
  if lang in processed_datasets:
690
  lang_results = {}
691
 
692
+ # Set language-specific generation parameters
693
+ lang_token_id = lang_id_map.get(lang)
694
+ task_token_id = 50360 # transcribe task
695
+
696
+ # Define forced decoder IDs for generation if language is supported
697
+ forced_decoder_ids = None
698
+ if lang_token_id:
699
+ forced_decoder_ids = [[1, lang_token_id], [2, task_token_id]]
700
+ print(f"Using forced_decoder_ids for {lang}: {forced_decoder_ids}")
701
+
702
  if lang == "chinese":
703
  # Chinese has multiple test splits
704
  if "test_net" in processed_datasets[lang]:
705
  print(f"\n***** Evaluating on WenetSpeech Chinese TEST_NET *****")
706
+ chi_testnet_metrics = trainer.predict(
707
+ processed_datasets[lang]["test_net"],
708
+ metric_key_prefix=f"test_{lang}_net",
709
+ forced_decoder_ids=forced_decoder_ids
710
+ )
711
  print(f"Chinese TEST_NET WER: {chi_testnet_metrics.metrics[f'test_{lang}_net_wer']*100:.2f}%")
712
  print(f"Chinese TEST_NET CER: {chi_testnet_metrics.metrics[f'test_{lang}_net_cer']*100:.2f}%")
713
  lang_results["test_net"] = chi_testnet_metrics.metrics
714
 
715
  if "test_meeting" in processed_datasets[lang]:
716
  print(f"\n***** Evaluating on WenetSpeech Chinese TEST_MEETING *****")
717
+ chi_testmeet_metrics = trainer.predict(
718
+ processed_datasets[lang]["test_meeting"],
719
+ metric_key_prefix=f"test_{lang}_meeting",
720
+ forced_decoder_ids=forced_decoder_ids
721
+ )
722
  print(f"Chinese TEST_MEETING WER: {chi_testmeet_metrics.metrics[f'test_{lang}_meeting_wer']*100:.2f}%")
723
  print(f"Chinese TEST_MEETING CER: {chi_testmeet_metrics.metrics[f'test_{lang}_meeting_cer']*100:.2f}%")
724
  lang_results["test_meeting"] = chi_testmeet_metrics.metrics
 
726
  # Standard test split
727
  if "test" in processed_datasets[lang]:
728
  print(f"\n***** Evaluating on {lang.title()} test set *****")
729
+ test_metrics = trainer.predict(
730
+ processed_datasets[lang]["test"],
731
+ metric_key_prefix=f"test_{lang}",
732
+ forced_decoder_ids=forced_decoder_ids
733
+ )
734
  print(f"{lang.title()} Test WER: {test_metrics.metrics[f'test_{lang}_wer']*100:.2f}%")
735
  print(f"{lang.title()} Test CER: {test_metrics.metrics[f'test_{lang}_cer']*100:.2f}%")
736
  lang_results["test"] = test_metrics.metrics
 
770
  trainer.train()
771
 
772
  # Evaluate on all test sets
773
+ # evaluate_on_test_sets()
774
 
775
 
776
 
inference.py CHANGED
@@ -22,14 +22,14 @@ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
22
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
24
 
25
- model_id = "./whisper-fleurs-km_kh-small"
26
 
27
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
28
  model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
29
  )
30
  model.to(device)
31
  whisper_model = "openai/whisper-large-v3"
32
- processor = WhisperProcessor.from_pretrained(whisper_model, language="khmer")
33
 
34
  asr = pipeline(
35
  "automatic-speech-recognition",
@@ -44,7 +44,6 @@ asr = pipeline(
44
  num_beams=1, # Greedy decoding; raise num_beams to enable beam search
45
  do_sample=False, # Disable sampling for deterministic output
46
  early_stopping=False, # Early stopping only applies when beam search is used
47
- suppress_tokens=[],
48
  )
49
 
50
 
@@ -52,7 +51,7 @@ asr = pipeline(
52
  def transcribe_batch(batch):
53
  # `batch["audio"]` is a list of {"array": np.ndarray, ...}
54
  inputs = [ ex["array"] for ex in batch["audio"] ]
55
- outputs = asr(inputs) # returns a list of dicts with "text"
56
  # lower-case and strip to normalize for CER
57
  preds = [ out["text"].lower().strip() for out in outputs ]
58
  return {"prediction": preds}
 
22
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
24
 
25
+ model_id = "./ft-lid-whisper-fleurs-km_kh-small"
26
 
27
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
28
  model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
29
  )
30
  model.to(device)
31
  whisper_model = "openai/whisper-large-v3"
32
+ processor = WhisperProcessor.from_pretrained(whisper_model)
33
 
34
  asr = pipeline(
35
  "automatic-speech-recognition",
 
44
  num_beams=1, # Greedy decoding; raise num_beams to enable beam search
45
  do_sample=False, # Disable sampling for deterministic output
46
  early_stopping=False, # Early stopping only applies when beam search is used
 
47
  )
48
 
49
 
 
51
  def transcribe_batch(batch):
52
  # `batch["audio"]` is a list of {"array": np.ndarray, ...}
53
  inputs = [ ex["array"] for ex in batch["audio"] ]
54
+ outputs = asr(inputs, generate_kwargs={"language": "khmer"}) # returns a list of dicts with "text"
55
  # lower-case and strip to normalize for CER
56
  preds = [ out["text"].lower().strip() for out in outputs ]
57
  return {"prediction": preds}
inference/compute-wer.py ADDED
@@ -0,0 +1,565 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import re, sys, unicodedata
5
+ import codecs
6
+
7
+ remove_tag = True
8
+ spacelist = [' ', '\t', '\r', '\n']
9
+ puncts = [
10
+ '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
11
+ '《', '》', '(', ')', '(', ')', '[', ']', '【', '】', '{', '}', '〔', '〕',
12
+ '⟨', '⟩', '《', '》'
13
+ ]
14
+
15
+
16
+ def characterize(string):
17
+ res = []
18
+ i = 0
19
+ while i < len(string):
20
+ char = string[i]
21
+ if char in puncts:
22
+ i += 1
23
+ continue
24
+ cat1 = unicodedata.category(char)
25
+ #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
26
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
27
+ i += 1
28
+ continue
29
+ if cat1 == 'Lo': # letter-other
30
+ res.append(char)
31
+ i += 1
32
+ else:
33
+ # some input looks like: <unk><noise>, we want to separate it to two words.
34
+ sep = ' '
35
+ if char == '<': sep = '>'
36
+ j = i + 1
37
+ while j < len(string):
38
+ c = string[j]
39
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
40
+ break
41
+ j += 1
42
+ if j < len(string) and string[j] == '>':
43
+ j += 1
44
+ res.append(string[i:j])
45
+ i = j
46
+ return res
47
+
48
+
49
+ def stripoff_tags(x):
50
+ if not x: return ''
51
+ chars = []
52
+ i = 0
53
+ T = len(x)
54
+ while i < T:
55
+ if x[i] == '<':
56
+ while i < T and x[i] != '>':
57
+ i += 1
58
+ i += 1
59
+ else:
60
+ chars.append(x[i])
61
+ i += 1
62
+ return ''.join(chars)
63
+
64
+
65
+ def normalize(sentence, ignore_words, cs, split=None):
66
+ """ sentence, ignore_words are both in unicode
67
+ """
68
+ new_sentence = []
69
+ for token in sentence:
70
+ x = token
71
+ if not cs:
72
+ x = x.upper()
73
+ if x in ignore_words:
74
+ continue
75
+ if remove_tag:
76
+ x = stripoff_tags(x)
77
+ x = re.sub(r'[.,!?;:()\[\]{}<>""„""«»‹›\/\\|@#$%^&*_=+~`-]', '', x)
78
+ # Skip tokens containing any digits
79
+ if re.search(r'\d', x):
80
+ continue
81
+ if not x:
82
+ continue
83
+ if split and x in split:
84
+ new_sentence += split[x]
85
+ else:
86
+ new_sentence.append(x)
87
+ return new_sentence
88
+
89
+
90
+ class Calculator:
91
+
92
+ def __init__(self):
93
+ self.data = {}
94
+ self.space = []
95
+ self.cost = {}
96
+ self.cost['cor'] = 0
97
+ self.cost['sub'] = 1
98
+ self.cost['del'] = 1
99
+ self.cost['ins'] = 1
100
+
101
+ def calculate(self, lab, rec):
102
+ # Initialization
103
+ lab.insert(0, '')
104
+ rec.insert(0, '')
105
+ while len(self.space) < len(lab):
106
+ self.space.append([])
107
+ for row in self.space:
108
+ for element in row:
109
+ element['dist'] = 0
110
+ element['error'] = 'non'
111
+ while len(row) < len(rec):
112
+ row.append({'dist': 0, 'error': 'non'})
113
+ for i in range(len(lab)):
114
+ self.space[i][0]['dist'] = i
115
+ self.space[i][0]['error'] = 'del'
116
+ for j in range(len(rec)):
117
+ self.space[0][j]['dist'] = j
118
+ self.space[0][j]['error'] = 'ins'
119
+ self.space[0][0]['error'] = 'non'
120
+ for token in lab:
121
+ if token not in self.data and len(token) > 0:
122
+ self.data[token] = {
123
+ 'all': 0,
124
+ 'cor': 0,
125
+ 'sub': 0,
126
+ 'ins': 0,
127
+ 'del': 0
128
+ }
129
+ for token in rec:
130
+ if token not in self.data and len(token) > 0:
131
+ self.data[token] = {
132
+ 'all': 0,
133
+ 'cor': 0,
134
+ 'sub': 0,
135
+ 'ins': 0,
136
+ 'del': 0
137
+ }
138
+ # Computing edit distance
139
+ for i, lab_token in enumerate(lab):
140
+ for j, rec_token in enumerate(rec):
141
+ if i == 0 or j == 0:
142
+ continue
143
+ min_dist = sys.maxsize
144
+ min_error = 'none'
145
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
146
+ error = 'del'
147
+ if dist < min_dist:
148
+ min_dist = dist
149
+ min_error = error
150
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
151
+ error = 'ins'
152
+ if dist < min_dist:
153
+ min_dist = dist
154
+ min_error = error
155
+ if lab_token == rec_token:
156
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
157
+ error = 'cor'
158
+ else:
159
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
160
+ error = 'sub'
161
+ if dist < min_dist:
162
+ min_dist = dist
163
+ min_error = error
164
+ self.space[i][j]['dist'] = min_dist
165
+ self.space[i][j]['error'] = min_error
166
+ # Tracing back
167
+ result = {
168
+ 'lab': [],
169
+ 'rec': [],
170
+ 'all': 0,
171
+ 'cor': 0,
172
+ 'sub': 0,
173
+ 'ins': 0,
174
+ 'del': 0
175
+ }
176
+ i = len(lab) - 1
177
+ j = len(rec) - 1
178
+ while True:
179
+ if self.space[i][j]['error'] == 'cor': # correct
180
+ if len(lab[i]) > 0:
181
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
182
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
183
+ result['all'] = result['all'] + 1
184
+ result['cor'] = result['cor'] + 1
185
+ result['lab'].insert(0, lab[i])
186
+ result['rec'].insert(0, rec[j])
187
+ i = i - 1
188
+ j = j - 1
189
+ elif self.space[i][j]['error'] == 'sub': # substitution
190
+ if len(lab[i]) > 0:
191
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
192
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
193
+ result['all'] = result['all'] + 1
194
+ result['sub'] = result['sub'] + 1
195
+ result['lab'].insert(0, lab[i])
196
+ result['rec'].insert(0, rec[j])
197
+ i = i - 1
198
+ j = j - 1
199
+ elif self.space[i][j]['error'] == 'del': # deletion
200
+ if len(lab[i]) > 0:
201
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
202
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
203
+ result['all'] = result['all'] + 1
204
+ result['del'] = result['del'] + 1
205
+ result['lab'].insert(0, lab[i])
206
+ result['rec'].insert(0, "")
207
+ i = i - 1
208
+ elif self.space[i][j]['error'] == 'ins': # insertion
209
+ if len(rec[j]) > 0:
210
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
211
+ result['ins'] = result['ins'] + 1
212
+ result['lab'].insert(0, "")
213
+ result['rec'].insert(0, rec[j])
214
+ j = j - 1
215
+ elif self.space[i][j]['error'] == 'non': # starting point
216
+ break
217
+ else: # shouldn't reach here
218
+ print(
219
+ 'this should not happen , i = {i} , j = {j} , error = {error}'
220
+ .format(i=i, j=j, error=self.space[i][j]['error']))
221
+ return result
222
+
223
+ def overall(self):
224
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
225
+ for token in self.data:
226
+ result['all'] = result['all'] + self.data[token]['all']
227
+ result['cor'] = result['cor'] + self.data[token]['cor']
228
+ result['sub'] = result['sub'] + self.data[token]['sub']
229
+ result['ins'] = result['ins'] + self.data[token]['ins']
230
+ result['del'] = result['del'] + self.data[token]['del']
231
+ return result
232
+
233
+ def cluster(self, data):
234
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
235
+ for token in data:
236
+ if token in self.data:
237
+ result['all'] = result['all'] + self.data[token]['all']
238
+ result['cor'] = result['cor'] + self.data[token]['cor']
239
+ result['sub'] = result['sub'] + self.data[token]['sub']
240
+ result['ins'] = result['ins'] + self.data[token]['ins']
241
+ result['del'] = result['del'] + self.data[token]['del']
242
+ return result
243
+
244
+ def keys(self):
245
+ return list(self.data.keys())
246
+
247
+
248
+ def width(string):
249
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
250
+
251
+
252
+ def default_cluster(word):
253
+
254
+ # unicode_names = [unicodedata.name(char) for char in word]
255
+ unicode_names = []
256
+ for char in word:
257
+ try:
258
+ unicode_names.append(unicodedata.name(char))
259
+ except ValueError:
260
+ unicode_names.append("UNK")
261
+ for i in reversed(range(len(unicode_names))):
262
+ if unicode_names[i].startswith('DIGIT'): # 1
263
+ unicode_names[i] = 'Number' # 'DIGIT'
264
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH')
265
+ or unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
266
+ # 明 / 郎
267
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
268
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER')
269
+ or unicode_names[i].startswith('LATIN SMALL LETTER')):
270
+ # A / a
271
+ unicode_names[i] = 'English' # 'LATIN LETTER'
272
+ elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
273
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
274
+ elif (unicode_names[i].startswith('AMPERSAND')
275
+ or unicode_names[i].startswith('APOSTROPHE')
276
+ or unicode_names[i].startswith('COMMERCIAL AT')
277
+ or unicode_names[i].startswith('DEGREE CELSIUS')
278
+ or unicode_names[i].startswith('EQUALS SIGN')
279
+ or unicode_names[i].startswith('FULL STOP')
280
+ or unicode_names[i].startswith('HYPHEN-MINUS')
281
+ or unicode_names[i].startswith('LOW LINE')
282
+ or unicode_names[i].startswith('NUMBER SIGN')
283
+ or unicode_names[i].startswith('PLUS SIGN')
284
+ or unicode_names[i].startswith('SEMICOLON')):
285
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
286
+ del unicode_names[i]
287
+ else:
288
+ return 'Other'
289
+ if len(unicode_names) == 0:
290
+ return 'Other'
291
+ if len(unicode_names) == 1:
292
+ return unicode_names[0]
293
+ for i in range(len(unicode_names) - 1):
294
+ if unicode_names[i] != unicode_names[i + 1]:
295
+ return 'Other'
296
+ return unicode_names[0]
297
+
298
+
299
+ def usage():
300
+ print(
301
+ "compute-wer.py : compute word error rate (WER) and align recognition results and references."
302
+ )
303
+ print(
304
+ " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
305
+ )
306
+
307
+
308
+ if __name__ == '__main__':
309
+ if len(sys.argv) == 1:
310
+ usage()
311
+ sys.exit(0)
312
+ calculator = Calculator()
313
+ cluster_file = ''
314
+ ignore_words = set()
315
+ tochar = False
316
+ verbose = 1
317
+ padding_symbol = ' '
318
+ case_sensitive = False
319
+ max_words_per_line = sys.maxsize
320
+ split = None
321
+ while len(sys.argv) > 3:
322
+ a = '--maxw='
323
+ if sys.argv[1].startswith(a):
324
+ b = sys.argv[1][len(a):]
325
+ del sys.argv[1]
326
+ max_words_per_line = int(b)
327
+ continue
328
+ a = '--rt='
329
+ if sys.argv[1].startswith(a):
330
+ b = sys.argv[1][len(a):].lower()
331
+ del sys.argv[1]
332
+ remove_tag = (b == 'true') or (b != '0')
333
+ continue
334
+ a = '--cs='
335
+ if sys.argv[1].startswith(a):
336
+ b = sys.argv[1][len(a):].lower()
337
+ del sys.argv[1]
338
+ case_sensitive = (b == 'true') or (b != '0')
339
+ continue
340
+ a = '--cluster='
341
+ if sys.argv[1].startswith(a):
342
+ cluster_file = sys.argv[1][len(a):]
343
+ del sys.argv[1]
344
+ continue
345
+ a = '--splitfile='
346
+ if sys.argv[1].startswith(a):
347
+ split_file = sys.argv[1][len(a):]
348
+ del sys.argv[1]
349
+ split = dict()
350
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
351
+ for line in fh: # line in unicode
352
+ words = line.strip().split()
353
+ if len(words) >= 2:
354
+ split[words[0]] = words[1:]
355
+ continue
356
+ a = '--ig='
357
+ if sys.argv[1].startswith(a):
358
+ ignore_file = sys.argv[1][len(a):]
359
+ del sys.argv[1]
360
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
361
+ for line in fh: # line in unicode
362
+ line = line.strip()
363
+ if len(line) > 0:
364
+ ignore_words.add(line)
365
+ continue
366
+ a = '--char='
367
+ if sys.argv[1].startswith(a):
368
+ b = sys.argv[1][len(a):].lower()
369
+ del sys.argv[1]
370
+ tochar = (b == 'true') or (b != '0')
371
+ continue
372
+ a = '--v='
373
+ if sys.argv[1].startswith(a):
374
+ b = sys.argv[1][len(a):].lower()
375
+ del sys.argv[1]
376
+ verbose = 0
377
+ try:
378
+ verbose = int(b)
379
+ except:
380
+ if b == 'true' or b != '0':
381
+ verbose = 1
382
+ continue
383
+ a = '--padding-symbol='
384
+ if sys.argv[1].startswith(a):
385
+ b = sys.argv[1][len(a):].lower()
386
+ del sys.argv[1]
387
+ if b == 'space':
388
+ padding_symbol = ' '
389
+ elif b == 'underline':
390
+ padding_symbol = '_'
391
+ continue
392
+ if True or sys.argv[1].startswith('-'):
393
+ #ignore invalid switch
394
+ del sys.argv[1]
395
+ continue
396
+
397
+ if not case_sensitive:
398
+ ig = set([w.upper() for w in ignore_words])
399
+ ignore_words = ig
400
+
401
+ default_clusters = {}
402
+ default_words = {}
403
+
404
+ ref_file = sys.argv[1]
405
+ hyp_file = sys.argv[2]
406
+ rec_set = {}
407
+ if split and not case_sensitive:
408
+ newsplit = dict()
409
+ for w in split:
410
+ words = split[w]
411
+ for i in range(len(words)):
412
+ words[i] = words[i].upper()
413
+ newsplit[w.upper()] = words
414
+ split = newsplit
415
+
416
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
417
+ for line in fh:
418
+ if tochar:
419
+ array = characterize(line)
420
+ else:
421
+ array = line.strip().split()
422
+ if len(array) == 0: continue
423
+ fid = array[0]
424
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
425
+ split)
426
+
427
+ # compute error rate on the interaction of reference file and hyp file
428
+ for line in open(ref_file, 'r', encoding='utf-8'):
429
+ if tochar:
430
+ array = characterize(line)
431
+ else:
432
+ array = line.rstrip('\n').split()
433
+ if len(array) == 0: continue
434
+ fid = array[0]
435
+ if fid not in rec_set:
436
+ continue
437
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
438
+ rec = rec_set[fid]
439
+ if verbose:
440
+ print('\nutt: %s' % fid)
441
+
442
+ for word in rec + lab:
443
+ if word not in default_words:
444
+ default_cluster_name = default_cluster(word)
445
+ if default_cluster_name not in default_clusters:
446
+ default_clusters[default_cluster_name] = {}
447
+ if word not in default_clusters[default_cluster_name]:
448
+ default_clusters[default_cluster_name][word] = 1
449
+ default_words[word] = default_cluster_name
450
+
451
+ result = calculator.calculate(lab, rec)
452
+ if verbose:
453
+ if result['all'] != 0:
454
+ wer = float(result['ins'] + result['sub'] +
455
+ result['del']) * 100.0 / result['all']
456
+ else:
457
+ wer = 0.0
458
+ print('WER: %4.2f %%' % wer, end=' ')
459
+ print('N=%d C=%d S=%d D=%d I=%d' %
460
+ (result['all'], result['cor'], result['sub'], result['del'],
461
+ result['ins']))
462
+ space = {}
463
+ space['lab'] = []
464
+ space['rec'] = []
465
+ for idx in range(len(result['lab'])):
466
+ len_lab = width(result['lab'][idx])
467
+ len_rec = width(result['rec'][idx])
468
+ length = max(len_lab, len_rec)
469
+ space['lab'].append(length - len_lab)
470
+ space['rec'].append(length - len_rec)
471
+ upper_lab = len(result['lab'])
472
+ upper_rec = len(result['rec'])
473
+ lab1, rec1 = 0, 0
474
+ while lab1 < upper_lab or rec1 < upper_rec:
475
+ if verbose > 1:
476
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
477
+ else:
478
+ print('lab:', end=' ')
479
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
480
+ for idx in range(lab1, lab2):
481
+ token = result['lab'][idx]
482
+ print('{token}'.format(token=token), end='')
483
+ for n in range(space['lab'][idx]):
484
+ print(padding_symbol, end='')
485
+ print(' ', end='')
486
+ print()
487
+ if verbose > 1:
488
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
489
+ else:
490
+ print('rec:', end=' ')
491
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
492
+ for idx in range(rec1, rec2):
493
+ token = result['rec'][idx]
494
+ print('{token}'.format(token=token), end='')
495
+ for n in range(space['rec'][idx]):
496
+ print(padding_symbol, end='')
497
+ print(' ', end='')
498
+ print('\n', end='\n')
499
+ lab1 = lab2
500
+ rec1 = rec2
501
+
502
+ if verbose:
503
+ print(
504
+ '==========================================================================='
505
+ )
506
+ print()
507
+
508
+ result = calculator.overall()
509
+ if result['all'] != 0:
510
+ wer = float(result['ins'] + result['sub'] +
511
+ result['del']) * 100.0 / result['all']
512
+ else:
513
+ wer = 0.0
514
+ print('Overall -> %4.2f %%' % wer, end=' ')
515
+ print('N=%d C=%d S=%d D=%d I=%d' %
516
+ (result['all'], result['cor'], result['sub'], result['del'],
517
+ result['ins']))
518
+ if not verbose:
519
+ print()
520
+
521
+ if verbose:
522
+ for cluster_id in default_clusters:
523
+ result = calculator.cluster(
524
+ [k for k in default_clusters[cluster_id]])
525
+ if result['all'] != 0:
526
+ wer = float(result['ins'] + result['sub'] +
527
+ result['del']) * 100.0 / result['all']
528
+ else:
529
+ wer = 0.0
530
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
531
+ print('N=%d C=%d S=%d D=%d I=%d' %
532
+ (result['all'], result['cor'], result['sub'], result['del'],
533
+ result['ins']))
534
+ if len(cluster_file) > 0: # compute separated WERs for word clusters
535
+ cluster_id = ''
536
+ cluster = []
537
+ for line in open(cluster_file, 'r', encoding='utf-8'):
538
+ for token in line.rstrip('\n').split():  # file is opened in text mode, so no decode is needed
539
+ # end of cluster reached, like </Keyword>
540
+ if token[0:2] == '</' and token[len(token)-1] == '>' and \
541
+ token.lstrip('</').rstrip('>') == cluster_id :
542
+ result = calculator.cluster(cluster)
543
+ if result['all'] != 0:
544
+ wer = float(result['ins'] + result['sub'] +
545
+ result['del']) * 100.0 / result['all']
546
+ else:
547
+ wer = 0.0
548
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
549
+ print('N=%d C=%d S=%d D=%d I=%d' %
550
+ (result['all'], result['cor'], result['sub'],
551
+ result['del'], result['ins']))
552
+ cluster_id = ''
553
+ cluster = []
554
+ # begin of cluster reached, like <Keyword>
555
+ elif token[0] == '<' and token[len(token)-1] == '>' and \
556
+ cluster_id == '' :
557
+ cluster_id = token.lstrip('<').rstrip('>')
558
+ cluster = []
559
+ # general terms, like WEATHER / CAR / ...
560
+ else:
561
+ cluster.append(token)
562
+ print()
563
+ print(
564
+ '==========================================================================='
565
+ )
inference/inference-finetune-lid.py ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python
2
+
3
+ # pip install transformers datasets torch soundfile jiwer
4
+
5
+ from datasets import load_dataset, Audio
6
+ from transformers import pipeline, WhisperProcessor
7
+ from torch.utils.data import DataLoader
8
+ import torch
9
+ from jiwer import wer as jiwer_wer
10
+ from jiwer import cer as jiwer_cer
11
+ import ipdb
12
+ import subprocess
13
+ import os
14
+
15
+ # 1. Load the FLEURS Khmer (km_kh) test set, cast to 16 kHz audio
16
+ ds = load_dataset("google/fleurs", "km_kh", split="test", trust_remote_code=True)
17
+ ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
18
+
19
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
20
+
21
+
22
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
24
+
25
+ model_id = "pengyizhou/whisper-fleurs-km_kh-small"
26
+
27
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
28
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
29
+ )
30
+ model.to(device)
31
+ whisper_model = "openai/whisper-large-v3"
32
+ processor = WhisperProcessor.from_pretrained(whisper_model)
33
+
34
+ asr = pipeline(
35
+ "automatic-speech-recognition",
36
+ model=model,
37
+ tokenizer=processor.tokenizer,
38
+ feature_extractor=processor.feature_extractor,
39
+ torch_dtype=torch_dtype,
40
+ chunk_length_s=30,
41
+ batch_size=64,
42
+ max_new_tokens=225,
43
+ device=device,
44
+ num_beams=1, # Greedy decoding; raise num_beams to enable beam search
45
+ )
46
+
47
+ generate_kwargs = {
48
+ "condition_on_prev_tokens": False,
49
+ "compression_ratio_threshold": 1.35, # zlib compression ratio threshold (in token space)
50
+ "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
51
+ "logprob_threshold": -1.0,
52
+ "language": "khmer", # Specify the language for transcription
53
+ }
54
+
55
+
56
+ # 3. Batch‐wise transcription function
57
+ def transcribe_batch(batch):
58
+ # `batch["audio"]` is a list of {"array": np.ndarray, ...}
59
+ inputs = [ ex["array"] for ex in batch["audio"] ]
60
+ outputs = asr(inputs, generate_kwargs=generate_kwargs) # returns a list of dicts with "text"
61
+ # lower-case and strip to normalize for CER
62
+ preds = [ out["text"].lower().strip() for out in outputs ]
63
+ return {"prediction": preds}
64
+
65
+ # 4. Map over the dataset in batches of 64 examples at a time
66
+ result = ds.map(
67
+ transcribe_batch,
68
+ batched=True,
69
+ batch_size=64, # feed 64 audios per map batch; the ASR pipeline sub-batches internally
70
+ remove_columns=ds.column_names
71
+ )
72
+
73
+ # ipdb.set_trace()
74
+ # 5. Compute corpus-level CER with jiwer
75
+ # refs = "\n".join(t.lower().strip() for t in ds["transcription"])
76
+ # preds = "\n".join(t for t in result["prediction"])
77
+ # score = jiwer_cer(refs, preds)
78
+ ids = [key for key in ds["id"]]
79
+ refs = [t.lower().strip() for t in ds["transcription"]]
80
+ preds = [t for t in result["prediction"]]
81
+ score_cer = jiwer_cer(refs, preds)
82
+ score_wer = jiwer_wer(refs, preds)
83
+
84
+ print(f"CER on FLEURS km_kh: {score_cer*100:.2f}%")
85
+ print(f"WER on FLEURS km_kh: {score_wer*100:.2f}%")
86
+
87
+ # Function to add spaces between characters for CER calculation
88
+ def add_char_spaces(text):
89
+ """Add spaces between each character for character-level evaluation"""
90
+ return ' '.join(list(text.strip()))
91
+
92
+ with open("./km_kh_finetune.pred", "w") as pred_results:
93
+ for key, pred in zip(ids, preds):
94
+ pred_with_spaces = add_char_spaces(pred)
95
+ pred_results.write("{} {}\n".format(key, pred_with_spaces))
96
+
97
+ with open("./km_kh.ref", "w") as ref_results:
98
+ for key, ref in zip(ids, refs):
99
+ ref_with_spaces = add_char_spaces(ref)
100
+ ref_results.write("{} {}\n".format(key, ref_with_spaces))
101
+
102
+ # Generate WER file using compute-wer.py
103
+ print("Generating detailed WER analysis...")
104
+
105
+ # Check if compute-wer.py exists
106
+ compute_wer_script = "./compute-wer.py"
107
+ if not os.path.exists(compute_wer_script):
108
+ # Try to find it in parent directories or common locations
109
+ possible_locations = [
110
+ "./compute-wer.py",
111
+ ]
112
+ for location in possible_locations:
113
+ if os.path.exists(location):
114
+ compute_wer_script = location
115
+ break
116
+ else:
117
+ print(f"Warning: compute-wer.py not found. Tried: {[compute_wer_script] + possible_locations}")
118
+ print("Skipping detailed WER analysis.")
119
+ compute_wer_script = None
120
+
121
+ if compute_wer_script:
122
+ try:
123
+ # Run compute-wer.py with character-level analysis
124
+ ref_file = "./km_kh.ref"
125
+ hyp_file = "./km_kh_finetune.pred"
126
+ wer_file = "./km_kh_finetune.wer"
127
+
128
+ cmd = [
129
+ "python", compute_wer_script,
130
+ "--char=1", # Character-level analysis
131
+ "--v=1", # Verbose output
132
+ ref_file,
133
+ hyp_file
134
+ ]
135
+
136
+ print(f"Running: {' '.join(cmd)} > {wer_file}")
137
+
138
+ # Run the command and redirect output to wer file
139
+ with open(wer_file, "w") as wer_output:
140
+ result = subprocess.run(
141
+ cmd,
142
+ stdout=wer_output,
143
+ stderr=subprocess.PIPE,
144
+ text=True,
145
+ check=True
146
+ )
147
+
148
+ print(f"CER analysis saved to {wer_file}")
149
+
150
+ # Optionally, print the first few lines of the WER file
151
+ if os.path.exists(wer_file):
152
+ print("\nFirst few lines of WER analysis:")
153
+ with open(wer_file, "r") as f:
154
+ lines = f.readlines()
155
+ for i, line in enumerate(lines[:10]): # Show first 10 lines
156
+ print(f" {line.rstrip()}")
157
+ if len(lines) > 10:
158
+ print(f" ... ({len(lines) - 10} more lines)")
159
+
160
+ except subprocess.CalledProcessError as e:
161
+ print(f"Error running compute-wer.py: {e}")
162
+ if e.stderr:
163
+ print(f"Error details: {e.stderr}")
164
+ except Exception as e:
165
+ print(f"Unexpected error: {e}")
166
+
167
+ print("Inference and CER analysis completed!")
168
+
inference/inference-finetune-nolid.py ADDED
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env python
2
+
3
+ # pip install transformers datasets torch soundfile jiwer
4
+
5
+ from datasets import load_dataset, Audio
6
+ from transformers import pipeline, WhisperProcessor
7
+ from torch.utils.data import DataLoader
8
+ import torch
9
+ from jiwer import wer as jiwer_wer
10
+ from jiwer import cer as jiwer_cer
11
+ import ipdb
12
+ import subprocess
13
+ import os
14
+
15
+ # 1. Load the FLEURS Khmer (km_kh) test set, cast to 16 kHz audio
16
+ ds = load_dataset("google/fleurs", "km_kh", split="test", trust_remote_code=True)
17
+ ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
18
+
19
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
20
+
21
+
22
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
24
+
25
+ model_id = "pengyizhou/whisper-fleurs-km_kh-small"
26
+
27
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
28
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
29
+ )
30
+ model.to(device)
31
+ whisper_model = "openai/whisper-large-v3"
32
+ processor = WhisperProcessor.from_pretrained(whisper_model)
33
+
34
+ asr = pipeline(
35
+ "automatic-speech-recognition",
36
+ model=model,
37
+ tokenizer=processor.tokenizer,
38
+ feature_extractor=processor.feature_extractor,
39
+ torch_dtype=torch_dtype,
40
+ chunk_length_s=30,
41
+ batch_size=64,
42
+ max_new_tokens=225,
43
+ device=device,
44
+ num_beams=1, # Greedy decoding; raise num_beams to enable beam search
45
+ )
46
+
47
+ generate_kwargs = {
48
+ "condition_on_prev_tokens": False,
49
+ "compression_ratio_threshold": 1.35, # zlib compression ratio threshold (in token space)
50
+ "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
51
+ "logprob_threshold": -1.0,
52
+ }
53
+
54
+
55
+ # 3. Batch‐wise transcription function
56
+ def transcribe_batch(batch):
57
+ # `batch["audio"]` is a list of {"array": np.ndarray, ...}
58
+ inputs = [ ex["array"] for ex in batch["audio"] ]
59
+ outputs = asr(inputs, generate_kwargs=generate_kwargs) # returns a list of dicts with "text"
60
+ # lower-case and strip to normalize for CER
61
+ preds = [ out["text"].lower().strip() for out in outputs ]
62
+ return {"prediction": preds}
63
+
64
+ # 4. Map over the dataset in batches of 64 examples at a time
65
+ result = ds.map(
66
+ transcribe_batch,
67
+ batched=True,
68
+ batch_size=64, # feed 64 audios per map batch; the ASR pipeline sub-batches internally
69
+ remove_columns=ds.column_names
70
+ )
71
+
72
+ # ipdb.set_trace()
73
+ # 5. Compute corpus-level CER with jiwer
74
+ # refs = "\n".join(t.lower().strip() for t in ds["transcription"])
75
+ # preds = "\n".join(t for t in result["prediction"])
76
+ # score = jiwer_cer(refs, preds)
77
+ ids = [key for key in ds["id"]]
78
+ refs = [t.lower().strip() for t in ds["transcription"]]
79
+ preds = [t for t in result["prediction"]]
80
+ score_cer = jiwer_cer(refs, preds)
81
+ score_wer = jiwer_wer(refs, preds)
82
+
83
+ print(f"CER on FLEURS km_kh: {score_cer*100:.2f}%")
84
+ print(f"WER on FLEURS km_kh: {score_wer*100:.2f}%")
85
+
86
+ # Function to add spaces between characters for CER calculation
87
+ def add_char_spaces(text):
88
+ """Add spaces between each character for character-level evaluation"""
89
+ return ' '.join(list(text.strip()))
90
+
91
+ with open("./km_kh_finetune_nolid.pred", "w") as pred_results:
92
+ for key, pred in zip(ids, preds):
93
+ pred_with_spaces = add_char_spaces(pred)
94
+ pred_results.write("{} {}\n".format(key, pred_with_spaces))
95
+
96
+ with open("./km_kh.ref", "w") as ref_results:
97
+ for key, ref in zip(ids, refs):
98
+ ref_with_spaces = add_char_spaces(ref)
99
+ ref_results.write("{} {}\n".format(key, ref_with_spaces))
100
+
101
+ # Generate WER file using compute-wer.py
102
+ print("Generating detailed WER analysis...")
103
+
104
+ # Check if compute-wer.py exists
105
+ compute_wer_script = "./compute-wer.py"
106
+ if not os.path.exists(compute_wer_script):
107
+ # Try to find it in parent directories or common locations
108
+ possible_locations = [
109
+ "./compute-wer.py",
110
+ ]
111
+ for location in possible_locations:
112
+ if os.path.exists(location):
113
+ compute_wer_script = location
114
+ break
115
+ else:
116
+ print(f"Warning: compute-wer.py not found. Tried: {[compute_wer_script] + possible_locations}")
117
+ print("Skipping detailed WER analysis.")
118
+ compute_wer_script = None
119
+
120
+ if compute_wer_script:
121
+ try:
122
+ # Run compute-wer.py with character-level analysis
123
+ ref_file = "./km_kh.ref"
124
+ hyp_file = "./km_kh_finetune_nolid.pred"
125
+ wer_file = "./km_kh_finetune_nolid.wer"
126
+
127
+ cmd = [
128
+ "python", compute_wer_script,
129
+ "--char=1", # Character-level analysis
130
+ "--v=1", # Verbose output
131
+ ref_file,
132
+ hyp_file
133
+ ]
134
+
135
+ print(f"Running: {' '.join(cmd)} > {wer_file}")
136
+
137
+ # Run the command and redirect output to wer file
138
+ with open(wer_file, "w") as wer_output:
139
+ result = subprocess.run(
140
+ cmd,
141
+ stdout=wer_output,
142
+ stderr=subprocess.PIPE,
143
+ text=True,
144
+ check=True
145
+ )
146
+
147
+ print(f"CER analysis saved to {wer_file}")
148
+
149
+ # Optionally, print the first few lines of the WER file
150
+ if os.path.exists(wer_file):
151
+ print("\nFirst few lines of WER analysis:")
152
+ with open(wer_file, "r") as f:
153
+ lines = f.readlines()
154
+ for i, line in enumerate(lines[:10]): # Show first 10 lines
155
+ print(f" {line.rstrip()}")
156
+ if len(lines) > 10:
157
+ print(f" ... ({len(lines) - 10} more lines)")
158
+
159
+ except subprocess.CalledProcessError as e:
160
+ print(f"Error running compute-wer.py: {e}")
161
+ if e.stderr:
162
+ print(f"Error details: {e.stderr}")
163
+ except Exception as e:
164
+ print(f"Unexpected error: {e}")
165
+
166
+ print("Inference and CER analysis completed!")
167
+
inference/inference-ft.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+
3
+ python ./inference-finetune-lid.py
4
+ python ./inference-finetune-nolid.py
inference/inference-zeroshot-nolid.py ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env python
2
+
3
+ # pip install transformers datasets torch soundfile jiwer
4
+
5
+ from datasets import load_dataset, Audio
6
+ from transformers import pipeline, WhisperProcessor
7
+ from torch.utils.data import DataLoader
8
+ import torch
9
+ from jiwer import wer as jiwer_wer
10
+ from jiwer import cer as jiwer_cer
11
+ import ipdb
12
+ import subprocess
13
+ import os
14
+
15
+ # 1. Load the FLEURS Khmer (km_kh) test set, cast to 16 kHz audio
16
+ ds = load_dataset("google/fleurs", "km_kh", split="test")
17
+ ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
18
+
19
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
20
+
21
+
22
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
23
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
24
+
25
+ model_id = "openai/whisper-large-v3"
26
+
27
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
28
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
29
+ )
30
+ model.to(device)
31
+ whisper_model = "openai/whisper-large-v3"
32
+ processor = WhisperProcessor.from_pretrained(whisper_model, language="khmer")
33
+
34
+ asr = pipeline(
35
+ "automatic-speech-recognition",
36
+ model=model,
37
+ tokenizer=processor.tokenizer,
38
+ feature_extractor=processor.feature_extractor,
39
+ torch_dtype=torch_dtype,
40
+ chunk_length_s=30,
41
+ batch_size=64,
42
+ max_new_tokens=225,
43
+ device=device,
44
+ num_beams=1, # Greedy decoding; raise num_beams to enable beam search
45
+ )
46
+ generate_kwargs = {
47
+ "condition_on_prev_tokens": False,
48
+ "compression_ratio_threshold": 1.35, # zlib compression ratio threshold (in token space)
49
+ "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
50
+ "logprob_threshold": -1.0,
51
+ }
52
+
53
+ # 3. Batch‐wise transcription function
54
+ def transcribe_batch(batch):
55
+ # `batch["audio"]` is a list of {"array": np.ndarray, ...}
56
+ inputs = [ ex["array"] for ex in batch["audio"] ]
57
+ outputs = asr(inputs, generate_kwargs=generate_kwargs) # returns a list of dicts with "text"
58
+ # lower-case and strip to normalize for CER
59
+ preds = [ out["text"].lower().strip() for out in outputs ]
60
+ return {"prediction": preds}
61
+
62
+ # 4. Map over the dataset in batches of 64 examples at a time
63
+ result = ds.map(
64
+ transcribe_batch,
65
+ batched=True,
66
+ batch_size=64, # feed 64 audios per map batch; the ASR pipeline sub-batches internally
67
+ remove_columns=ds.column_names
68
+ )
69
+
70
+ # ipdb.set_trace()
71
+ # 5. Compute corpus-level CER with jiwer
72
+ # refs = "\n".join(t.lower().strip() for t in ds["transcription"])
73
+ # preds = "\n".join(t for t in result["prediction"])
74
+ # score = jiwer_cer(refs, preds)
75
+ ids = [key for key in ds["id"]]
76
+ refs = [t.lower().strip() for t in ds["transcription"]]
77
+ preds = [t for t in result["prediction"]]
78
+ score_cer = jiwer_cer(refs, preds)
79
+ score_wer = jiwer_wer(refs, preds)
80
+
81
+ print(f"CER on FLEURS km_kh: {score_cer*100:.2f}%")
82
+ print(f"WER on FLEURS km_kh: {score_wer*100:.2f}%")
83
+ # Function to add spaces between characters for CER calculation
84
+ def add_char_spaces(text):
85
+ """Add spaces between each character for character-level evaluation"""
86
+ return ' '.join(list(text.strip()))
87
+
88
+ with open("./km_kh_zs_nolid.pred", "w") as pred_results:
89
+ for key, pred in zip(ids, preds):
90
+ pred_with_spaces = add_char_spaces(pred)
91
+ pred_results.write("{} {}\n".format(key, pred_with_spaces))
92
+
93
+ with open("./km_kh.ref", "w") as ref_results:
94
+ for key, ref in zip(ids, refs):
95
+ ref_with_spaces = add_char_spaces(ref)
96
+ ref_results.write("{} {}\n".format(key, ref_with_spaces))
97
+
98
+ # Generate WER file using compute-wer.py
99
+ print("Generating detailed WER analysis...")
100
+
101
+ # Check if compute-wer.py exists
102
+ compute_wer_script = "./compute-wer.py"
103
+ if not os.path.exists(compute_wer_script):
104
+ # Try to find it in parent directories or common locations
105
+ possible_locations = [
106
+ "./compute-wer.py",
107
+ ]
108
+ for location in possible_locations:
109
+ if os.path.exists(location):
110
+ compute_wer_script = location
111
+ break
112
+ else:
113
+ print(f"Warning: compute-wer.py not found. Tried: {[compute_wer_script] + possible_locations}")
114
+ print("Skipping detailed WER analysis.")
115
+ compute_wer_script = None
116
+
117
+ if compute_wer_script:
+     try:
+         # Run compute-wer.py with character-level analysis
+         ref_file = "./km_kh.ref"
+         hyp_file = "./km_kh_zs_nolid.pred"
+         wer_file = "./km_kh_zs_nolid.wer"
+
+         cmd = [
+             "python", compute_wer_script,
+             "--char=1",  # Character-level analysis
+             "--v=1",     # Verbose output
+             ref_file,
+             hyp_file
+         ]
+
+         print(f"Running: {' '.join(cmd)} > {wer_file}")
+
+         # Run the command and redirect output to the wer file
+         with open(wer_file, "w") as wer_output:
+             result = subprocess.run(
+                 cmd,
+                 stdout=wer_output,
+                 stderr=subprocess.PIPE,
+                 text=True,
+                 check=True
+             )
+
+         print(f"CER analysis saved to {wer_file}")
+
+         # Optionally, print the first few lines of the WER file
+         if os.path.exists(wer_file):
+             print("\nFirst few lines of WER analysis:")
+             with open(wer_file, "r") as f:
+                 lines = f.readlines()
+                 for i, line in enumerate(lines[:10]):  # Show first 10 lines
+                     print(f"  {line.rstrip()}")
+                 if len(lines) > 10:
+                     print(f"  ... ({len(lines) - 10} more lines)")
+
+     except subprocess.CalledProcessError as e:
+         print(f"Error running compute-wer.py: {e}")
+         if e.stderr:
+             print(f"Error details: {e.stderr}")
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+
+ print("Inference and CER analysis completed!")
+
inference/inference-zeroshot.py ADDED
@@ -0,0 +1,165 @@
+ #!/usr/bin/env python
+
+ # pip install transformers datasets torch soundfile jiwer
+
+ from datasets import load_dataset, Audio
+ from transformers import pipeline, WhisperProcessor
+ from torch.utils.data import DataLoader
+ import torch
+ from jiwer import wer as jiwer_wer
+ from jiwer import cer as jiwer_cer
+ import ipdb
+ import subprocess
+ import os
+
+ # 1. Load the FLEURS Khmer (km_kh) test set, cast to 16 kHz audio
+ ds = load_dataset("google/fleurs", "km_kh", split="test")
+ ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
+
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+
+
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-large-v3"
+
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model.to(device)
+ whisper_model = "openai/whisper-large-v3"
+ processor = WhisperProcessor.from_pretrained(whisper_model, language="khmer")
+
+ asr = pipeline(
+     "automatic-speech-recognition",
+     model=model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch_dtype,
+     chunk_length_s=30,
+     batch_size=64,
+     max_new_tokens=225,
+     device=device,
+     num_beams=1,  # greedy decoding; increase num_beams to trade speed for quality
+ )
+
+ generate_kwargs = {
+     "condition_on_prev_tokens": False,
+     "compression_ratio_threshold": 1.35,  # zlib compression ratio threshold (in token space)
+     "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # temperature fallback schedule
+     "logprob_threshold": -1.0,
+     "language": "khmer",  # Specify the language for transcription
+ }
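+ # Note: "language": "khmer" fixes the decoder's language token, so Whisper's
+ # automatic language detection is skipped; this is the fixed-LID variant of the
+ # zero-shot run.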
+ # 3. Batch-wise transcription function
+ def transcribe_batch(batch):
+     # `batch["audio"]` is a list of {"array": np.ndarray, ...}
+     inputs = [ex["array"] for ex in batch["audio"]]
+     outputs = asr(inputs, generate_kwargs=generate_kwargs)  # returns a list of dicts with "text"
+     # lower-case and strip to normalize for CER
+     preds = [out["text"].lower().strip() for out in outputs]
+     return {"prediction": preds}
+
+ # 4. Map over the dataset in batches of 64 examples at a time
+ result = ds.map(
+     transcribe_batch,
+     batched=True,
+     batch_size=64,  # feed 64 audios per call; the pipeline batches them internally
+     remove_columns=ds.column_names
+ )
+
71
+ # ipdb.set_trace()
72
+ # 5. Compute corpus-level CER with jiwer
73
+ # refs = "\n".join(t.lower().strip() for t in ds["transcription"])
74
+ # preds = "\n".join(t for t in result["prediction"])
75
+ # score = jiwer_cer(refs, preds)
76
+ ids = [key for key in ds["id"]]
77
+ refs = [t.lower().strip() for t in ds["transcription"]]
78
+ preds = [t for t in result["prediction"]]
79
+ score_cer = jiwer_cer(refs, preds)
80
+ score_wer = jiwer_wer(refs, preds)
81
+
82
+ print(f"CER on FLEURS km_kh: {score_cer*100:.2f}%")
83
+ print(f"WER on FLEURS km_kh: {score_wer*100:.2f}%")
84
+
85
+ # Function to add spaces between characters for CER calculation
86
+ def add_char_spaces(text):
87
+ """Add spaces between each character for character-level evaluation"""
88
+ return ' '.join(list(text.strip()))
89
+
90
+ with open("./km_kh_zs_lid.pred", "w") as pred_results:
91
+ for key, pred in zip(ids, preds):
92
+ pred_with_spaces = add_char_spaces(pred)
93
+ pred_results.write("{} {}\n".format(key, pred_with_spaces))
94
+
95
+ with open("./km_kh.ref", "w") as ref_results:
96
+ for key, ref in zip(ids, refs):
97
+ ref_with_spaces = add_char_spaces(ref)
98
+ ref_results.write("{} {}\n".format(key, ref_with_spaces))
99
+
100
+ # Generate WER file using compute-wer.py
101
+ print("Generating detailed WER analysis...")
102
+
103
+ # Check if compute-wer.py exists
104
+ compute_wer_script = "./compute-wer.py"
105
+ if not os.path.exists(compute_wer_script):
106
+ # Try to find it in parent directories or common locations
107
+ possible_locations = [
108
+ "./compute-wer.py",
109
+ ]
110
+ for location in possible_locations:
111
+ if os.path.exists(location):
112
+ compute_wer_script = location
113
+ break
114
+ else:
115
+ print(f"Warning: compute-wer.py not found. Tried: {[compute_wer_script] + possible_locations}")
116
+ print("Skipping detailed WER analysis.")
117
+ compute_wer_script = None
118
+
119
+ if compute_wer_script:
120
+ try:
121
+ # Run compute-wer.py with character-level analysis
122
+ ref_file = "./km_kh.ref"
123
+ hyp_file = "./km_kh_zs_lid.pred"
124
+ wer_file = "./km_kh_zs_lid.wer"
125
+
126
+ cmd = [
127
+ "python", compute_wer_script,
128
+ "--char=1", # Character-level analysis
129
+ "--v=1", # Verbose output
130
+ ref_file,
131
+ hyp_file
132
+ ]
133
+
134
+ print(f"Running: {' '.join(cmd)} > {wer_file}")
135
+
136
+ # Run the command and redirect output to wer file
137
+ with open(wer_file, "w") as wer_output:
138
+ result = subprocess.run(
139
+ cmd,
140
+ stdout=wer_output,
141
+ stderr=subprocess.PIPE,
142
+ text=True,
143
+ check=True
144
+ )
145
+
146
+ print(f"CER analysis saved to {wer_file}")
147
+
148
+ # Optionally, print the first few lines of the WER file
149
+ if os.path.exists(wer_file):
150
+ print("\nFirst few lines of WER analysis:")
151
+ with open(wer_file, "r") as f:
152
+ lines = f.readlines()
153
+ for i, line in enumerate(lines[:10]): # Show first 10 lines
154
+ print(f" {line.rstrip()}")
155
+ if len(lines) > 10:
156
+ print(f" ... ({len(lines) - 10} more lines)")
157
+
158
+ except subprocess.CalledProcessError as e:
159
+ print(f"Error running compute-wer.py: {e}")
160
+ if e.stderr:
161
+ print(f"Error details: {e.stderr}")
162
+ except Exception as e:
163
+ print(f"Unexpected error: {e}")
164
+
165
+ print("Inference and CER analysis completed!")
inference/inference-zs.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+
+ python ./inference-zeroshot.py        # zero-shot, language fixed to Khmer
+ python ./inference-zeroshot-nolid.py  # zero-shot, no language specified (auto-detect)
+ python ./inference.py
inference/km_kh.ref ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_finetune.pred ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_finetune.wer ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_finetune_nolid.pred ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_finetune_nolid.wer ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_zs_lid.pred ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_zs_lid.wer ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_zs_nolid.pred ADDED
The diff for this file is too large to render. See raw diff
 
inference/km_kh_zs_nolid.wer ADDED
The diff for this file is too large to render. See raw diff