In [1]:
# Install required packages for Whisper-Turbo fine-tuning
!pip install pandas transformers[sentencepiece] torch datasets audiofile librosa soundfile accelerate nlpo3 jiwer evaluate


Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting audiofile
  Downloading audiofile-1.5.1-py3-none-any.whl.metadata (4.9 kB)
Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)
Collecting accelerate
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
Collecting nlpo3
  Downloading nlpo3-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.1 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting transformers[senten

In [2]:
!apt-get update && apt-get install -y ffmpeg

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:3 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1381 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]      
Get:7 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1239 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]     
Get:9 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [47.7 kB]
Get:10 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3892 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2737 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy/

In [3]:
# !wget https://raw.githubusercontent.com/PyThaiNLP/nlpo3/refs/heads/main/words_th.txt

In [4]:
import torch
MODEL_NAME = "biodatlab/whisper-th-large-v3-combined"
lang = "th"
device = 0 if torch.cuda.is_available() else "cpu"

In [5]:
import pandas as pd
from datasets import Dataset, Audio
from dataclasses import dataclass

train_df = pd.read_csv('./readytotune-train.csv')
train_df = train_df.head(1925)

final_train_df = pd.concat([ train_df ])
final_train_df = final_train_df[['audioUrl','answer']]

test_df = pd.read_csv('./readytotune-test.csv')
test_df = test_df.head(100)

final_test_df = pd.concat([test_df])
final_test_df = final_test_df[['audioUrl','answer']]

train_dataset = Dataset.from_pandas(final_train_df).cast_column("audioUrl", Audio(sampling_rate=16000))
eval_dataset = Dataset.from_pandas(final_test_df).cast_column("audioUrl", Audio(sampling_rate=16000))

In [8]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    # language="th",
    task="transcribe"
)
tokenizer = processor.tokenizer

def prepare_example(batch):
    # Assuming each element in batch["audioUrl"] is a dict with the key "array"
    audio_arrays = [audio["array"] for audio in batch["audioUrl"]]
    texts = batch["answer"]
    
    processed = processor(
        audio_arrays,
        sampling_rate=16000,
        text=texts,
    )
    # The processor now returns batched outputs, so assign them directly:
    batch["input_features"] = processed["input_features"]
    batch["labels"] = processed["labels"]
    return batch

# Use batched mapping and parallel processing:
train_dataset = train_dataset.map(prepare_example, batched=True, batch_size=16, num_proc=16)
eval_dataset  = eval_dataset.map(prepare_example, batched=True, batch_size=16, num_proc=16)

Map (num_proc=16):   0%|          | 0/1923 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForSpeechSeq2Seq

@dataclass
class DataCollatorSpeechSeq2Seq:
    processor: AutoProcessor

    def __call__(self, features):
        # Convert input_features to tensors here
        input_features_list = []
        for f in features:
            # If f["input_features"] is a list or NumPy array, wrap with torch.tensor
            input_features_list.append(torch.tensor(f["input_features"]))
        input_features = torch.stack(input_features_list)

        labels = [f["labels"] for f in features]
        labels_batch = self.processor.tokenizer.pad(
            {"input_ids": labels},
            return_tensors="pt",
            padding=True
        )

        return {
            "input_features": input_features,  # (batch_size, feature_dim, time)
            "labels": labels_batch["input_ids"]
        }


data_collator = DataCollatorSpeechSeq2Seq(processor)
model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)

In [11]:
from transformers import TrainerCallback, AutoProcessor, pipeline
from nlpo3 import load_dict, segment
from jiwer import wer
import evaluate

metric = evaluate.load("wer")

# Load dictionary for Thai segmentation
load_dict("./words_th.txt", "dict_default")

def tokenize(text: str) -> str:
    tokens = segment(text, 'dict_default')
    return " ".join(tokens)


def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    pred_str_tokenize = [tokenize(s) for s in pred_str]
    label_str_tokenize = [tokenize(s) for s in label_str]

    wer = 100 * metric.compute(predictions=pred_str_tokenize, references=label_str_tokenize)
    # if wer > 40:
    #     print('pred_str_tokenize', pred_str_tokenize)
    #     print('label_str_tokenize', label_str_tokenize)

    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [13]:
from transformers import Seq2SeqTrainingArguments

model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="th",
    task="transcribe"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-th-large-finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,

    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    load_best_model_at_end=True,

    predict_with_generate=True,
    metric_for_best_model="wer",
    greater_is_better=False,

    num_train_epochs=3,
    learning_rate=5e-6, # 0.00001
    weight_decay=0.01,
    report_to="none",  
)

In [14]:
from transformers import Seq2SeqTrainer

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,  # With learning_rate=1e-5, weight_decay=0.01
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,  # Updated from Step 6
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor
)

# Train
trainer.train()

# Evaluate
eval_metrics = trainer.evaluate()
print("Eval metrics:", eval_metrics)
print("Best checkpoint:", trainer.state.best_model_checkpoint)

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
100,0.4226,0.103267,30.811808


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Eval metrics: {'eval_loss': 0.10326727479696274, 'eval_wer': 30.81180811808118, 'eval_runtime': 60.3495, 'eval_samples_per_second': 1.657, 'eval_steps_per_second': 0.215, 'epoch': 2.958677685950413}
Best checkpoint: ./whisper-th-large-finetuned/checkpoint-100


In [None]:
# Average WER (Google) on evaluation dataset: 0.577

In [15]:
trainer.state.best_model_checkpoint

'./whisper-th-large-finetuned/checkpoint-100'

In [16]:
trainer.state.best_metric

30.81180811808118

In [17]:
#Save the weight to folder ./amity-stt-th-v-0-1
trainer.save_model("./amity-stt-th-v-0-1-test")
processor.save_pretrained("./amity-stt-th-v-0-1-test")

[]

In [None]:
#Upload model to huggingface hub by cli

# cd amity-stt-th-v-0-1
# huggingface-cli login
# huggingface-cli upload-large-folder amitysolution/amity-stt-th-v-0-1 --repo-type=model ./