# Configuration for training only on Khmer (FLEURS km_kh) data
# Fine-tuning Whisper on Khmer language using Google FLEURS dataset

# Model Configuration
model:
  checkpoint: "openai/whisper-large-v3"
  max_target_length: 446
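#
# A minimal sketch of how this section is typically consumed by a Hugging Face
# Transformers fine-tuning script (assumes a `cfg` dict parsed from this file;
# the variable names are illustrative, not defined in this repo):
#
#   from transformers import WhisperForConditionalGeneration, WhisperProcessor
#
#   model = WhisperForConditionalGeneration.from_pretrained(cfg["model"]["checkpoint"])
#   processor = WhisperProcessor.from_pretrained(
#       cfg["model"]["checkpoint"], language="khmer", task="transcribe")
#
#   # Examples whose tokenized labels exceed max_target_length are usually
#   # dropped so they fit Whisper's 448-token decoder context:
#   def fits_decoder(labels):
#       return len(labels) <= cfg["model"]["max_target_length"]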

# Output Configuration
output:
  output_dir: "./ft-lid-whisper-fleurs-km_kh-small"

# Environment Configuration
environment:
  max_cpu_cores: 20
  test_cpu_cores: 20
  omp_num_threads: "20"
  mkl_num_threads: "20"
  openblas_num_threads: "20"
  veclib_maximum_threads: "20"
  numexpr_num_threads: "20"
  tokenizers_parallelism: "false"
  transformers_no_tf: "1"
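#
# These thread caps only take effect if exported before NumPy/Transformers
# initialize their thread pools. A possible application (a sketch; assumes a
# `cfg` dict parsed from this file, with env-var names mirroring the keys above):
#
#   import os
#
#   env = cfg["environment"]
#   os.environ["OMP_NUM_THREADS"] = env["omp_num_threads"]
#   os.environ["MKL_NUM_THREADS"] = env["mkl_num_threads"]
#   os.environ["OPENBLAS_NUM_THREADS"] = env["openblas_num_threads"]
#   os.environ["VECLIB_MAXIMUM_THREADS"] = env["veclib_maximum_threads"]
#   os.environ["NUMEXPR_NUM_THREADS"] = env["numexpr_num_threads"]
#   os.environ["TOKENIZERS_PARALLELISM"] = env["tokenizers_parallelism"]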

# Audio Processing Configuration
audio:
  sampling_rate: 16000
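#
# FLEURS audio is already distributed at 16 kHz; casting the column makes that
# assumption explicit and resamples anything that is not (a sketch; `ds` stands
# for a loaded `datasets` Dataset):
#
#   from datasets import Audio
#
#   ds = ds.cast_column("audio", Audio(sampling_rate=cfg["audio"]["sampling_rate"]))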

# Language Configurations - Khmer only
languages:
  khmer:
    whisper_language: "khmer"
    fleurs_language: "km_kh"
    text_key: "transcription"
    train_subset_ratio: 0.25  # Use only 25% of training data for faster training/experimentation
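#
# One way the subset ratio might be applied (a sketch; `train_ds` stands for
# the loaded FLEURS train split, not a name defined in this repo):
#
#   lang = cfg["languages"]["khmer"]
#   n = int(len(train_ds) * lang["train_subset_ratio"])
#   train_ds = train_ds.shuffle(seed=cfg["data_processing"]["seed"]).select(range(n))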

# Dataset Configurations - Khmer FLEURS
datasets:
  khmer:
    source: "google/fleurs"
    language_code: "km_kh"
    splits:
      train: "train"
      validation: "validation"
      test: "test"
    trust_remote_code: true
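#
# The matching `datasets` call, using only documented load_dataset arguments
# (a sketch; `cfg` is this file parsed into a dict):
#
#   from datasets import load_dataset
#
#   spec = cfg["datasets"]["khmer"]
#   fleurs = {
#       alias: load_dataset(spec["source"], spec["language_code"],
#                           split=split, trust_remote_code=spec["trust_remote_code"])
#       for alias, split in spec["splits"].items()
#   }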

# Training Configuration
training:
  # Basic training parameters
  learning_rate: 1.0e-5
  warmup_steps: 100
  max_steps: 800
  
  # Batch size and accumulation
  single_gpu:
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    gradient_accumulation_steps: 1

  multi_gpu:
    per_device_train_batch_size: 4
    per_device_eval_batch_size: 4
    gradient_accumulation_steps: 1
  
  # Optimization settings
  gradient_checkpointing: true
  fp16: true
  
  # Evaluation settings
  eval_strategy: "steps"
  eval_steps: 100
  predict_with_generate: true
  generation_max_length: 225
  
  # Saving and logging
  save_steps: 100
  logging_steps: 10
  save_total_limit: 3
  
  # Model selection
  load_best_model_at_end: true
  metric_for_best_model: "cer"  # CER for Khmer: the script does not delimit words with spaces, so WER is unreliable
  greater_is_better: false
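  #
  # Selecting on "cer" implies a compute_metrics hook roughly like the one in
  # the standard Whisper fine-tuning recipe (a sketch; uses the `evaluate`
  # library and the `processor` from the model section above):
  #
  #   import evaluate
  #
  #   cer_metric = evaluate.load("cer")
  #
  #   def compute_metrics(pred):
  #       label_ids = pred.label_ids
  #       label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
  #       pred_str = processor.batch_decode(pred.predictions, skip_special_tokens=True)
  #       label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
  #       return {"cer": 100 * cer_metric.compute(predictions=pred_str, references=label_str)}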
  
  # Reporting
  report_to:
    - "tensorboard"
  
  # Hub settings
  push_to_hub: true
  hub_private_repo: false  # Publish the Khmer checkpoint to a public Hub repo
  
  # Multi-GPU specific settings
  dataloader_drop_last: true
  ddp_find_unused_parameters: false
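#
# These keys map almost one-to-one onto Seq2SeqTrainingArguments (a sketch;
# `eval_strategy` is the parameter name in recent Transformers releases, while
# older ones spell it `evaluation_strategy`):
#
#   from transformers import Seq2SeqTrainingArguments
#
#   t = cfg["training"]
#   args = Seq2SeqTrainingArguments(
#       output_dir=cfg["output"]["output_dir"],
#       learning_rate=t["learning_rate"],
#       warmup_steps=t["warmup_steps"],
#       max_steps=t["max_steps"],
#       per_device_train_batch_size=t["single_gpu"]["per_device_train_batch_size"],
#       per_device_eval_batch_size=t["single_gpu"]["per_device_eval_batch_size"],
#       gradient_accumulation_steps=t["single_gpu"]["gradient_accumulation_steps"],
#       gradient_checkpointing=t["gradient_checkpointing"],
#       fp16=t["fp16"],
#       eval_strategy=t["eval_strategy"],
#       eval_steps=t["eval_steps"],
#       predict_with_generate=t["predict_with_generate"],
#       generation_max_length=t["generation_max_length"],
#       save_steps=t["save_steps"],
#       logging_steps=t["logging_steps"],
#       save_total_limit=t["save_total_limit"],
#       load_best_model_at_end=t["load_best_model_at_end"],
#       metric_for_best_model=t["metric_for_best_model"],
#       greater_is_better=t["greater_is_better"],
#       report_to=t["report_to"],
#       push_to_hub=t["push_to_hub"],
#       hub_private_repo=t["hub_private_repo"],
#       dataloader_drop_last=t["dataloader_drop_last"],
#       ddp_find_unused_parameters=t["ddp_find_unused_parameters"],
#   )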

# Data Processing Configuration
data_processing:
  # Random seed for reproducibility
  seed: 42
  
  # Columns to remove during standardization
  columns_to_remove:
    - "id"
    - "num_samples"
    - "path"
    - "speaker_id"
    - "chapter_id"
    - "segment_id"