| | """ |
| | Training script for the Chess Challenge. |
| | |
| | This script provides a complete training pipeline using the Hugging Face Trainer. |
| | Students can modify this script to experiment with different training strategies. |
| | """ |
| |
|
| | from __future__ import annotations |
| |
|
| | import argparse |
| | import os |
| | import warnings |
| | from pathlib import Path |
| |
|
| | |
# Suppress any warning whose message starts with "'return' in a 'finally' block".
# The filter is installed before the torch/transformers imports below so that it
# is already active while those packages are being imported.
# NOTE(review): presumably targets the SyntaxWarning that newer Python versions
# emit for third-party code using `return` inside `finally` - confirm.
warnings.filterwarnings("ignore", message="'return' in a 'finally' block")
| |
|
| | import torch |
| | from transformers import ( |
| | Trainer, |
| | TrainingArguments, |
| | set_seed, |
| | ) |
| |
|
| | from data import ChessDataCollator, create_train_val_datasets |
| | from model import ChessConfig, ChessForCausalLM |
| | from tokenizer import ChessTokenizer |
| |
|
| |
|
def count_parameters(model, trainable_only=True):
    """Return the total element count of a model's parameters.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors-like
            objects with ``numel()`` and ``requires_grad``.
        trainable_only: When true, parameters whose ``requires_grad`` is
            false are left out of the total.

    Returns:
        The summed ``numel()`` of the selected parameters (0 if there are none).
    """
    total = 0
    for param in model.parameters():
        # Frozen parameters only count when the caller asks for everything.
        if trainable_only and not param.requires_grad:
            continue
        total += param.numel()
    return total
| |
|
| |
|
def parse_args():
    """Build the command line interface and parse ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` holding the model, dataset, optimisation and
        logging options, each falling back to the default listed below.
    """
    parser = argparse.ArgumentParser(
        description="Train a chess-playing language model"
    )

    # One entry per option, in the order they should appear in --help.
    option_specs = (
        # Model architecture
        ("--n_embd", {"type": int, "default": 128,
                      "help": "Embedding dimension"}),
        ("--n_layer", {"type": int, "default": 4,
                       "help": "Number of transformer layers"}),
        ("--n_head", {"type": int, "default": 4,
                      "help": "Number of attention heads"}),
        ("--n_ctx", {"type": int, "default": 256,
                     "help": "Maximum context length"}),
        ("--n_inner", {"type": int, "default": None,
                       "help": "Feed-forward inner dimension (default: 4 * n_embd)"}),
        ("--dropout", {"type": float, "default": 0.1,
                       "help": "Dropout probability"}),
        ("--no_tie_weights", {"action": "store_true",
                              "help": "Disable weight tying between embedding and output layers"}),
        # Data
        ("--dataset_name", {"type": str, "default": "dlouapre/lichess_2025-01_1M",
                            "help": "Name of the dataset on Hugging Face Hub"}),
        ("--max_train_samples", {"type": int, "default": None,
                                 "help": "Maximum number of training samples"}),
        ("--val_samples", {"type": int, "default": 5000,
                           "help": "Number of validation samples"}),
        # Optimisation
        ("--output_dir", {"type": str, "default": "./output",
                          "help": "Output directory for model and logs"}),
        ("--num_train_epochs", {"type": int, "default": 3,
                                "help": "Number of training epochs"}),
        ("--per_device_train_batch_size", {"type": int, "default": 32,
                                           "help": "Training batch size per device"}),
        ("--per_device_eval_batch_size", {"type": int, "default": 64,
                                          "help": "Evaluation batch size per device"}),
        ("--learning_rate", {"type": float, "default": 5e-4,
                             "help": "Learning rate"}),
        ("--weight_decay", {"type": float, "default": 0.01,
                            "help": "Weight decay"}),
        ("--warmup_ratio", {"type": float, "default": 0.1,
                            "help": "Warmup ratio"}),
        ("--seed", {"type": int, "default": 42,
                    "help": "Random seed"}),
        # Logging / checkpoint cadence
        ("--logging_steps", {"type": int, "default": 100,
                             "help": "Logging frequency"}),
        ("--eval_steps", {"type": int, "default": 500,
                          "help": "Evaluation frequency"}),
        ("--save_steps", {"type": int, "default": 1000,
                          "help": "Checkpoint saving frequency"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
| |
|
| |
|
def _add_auto_map(json_path, auto_map):
    """Set the ``auto_map`` key of the JSON file at *json_path* and rewrite it in place."""
    import json

    # Read and write explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(json_path, encoding="utf-8") as f:
        payload = json.load(f)
    payload["auto_map"] = auto_map
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)


def main():
    """Run the full training pipeline from the command line.

    Steps, in order: parse CLI options and seed the RNGs, build the tokenizer
    vocabulary from the dataset, create the model and report its parameter
    count against the 1M limit, load the train/validation datasets, train with
    the Hugging Face ``Trainer``, then write a ``final_model`` directory under
    ``--output_dir`` containing the model, the tokenizer, copies of
    ``model.py``/``tokenizer.py`` and ``auto_map`` entries in both JSON configs.
    """
    import shutil

    args = parse_args()

    set_seed(args.seed)

    print("=" * 60)
    print("CHESS CHALLENGE - TRAINING")
    print("=" * 60)

    print("\nBuilding tokenizer from dataset...")
    tokenizer = ChessTokenizer.build_vocab_from_dataset(
        dataset_name=args.dataset_name,
        min_frequency=500,
        max_samples=100000,
    )
    print(f" Vocabulary size: {tokenizer.vocab_size}")

    print("\nCreating model configuration...")
    config = ChessConfig(
        # Size the embedding table from the vocabulary that was just built.
        vocab_size=tokenizer.vocab_size,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        n_ctx=args.n_ctx,
        n_inner=args.n_inner,
        dropout=args.dropout,
        tie_weights=not args.no_tie_weights,
        pad_token_id=tokenizer.pad_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    print("\nModel configuration:")
    print(f" vocab_size: {config.vocab_size}")
    print(f" n_embd: {config.n_embd}")
    print(f" n_layer: {config.n_layer}")
    print(f" n_head: {config.n_head}")
    print(f" tie_weights: {config.tie_weights}")

    print("\nCreating model...")
    model = ChessForCausalLM(config)
    # count_parameters() defaults to trainable parameters only.
    n_params = count_parameters(model)
    print(f" Total parameters: {n_params:,}")

    if n_params > 1_000_000:
        print("WARNING: Model exceeds 1M parameter limit!")
    else:
        print("OK: Model is within 1M parameter limit")

    print("\nLoading datasets...")
    train_dataset, val_dataset = create_train_val_datasets(
        tokenizer=tokenizer,
        dataset_name=args.dataset_name,
        max_length=args.n_ctx,
        train_samples=args.max_train_samples,
        val_samples=args.val_samples,
    )
    print(f" Training samples: {len(train_dataset):,}")
    print(f" Validation samples: {len(val_dataset):,}")

    data_collator = ChessDataCollator(tokenizer, max_length=args.n_ctx)

    # NOTE(review): --eval_steps and --save_steps are parsed by parse_args()
    # but are not read here; evaluation and checkpointing both use the
    # "epoch" strategy. Confirm which behaviour is intended.
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        warmup_ratio=args.warmup_ratio,
        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=args.logging_steps,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=args.seed,
        # Use bf16 only when a CUDA device that supports it is present.
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
        report_to=["none"],
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("\nStarting training...")
    trainer.train()

    print("\nSaving final model...")
    final_model_dir = os.path.join(args.output_dir, "final_model")
    trainer.save_model(final_model_dir)
    tokenizer.save_pretrained(final_model_dir)

    # Ship the custom class definitions next to the weights; the auto_map
    # entries written below refer to them by module name.
    script_dir = Path(__file__).parent
    shutil.copy(script_dir / "model.py", final_model_dir)
    shutil.copy(script_dir / "tokenizer.py", final_model_dir)
    print(" Copied model.py and tokenizer.py")

    _add_auto_map(
        os.path.join(final_model_dir, "config.json"),
        {
            "AutoConfig": "model.ChessConfig",
            "AutoModelForCausalLM": "model.ChessForCausalLM",
        },
    )
    print(" Added auto_map to config.json")

    _add_auto_map(
        os.path.join(final_model_dir, "tokenizer_config.json"),
        {
            "AutoTokenizer": ["tokenizer.ChessTokenizer", None],
        },
    )
    print(" Added auto_map to tokenizer_config.json")

    print("\nTraining complete!")
    print(f" Model saved to: {final_model_dir}")
    print(" Ready for submission with: python submit.py --model_path " + final_model_dir)
| |
|
| |
|
# Start training only when this file is executed as a script; importing it as a
# module does not call main().
if __name__ == "__main__":
    main()
| |
|