import torch
import os

from model import GPT, GPTConfig


def compress_model(compression_type='cpu_compatible'):
    """
    Load the original model and compress it using the specified method.

    Args:
        compression_type (str):
            'cpu_compatible' - Uses FP32 but removes training artifacts
            'fp16'           - Half precision, better for GPU but may not work on all CPUs
            'quantized'      - INT8 quantization, most compressed but slightly lower quality
    """
    input_path = 'best_model.pt'
    output_path = f'compressed_model_{compression_type}.pt'

    print(f"Loading model from {input_path}...")

    # Load original checkpoint
    checkpoint = torch.load(input_path, map_location='cpu')

    # Get original size
    original_size = os.path.getsize(input_path) / (1024 * 1024)  # MB
    print(f"Original model size: {original_size:.2f} MB")

    # Initialize model
    config = GPTConfig(
        block_size=1024,
        vocab_size=50304,
        n_layer=12,
        n_head=12,
        n_embd=768
    )
    model = GPT(config)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Apply compression based on type
    if compression_type == 'fp16':
        model = model.half()  # Convert weights to FP16
        dtype = 'float16'
    elif compression_type == 'quantized':
        # Dynamically quantize Linear layers to INT8
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
        dtype = 'int8'
    else:  # cpu_compatible
        model = model.float()  # Ensure FP32
        dtype = 'float32'

    # Create minimal checkpoint (weights + config only, no optimizer state)
    compressed_checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'block_size': config.block_size,
            'vocab_size': config.vocab_size,
            'n_layer': config.n_layer,
            'n_head': config.n_head,
            'n_embd': config.n_embd
        },
        'dtype': dtype,
        'compression_type': compression_type
    }

    # Save compressed model
    print(f"Saving {compression_type} compressed model...")
    torch.save(compressed_checkpoint, output_path)

    # Get compressed size
    compressed_size = os.path.getsize(output_path) / (1024 * 1024)  # MB
    print(f"Compressed model size: {compressed_size:.2f} MB")
    print(f"Compression ratio: {original_size / compressed_size:.2f}x")

    # Verify that the compressed checkpoint loads
    print("\nVerifying compressed model...")
    try:
        test_load = torch.load(output_path, map_location='cpu')
        print("✓ Compressed model loads successfully!")
        print(f"Model type: {test_load['compression_type']}")
        print(f"Data type: {test_load['dtype']}")
    except Exception as e:
        print(f"Error loading compressed model: {str(e)}")


if __name__ == "__main__":
    # Create CPU-compatible version
    compress_model('cpu_compatible')

    # Optionally create FP16 version for GPU
    # compress_model('fp16')

    # Optionally create quantized version
    # compress_model('quantized')
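

# --- Optional loader sketch (not part of the original script) ---
# A minimal example of how a compressed checkpoint produced above might be
# loaded back into a model. It assumes the same GPT / GPTConfig classes from
# model.py and the checkpoint layout written by compress_model(); the function
# name load_compressed_model is introduced here purely for illustration.
# Note: the 'quantized' state_dict holds packed INT8 params, so the model must
# be passed through quantize_dynamic *before* load_state_dict can match keys.

def load_compressed_model(path):
    """Rebuild a model from a checkpoint written by compress_model()."""
    ckpt = torch.load(path, map_location='cpu')

    # The saved config dict uses the same field names as GPTConfig
    config = GPTConfig(**ckpt['config'])
    model = GPT(config)

    if ckpt['compression_type'] == 'quantized':
        # Recreate the dynamically quantized module structure so the packed
        # INT8 weights in the state_dict line up with the module parameters
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )
    elif ckpt['compression_type'] == 'fp16':
        # Convert parameters to FP16 before loading the half-precision weights
        model = model.half()

    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()
    return model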