#!/usr/bin/env python3
"""
Compare two model variants to see if they have different configurations.

Usage:
    export CBORG_API_KEY=...
    python compare_model_configs.py openai/o:latest openai/o3
"""

import os
import sys

from openai import OpenAI


def test_model_detailed(client, model_id):
    """Test a model and return detailed response information."""
    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=[{"role": "user", "content": "What is 2+2?"}],
            max_tokens=10,
            temperature=1.0,  # Explicitly set
            top_p=1.0,        # Explicitly set
        )

        # Extract all available information
        info = {
            'model': response.model,
            'id': response.id,
            'created': response.created,
            'object': response.object,
            'system_fingerprint': getattr(response, 'system_fingerprint', None),
            'usage': {
                'prompt_tokens': response.usage.prompt_tokens,
                'completion_tokens': response.usage.completion_tokens,
                'total_tokens': response.usage.total_tokens,
            },
            'response_content': response.choices[0].message.content,
            'finish_reason': response.choices[0].finish_reason,
        }

        # Try to capture the raw response as additional metadata
        try:
            info['raw_response'] = str(response)
        except Exception:
            pass

        return info, None
    except Exception as e:
        return None, str(e)


def main():
    if len(sys.argv) < 3:
        print("Usage: python compare_model_configs.py <model1> <model2>")
        print("Example: python compare_model_configs.py openai/o:latest openai/o3")
        sys.exit(1)

    model1 = sys.argv[1]
    model2 = sys.argv[2]

    api_key = os.environ.get('CBORG_API_KEY')
    if not api_key:
        print("Error: CBORG_API_KEY environment variable not set.")
        sys.exit(1)

    client = OpenAI(
        api_key=api_key,
        base_url="https://api.cborg.lbl.gov"
    )

    print("=" * 100)
    print(f"COMPARING: {model1} vs {model2}")
    print("=" * 100)
    print()

    # Test model 1
    print(f"Testing {model1}...")
    info1, error1 = test_model_detailed(client, model1)
    if error1:
        print(f"❌ Error: {error1}")
        sys.exit(1)

    # Test model 2
    print(f"Testing {model2}...")
    info2, error2 = test_model_detailed(client, model2)
    if error2:
        print(f"❌ Error: {error2}")
        sys.exit(1)

    print()
    print("=" * 100)
    print("COMPARISON RESULTS")
    print("=" * 100)
    print()

    # Compare underlying models
    print("1. UNDERLYING MODEL:")
    print(f"   {model1:<30} → {info1['model']}")
    print(f"   {model2:<30} → {info2['model']}")
    if info1['model'] == info2['model']:
        print("   ✓ SAME underlying model")
    else:
        print("   ⚠️  DIFFERENT underlying models!")
    print()

    # Compare system fingerprints (if available)
    print("2. SYSTEM FINGERPRINT:")
    print(f"   {model1:<30} → {info1['system_fingerprint']}")
    print(f"   {model2:<30} → {info2['system_fingerprint']}")
    if info1['system_fingerprint'] == info2['system_fingerprint']:
        print("   ✓ SAME system fingerprint")
    elif info1['system_fingerprint'] is None or info2['system_fingerprint'] is None:
        print("   ⚠️  System fingerprint not available")
    else:
        print("   ⚠️  DIFFERENT system fingerprints!")
    print()

    # Compare token usage patterns
    print("3. TOKEN USAGE (for same prompt):")
    print(f"   {model1:<30} prompt={info1['usage']['prompt_tokens']}, completion={info1['usage']['completion_tokens']}")
    print(f"   {model2:<30} prompt={info2['usage']['prompt_tokens']}, completion={info2['usage']['completion_tokens']}")
    if info1['usage'] == info2['usage']:
        print("   ✓ IDENTICAL token usage")
    else:
        print("   ⚠️  Different token usage (could indicate different behavior)")
    print()

    # Compare response content
    print("4. RESPONSE CONTENT:")
    print(f"   {model1}: \"{info1['response_content']}\"")
    print(f"   {model2}: \"{info2['response_content']}\"")
    if info1['response_content'] == info2['response_content']:
        print("   ✓ IDENTICAL responses")
    else:
        print("   ⚠️  Different responses")
    print()

    # Show raw responses if available
    if 'raw_response' in info1:
        print("5. RAW RESPONSE MODEL 1:")
        print(f"   {info1['raw_response'][:500]}")
        print()
        print("6. RAW RESPONSE MODEL 2:")
        print(f"   {info2['raw_response'][:500]}")
        print()

    # Final verdict
    print("=" * 100)
    print("VERDICT:")
    print("=" * 100)

    same_count = 0
    total_count = 4

    if info1['model'] == info2['model']:
        same_count += 1
    if info1['system_fingerprint'] == info2['system_fingerprint'] or \
       (info1['system_fingerprint'] is None and info2['system_fingerprint'] is None):
        same_count += 1
    if info1['usage'] == info2['usage']:
        same_count += 1
    if info1['response_content'] == info2['response_content']:
        same_count += 1

    print(f"Similarity: {same_count}/{total_count} metrics match")
    print()

    if same_count == total_count:
        print("✓ Models appear to be IDENTICAL")
        print("  → Same underlying model, same configuration")
        print("  → Likely just different aliases for the same deployment")
    elif info1['model'] == info2['model'] and same_count >= 2:
        print("⚠️  Models use the SAME base model but show some differences")
        print("  → Could be due to:")
        print("    - Different deployment instances")
        print("    - Randomness in generation")
        print("    - Different routing/load balancing")
    else:
        print("⚠️  Models appear to be DIFFERENT")
        print("  → Different configurations or versions")

    print()
    print("NOTE: In your dataset, these models have different performance because")
    print("      they represent different experimental runs, not necessarily different")
    print("      model configurations.")
    print("=" * 100)


if __name__ == '__main__':
    main()