Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import random | |
| import json | |
def _make_resource_id(rng, service):
    """Return a random resource ID whose prefix matches *service*."""
    prefix = {"EBS": "vol", "RDS": "db"}.get(service, "i")
    return f"{prefix}-{rng.randint(100000, 999999)}"


def _clamp_percent(value):
    """Clamp *value* to the 0-100 range and round it to two decimals."""
    return round(min(100.0, max(0.0, value)), 2)


def generate_mock_data(filename="billing_export.csv", count=10000, seed=None):
    """Generate messy mock cloud-billing data and write it to a CSV file.

    Each row describes one resource: an ID, service, monthly cost, CPU
    statistics, a 12-sample CPU pattern (JSON list), days since last activity,
    an owner e-mail, two boolean hints and a JSON-encoded tag dictionary.
    Tags are deliberately inconsistent (naming variants, duplicate keys with
    different casing, ambiguous or missing values), and several "waste"
    scenarios are injected at fixed row intervals.

    Args:
        filename: Path of the CSV file to write.
        count: Number of rows to generate. Zero or a negative value yields a
            file containing only the header row.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the output is reproducible and the global generator is
            left untouched. When ``None`` the module-level ``random``
            functions are used, exactly as before.

    Returns:
        The ``filename`` that was written.
    """
    # Fall back to the shared module-level generator so callers that seed it
    # themselves (random.seed(...)) keep getting the behavior they had.
    rng = random if seed is None else random.Random(seed)

    columns = [
        "ResourceID",
        "Service",
        "Cost_Monthly",
        "CPU_avg",
        "CPU_p95",
        "monthly_cpu_pattern",
        "LastActiveDays",
        "OwnerEmail",
        "spot_eligible",
        "ri_candidate",
        "Tags",
    ]

    services = ["EC2", "EBS", "RDS", "Lambda", "S3", "DynamoDB"]
    envs_clean = ["prod", "dev", "staging", "test"]

    # Inconsistent spellings of each canonical environment name.
    env_variations = {
        "prod": ["prod", "production", "PROD", "Production", "prd"],
        "dev": ["dev", "development", "DEV", "Development", "develop"],
        "staging": ["staging", "stage", "stg", "STAGING"],
        "test": ["test", "testing", "TEST", "qa", "QA"],
    }

    # Environment values that do not map cleanly onto one canonical name.
    ambiguous_envs = ["staging-prod", "prod-dr", "dev-staging", "test-prod"]

    teams_clean = [
        "Finance", "Data", "QA", "DevOps", "Security",
        "Frontend", "Backend", "ML", "Platform",
    ]
    # Inconsistent spellings of each canonical team name.
    team_variations = {
        "Finance": ["finance", "Finance Team", "FIN", "finance-team"],
        "Data": ["data", "Data Science", "data-team", "DATA"],
        "QA": ["qa", "testing", "QA Team", "test", "quality"],
        "DevOps": ["devops", "ops", "SRE", "infra"],
        "Security": ["security", "sec", "infosec", "Security Team"],
        "Frontend": ["frontend", "web", "ui", "frontend-team"],
        "Backend": ["backend", "api", "backend-team", "services"],
        "ML": ["ml", "mlops", "ai", "model-training"],
        "Platform": ["platform", "platform-eng", "k8s-team"],
    }

    roles = [
        "web_server", "db_primary", "cache", "worker",
        "dr_backup", "analytics", "ml_training",
    ]
    projects = ["phoenix", "atlas", "titan", "nexus", "quantum"]

    data = []
    for i in range(count):
        service = rng.choice(services)
        resource_id = _make_resource_id(rng, service)

        cost = round(rng.uniform(10.0, 500.0), 2)
        cpu_avg = round(rng.uniform(0.0, 100.0), 1)
        # p95 sits at or above the average and is capped at 100 percent.
        cpu_p95 = min(100.0, cpu_avg + rng.uniform(0, 30))

        # Twelve samples scattered around the average. They are clamped to
        # 0-100 (like cpu_p95) so no sample is a negative or >100 percentage.
        monthly_pattern = [
            _clamp_percent(cpu_avg + rng.uniform(-5, 10)) for _ in range(12)
        ]
        # Every 20th row gets one injected spike. It is intentionally left
        # unclamped so it stands out from the regular samples.
        if i % 20 == 0:
            monthly_pattern[rng.randint(0, 11)] = round(cpu_avg * 3, 2)

        last_active = rng.randint(0, 90)
        owner = f"user{rng.randint(1, 5)}@company.com"

        base_env = rng.choice(envs_clean)

        # ~70% clean tags, ~20% inconsistent naming, ~10% ambiguous or missing.
        tags = {}
        rand = rng.random()
        if rand < 0.7:
            tags["environment"] = base_env
            tags["team"] = rng.choice(teams_clean)
        elif rand < 0.9:
            env_variant = rng.choice(env_variations[base_env])
            team_variant = rng.choice(team_variations[rng.choice(teams_clean)])
            tags["environment"] = env_variant
            tags["team"] = team_variant
            # Sometimes repeat the same values under differently-cased keys.
            if rng.random() < 0.3:
                tags["Environment"] = env_variant
                tags["Team"] = team_variant
        elif rng.random() < 0.5:
            # Ambiguous environment; the other half of this bucket gets no
            # environment/team tags at all.
            tags["environment"] = rng.choice(ambiguous_envs)
            tags["team"] = rng.choice(teams_clean)

        tags["role"] = rng.choice(roles)
        tags["project"] = rng.choice(projects)
        tags["cost-center"] = f"CC-{rng.randint(1000, 9999)}"

        # ~10% of rows carry a deprecated owner tag.
        if rng.random() < 0.1:
            tags["deprecated_owner"] = f"old_user{rng.randint(1, 3)}@legacy.com"

        # Waste scenarios. They are applied in order, so when a row matches
        # more than one, the later scenario overrides the shared fields.

        # Every 5th row (20%): idle dev resource.
        if i % 5 == 0:
            tags["environment"] = "dev"
            cpu_avg = rng.uniform(0, 3)
            # Draw p95 from [cpu_avg, 5] so it can never fall below the average.
            cpu_p95 = rng.uniform(cpu_avg, 5)
            last_active = rng.randint(30, 90)

        # Every 7th row (~14%): unattached EBS volume.
        if i % 7 == 0:
            service = "EBS"
            resource_id = _make_resource_id(rng, service)
            cpu_avg = 0.0
            cpu_p95 = 0.0
            last_active = rng.randint(30, 90)
            tags["state"] = "available"

        # Every 6th row (~17%): prod DR/backup with intentionally low CPU.
        if i % 6 == 0:
            tags["environment"] = "prod"
            tags["role"] = "dr_backup"
            cpu_avg = 0.5
            cpu_p95 = 2.0
            last_active = rng.randint(50, 80)

        # ~5% of rows are flagged as spot-eligible.
        spot_eligible = rng.random() < 0.05
        # Up to ~3% of rows, and only those with high average CPU, are flagged
        # as reserved-instance candidates.
        ri_candidate = rng.random() < 0.03 and cpu_avg > 50

        # Every 25th row (4%): expensive "orphan" EC2 instance whose only tag
        # is a Name that hints at the owning team.
        if i % 25 == 0:
            service = "EC2"
            # Regenerate the ID: an earlier branch may have left a vol-/db-
            # identifier that would contradict the EC2 service.
            resource_id = _make_resource_id(rng, service)
            cost = 850.00
            tags = {}
            clue_type = rng.choice(["ml", "test", "web"])
            if clue_type == "ml":
                tags["Name"] = f"ml-training-cluster-{rng.randint(1,9)}"
            elif clue_type == "test":
                tags["Name"] = f"qa-test-runner-{rng.randint(100,999)}"
            else:
                tags["Name"] = f"web-frontend-v{rng.randint(1,5)}"

        data.append({
            "ResourceID": resource_id,
            "Service": service,
            "Cost_Monthly": round(cost, 2),
            "CPU_avg": round(cpu_avg, 2),
            "CPU_p95": round(cpu_p95, 2),
            "monthly_cpu_pattern": json.dumps(monthly_pattern),
            "LastActiveDays": last_active,
            "OwnerEmail": owner,
            "spot_eligible": spot_eligible,
            "ri_candidate": ri_candidate,
            "Tags": json.dumps(tags),
        })

    # Explicit columns keep the header row present even when no rows exist.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False)
    print(f"Generated {filename} with {len(df)} records.")
    print(" - ~70% clean, ~20% tag inconsistencies, ~10% ambiguous/missing")
    print(" - CPU patterns, anomalies, and RI opportunities included")
    return filename
if __name__ == "__main__":
    # Run as a script: write the default-sized export to the default filename.
    generate_mock_data()