# sentinel-finops / data_gen.py
# Source-page metadata: akshay4's picture | commit "first iter" | d6c0046 (verified)
import pandas as pd
import random
import json
def generate_mock_data(filename="billing_export.csv", count=10000, seed=None):
    """
    Generate realistic, messy cloud billing data for a FinOps demo.

    Every record describes one cloud resource (ID, service, monthly cost,
    CPU statistics, a 12-value monthly CPU pattern, owner, and a JSON blob
    of tags).  Tags deliberately contain inconsistent spellings, duplicate
    keys with different casing, ambiguous environments, and missing values.
    Several "waste scenarios" are then layered on top by record index; when
    more than one applies to the same record, the later one wins.

    Args:
        filename: Path of the CSV file to write.
        count: Number of records to generate.  Zero (or a negative number)
            produces a CSV that contains only the header row.
        seed: Optional seed.  When given, a private ``random.Random(seed)``
            is used so the output is reproducible.  When ``None`` (default)
            the module-level ``random`` state is used, exactly as before.

    Returns:
        The ``filename`` that was written.
    """
    # Fall back to the shared module-level generator so callers that seed it
    # globally with random.seed() keep getting the same behavior.
    rng = random if seed is None else random.Random(seed)

    columns = [
        "ResourceID",
        "Service",
        "Cost_Monthly",
        "CPU_avg",
        "CPU_p95",
        "monthly_cpu_pattern",
        "LastActiveDays",
        "OwnerEmail",
        "spot_eligible",
        "ri_candidate",
        "Tags",
    ]

    services = ["EC2", "EBS", "RDS", "Lambda", "S3", "DynamoDB"]
    envs_clean = ["prod", "dev", "staging", "test"]

    # Realistic spelling/casing variations of each canonical environment.
    env_variations = {
        "prod": ["prod", "production", "PROD", "Production", "prd"],
        "dev": ["dev", "development", "DEV", "Development", "develop"],
        "staging": ["staging", "stage", "stg", "STAGING"],
        "test": ["test", "testing", "TEST", "qa", "QA"],
    }

    # Environments that cannot be mapped to a single canonical value.
    ambiguous_envs = ["staging-prod", "prod-dr", "dev-staging", "test-prod"]

    # Canonical team names and the variations used for inconsistent tags.
    teams_clean = [
        "Finance", "Data", "QA", "DevOps", "Security",
        "Frontend", "Backend", "ML", "Platform",
    ]
    team_variations = {
        "Finance": ["finance", "Finance Team", "FIN", "finance-team"],
        "Data": ["data", "Data Science", "data-team", "DATA"],
        "QA": ["qa", "testing", "QA Team", "test", "quality"],
        "DevOps": ["devops", "ops", "SRE", "infra"],
        "Security": ["security", "sec", "infosec", "Security Team"],
        "Frontend": ["frontend", "web", "ui", "frontend-team"],
        "Backend": ["backend", "api", "backend-team", "services"],
        "ML": ["ml", "mlops", "ai", "model-training"],
        "Platform": ["platform", "platform-eng", "k8s-team"],
    }

    roles = [
        "web_server", "db_primary", "cache", "worker",
        "dr_backup", "analytics", "ml_training",
    ]
    projects = ["phoenix", "atlas", "titan", "nexus", "quantum"]

    data = []
    for i in range(count):
        service = rng.choice(services)

        # Resource ID prefix follows the service: volumes, databases, and
        # everything else (instance-style IDs).
        if service == "EBS":
            resource_id = f"vol-{rng.randint(100000, 999999)}"
        elif service == "RDS":
            resource_id = f"db-{rng.randint(100000, 999999)}"
        else:
            resource_id = f"i-{rng.randint(100000, 999999)}"

        cost = round(rng.uniform(10.0, 500.0), 2)
        cpu_avg = round(rng.uniform(0.0, 100.0), 1)
        # p95 sits at or above the average, capped at 100%.
        cpu_p95 = min(100.0, cpu_avg + rng.uniform(0, 30))

        # Twelve monthly values scattered around the *initial* average.
        # NOTE: the waste scenarios below may later overwrite cpu_avg, in
        # which case this pattern intentionally still reflects the old value.
        monthly_pattern = [cpu_avg + rng.uniform(-5, 10) for _ in range(12)]
        # Every 20th record gets one month spiked to 3x the average.
        if i % 20 == 0:
            monthly_pattern[rng.randint(0, 11)] = cpu_avg * 3

        last_active = rng.randint(0, 90)
        owner = f"user{rng.randint(1, 5)}@company.com"

        base_env = rng.choice(envs_clean)

        # Tag quality mix: ~70% clean, ~20% inconsistent, ~10% ambiguous
        # or missing.
        tags = {}
        rand = rng.random()
        if rand < 0.7:
            tags["environment"] = base_env
            tags["team"] = rng.choice(teams_clean)
        elif rand < 0.9:
            env_variant = rng.choice(env_variations[base_env])
            team_variant = rng.choice(team_variations[rng.choice(teams_clean)])
            tags["environment"] = env_variant
            tags["team"] = team_variant
            # Sometimes the same tag appears twice with different key casing.
            if rng.random() < 0.3:
                tags["Environment"] = env_variant
                tags["Team"] = team_variant
        else:
            # Half of these get an ambiguous environment; the other half is
            # left without environment/team tags altogether.
            if rng.random() < 0.5:
                tags["environment"] = rng.choice(ambiguous_envs)
                tags["team"] = rng.choice(teams_clean)

        tags["role"] = rng.choice(roles)
        tags["project"] = rng.choice(projects)
        tags["cost-center"] = f"CC-{rng.randint(1000, 9999)}"

        # ~10% carry a deprecated owner tag.
        if rng.random() < 0.1:
            tags["deprecated_owner"] = f"old_user{rng.randint(1, 3)}@legacy.com"

        # --- Waste scenarios (applied in order; later ones override) ---

        # Every 5th record (20%): idle dev resource.
        if i % 5 == 0:
            tags["environment"] = "dev"
            cpu_avg = rng.uniform(0, 3)
            cpu_p95 = rng.uniform(0, 5)
            last_active = rng.randint(30, 90)

        # Every 7th record (~14%): unattached volume.
        if i % 7 == 0:
            service = "EBS"
            resource_id = f"vol-{rng.randint(100000, 999999)}"
            cpu_avg = 0.0
            cpu_p95 = 0.0
            last_active = rng.randint(30, 90)
            tags["state"] = "available"

        # Every 6th record (~17%): prod DR/backup with intentionally low CPU.
        if i % 6 == 0:
            tags["environment"] = "prod"
            tags["role"] = "dr_backup"
            cpu_avg = 0.5
            cpu_p95 = 2.0
            last_active = rng.randint(50, 80)

        # ~5%: spot-instance eligible.
        spot_eligible = rng.random() < 0.05

        # ~3% of records with a high (final) CPU average: reserved-instance
        # candidates.
        ri_candidate = rng.random() < 0.03 and cpu_avg > 50

        # Every 25th record (4%): the "orphan" - an expensive EC2 resource
        # whose only tag is a Name that hints at the owning team.
        if i % 25 == 0:
            service = "EC2"
            # Regenerate the ID so its prefix matches the forced service;
            # otherwise an earlier EBS/RDS choice (or the unattached-volume
            # scenario, which always hits record 0) leaves a "vol-"/"db-"
            # ID on an EC2 row.
            resource_id = f"i-{rng.randint(100000, 999999)}"
            cost = 850.00
            tags = {}
            clue_type = rng.choice(["ml", "test", "web"])
            if clue_type == "ml":
                tags["Name"] = f"ml-training-cluster-{rng.randint(1,9)}"
            elif clue_type == "test":
                tags["Name"] = f"qa-test-runner-{rng.randint(100,999)}"
            else:
                tags["Name"] = f"web-frontend-v{rng.randint(1,5)}"

        data.append({
            "ResourceID": resource_id,
            "Service": service,
            "Cost_Monthly": round(cost, 2),
            "CPU_avg": round(cpu_avg, 2),
            "CPU_p95": round(cpu_p95, 2),
            "monthly_cpu_pattern": json.dumps(monthly_pattern),
            "LastActiveDays": last_active,
            "OwnerEmail": owner,
            "spot_eligible": spot_eligible,
            "ri_candidate": ri_candidate,
            "Tags": json.dumps(tags),
        })

    # Explicit columns keep the header row present even when no records were
    # generated, so the resulting CSV can always be read back.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False)

    print(f"Generated {filename} with {len(df)} records.")
    print(" - ~70% clean, ~20% tag inconsistencies, ~10% ambiguous/missing")
    print(" - CPU patterns, anomalies, and RI opportunities included")
    return filename
# Script entry point: when executed directly (not imported), write the
# default mock billing export using the function's default arguments.
if __name__ == "__main__":
    generate_mock_data()