Spaces:

MCP-1st-Birthday
/

sentinel-finops

Sleeping

App Files Files Community

sentinel-finops / data_gen.py

akshay4

first iter

d6c0046 verified about 1 month ago

raw

history blame contribute delete

7.42 kB

	import pandas as pd
	import random
	import json

	def generate_mock_data(filename="billing_export.csv", count=10000):
	"""
	Generates realistic, messy cloud billing data for FinOps demo.
	Includes tag inconsistencies, ambiguous data, and patterns that require AI.
	"""
	services = ["EC2", "EBS", "RDS", "Lambda", "S3", "DynamoDB"]
	envs_clean = ["prod", "dev", "staging", "test"]

	# Realistic tag inconsistencies (this is where AI shines!)
	env_variations = {
	"prod": ["prod", "production", "PROD", "Production", "prd"],
	"dev": ["dev", "development", "DEV", "Development", "develop"],
	"staging": ["staging", "stage", "stg", "STAGING"],
	"test": ["test", "testing", "TEST", "qa", "QA"]
	}

	# Ambiguous environments (requires AI judgment)
	ambiguous_envs = ["staging-prod", "prod-dr", "dev-staging", "test-prod"]

	# Team variations to test AI normalization
	teams_clean = ["Finance", "Data", "QA", "DevOps", "Security", "Frontend", "Backend", "ML", "Platform"]
	team_variations = {
	"Finance": ["finance", "Finance Team", "FIN", "finance-team"],
	"Data": ["data", "Data Science", "data-team", "DATA"],
	"QA": ["qa", "testing", "QA Team", "test", "quality"],
	"DevOps": ["devops", "ops", "SRE", "infra"],
	"Security": ["security", "sec", "infosec", "Security Team"],
	"Frontend": ["frontend", "web", "ui", "frontend-team"],
	"Backend": ["backend", "api", "backend-team", "services"],
	"ML": ["ml", "mlops", "ai", "model-training"],
	"Platform": ["platform", "platform-eng", "k8s-team"]
	}

	roles = ["web_server", "db_primary", "cache", "worker", "dr_backup", "analytics", "ml_training"]
	projects = ["phoenix", "atlas", "titan", "nexus", "quantum"]

	data = []

	for i in range(count):
	service = random.choice(services)

	# Generate resource ID
	if service == "EBS":
	resource_id = f"vol-{random.randint(100000, 999999)}"
	elif service == "RDS":
	resource_id = f"db-{random.randint(100000, 999999)}"
	else:
	resource_id = f"i-{random.randint(100000, 999999)}"

	cost = round(random.uniform(10.0, 500.0), 2)
	cpu_avg = round(random.uniform(0.0, 100.0), 1)

	# Add CPU p95 for spike detection (AI use case!)
	cpu_p95 = min(100.0, cpu_avg + random.uniform(0, 30))

	# Monthly pattern for anomaly detection
	monthly_pattern = [cpu_avg + random.uniform(-5, 10) for _ in range(12)]
	# Inject anomalies in some resources
	if i % 20 == 0:
	monthly_pattern[random.randint(0, 11)] = cpu_avg * 3 # Spike!

	last_active = random.randint(0, 90)
	owner = f"user{random.randint(1, 5)}@company.com"

	# Determine base environment
	base_env = random.choice(envs_clean)

	# Create tags with realistic inconsistencies
	tags = {}

	# 70% clean tags, 20% inconsistent, 10% ambiguous or missing
	rand = random.random()

	if rand < 0.7:
	# Clean tags
	tags["environment"] = base_env
	tags["team"] = random.choice(teams_clean)
	elif rand < 0.9:
	# Inconsistent naming (AI needed!)
	env_variant = random.choice(env_variations[base_env])
	team_variant = random.choice(team_variations[random.choice(teams_clean)])
	tags["environment"] = env_variant
	tags["team"] = team_variant
	# Sometimes duplicate keys with different casing
	if random.random() < 0.3:
	tags["Environment"] = env_variant # Duplicate!
	tags["Team"] = team_variant # Duplicate!
	else:
	# Ambiguous or missing (definitely AI!)
	if random.random() < 0.5:
	tags["environment"] = random.choice(ambiguous_envs)
	tags["team"] = random.choice(teams_clean)
	# Some resources missing critical tags

	# Add other tags
	tags["role"] = random.choice(roles)
	tags["project"] = random.choice(projects)
	tags["cost-center"] = f"CC-{random.randint(1000, 9999)}"

	# Add deprecated tags (AI should flag these)
	if random.random() < 0.1:
	tags["deprecated_owner"] = f"old_user{random.randint(1, 3)}@legacy.com"

	# Introduce realistic waste scenarios

	# 20%: Idle dev resources (Python can handle)
	if i % 5 == 0:
	tags["environment"] = "dev"
	cpu_avg = random.uniform(0, 3)
	cpu_p95 = random.uniform(0, 5)
	last_active = random.randint(30, 90)

	# 15%: Unattached volumes (Python can handle)
	if i % 7 == 0:
	service = "EBS"
	resource_id = f"vol-{random.randint(100000, 999999)}"
	cpu_avg = 0.0
	cpu_p95 = 0.0
	last_active = random.randint(30, 90)
	tags["state"] = "available"

	# 15%: Prod DR/backup (needs AI judgment - intentionally low CPU)
	if i % 6 == 0:
	tags["environment"] = "prod"
	tags["role"] = "dr_backup"
	cpu_avg = 0.5
	cpu_p95 = 2.0
	last_active = random.randint(50, 80)

	# 5%: Spot instance eligible (AI should recommend)
	spot_eligible = random.random() < 0.05

	# 3%: Reserved instance opportunities (AI should detect patterns)
	ri_candidate = random.random() < 0.03 and cpu_avg > 50

	# 5. STRATEGIC ADDITION: The "Orphan" Mystery (Sherlock Holmes Mode)
	# No tags, but the name hints at the owner.
	if i % 25 == 0:
	service = "EC2"
	cost = 850.00
	tags = {} # EMPTY TAGS! (The nightmare scenario)
	# We bury the clue in a fake 'Name' tag or just the resource ID context
	# In real billing data, 'Name' is often a tag, but sometimes resource ID has patterns.
	# Here we'll simulate a 'Name' tag that exists but 'team' tag is missing.
	clue_type = random.choice(["ml", "test", "web"])
	if clue_type == "ml":
	tags["Name"] = f"ml-training-cluster-{random.randint(1,9)}"
	elif clue_type == "test":
	tags["Name"] = f"qa-test-runner-{random.randint(100,999)}"
	else:
	tags["Name"] = f"web-frontend-v{random.randint(1,5)}"


	data.append({
	"ResourceID": resource_id,
	"Service": service,
	"Cost_Monthly": round(cost, 2),
	"CPU_avg": round(cpu_avg, 2),
	"CPU_p95": round(cpu_p95, 2),
	"monthly_cpu_pattern": json.dumps(monthly_pattern),
	"LastActiveDays": last_active,
	"OwnerEmail": owner,
	"spot_eligible": spot_eligible,
	"ri_candidate": ri_candidate,
	"Tags": json.dumps(tags)
	})

	df = pd.DataFrame(data)
	df.to_csv(filename, index=False)
	print(f"Generated {filename} with {len(df)} records.")
	print(f" - ~70% clean, ~20% tag inconsistencies, ~10% ambiguous/missing")
	print(f" - CPU patterns, anomalies, and RI opportunities included")
	return filename

	if __name__ == "__main__":
	generate_mock_data()