import gradio as gr
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np
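
# Gradio leaderboard app: it loads per-model result files from benchmarks/
# (the "acc,none"-style keys read below suggest lm-evaluation-harness output),
# rolls MMLU sub-task scores up into category averages, and renders a styled
# table plus scatter, stacked-bar, and grouped-bar comparison plots.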


def create_benchmark_plot(df):
    """Grouped bar chart comparing the top models across all benchmark columns."""
    if df.empty:
        return None
    df_copy = df.copy()
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
    # Rank by the sum of all benchmark scores and keep at most the top 10 models.
    df_copy['Total_Score'] = df_copy[score_columns].sum(axis=1)
    df_sorted = df_copy.sort_values(by='Total_Score', ascending=False)
    top_models = df_sorted.head(10)  # head() also handles frames with <= 10 rows
    benchmarks = score_columns
    models = top_models['Model'].unique()
    x = np.arange(len(benchmarks))
    # Split a 0.8-wide slot at each benchmark tick evenly among the models.
    width = 0.8 / len(models) if len(models) > 0 else 0.8
    fig, ax = plt.subplots(figsize=(30, 10))
    all_scores = []
    for i, model in enumerate(models):
        model_data = top_models[top_models['Model'] == model]
        scores = [model_data[benchmark].values[0] if not model_data[benchmark].empty else 0
                  for benchmark in benchmarks]
        all_scores.extend(scores)
        # Center the group on the tick: offsets run from -w*(n-1)/2 to +w*(n-1)/2.
        offset = width * i - (width * (len(models) - 1) / 2)
        rects = ax.bar(x + offset, scores, width, label=model)
        ax.bar_label(rects, padding=3)
    ax.set_ylabel('Scores')
    ax.set_xticks(x)
    ax.set_xticklabels(benchmarks, rotation=45, ha="right")
    ax.legend(loc='lower right')
    if all_scores:
        # Leave headroom above the tallest bar so the bar labels are not clipped.
        ax.set_ylim(top=max(all_scores) * 1.15)
    plt.tight_layout()
    return fig
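
# A minimal sketch (not called by the app) of the offset math above: for n
# models sharing a 0.8-wide slot, the returned offsets keep the group of bars
# centered on each benchmark tick. _demo_grouped_bar_offsets is a hypothetical
# helper added here only for illustration.
def _demo_grouped_bar_offsets(n_models=3, slot=0.8):
    width = slot / n_models
    # e.g. n_models=3 -> [-0.2667, 0.0, 0.2667] (to 4 d.p.)
    return [width * i - width * (n_models - 1) / 2 for i in range(n_models)]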


def load_leaderboard_data():
    """Read every benchmarks/results_*.json file into one leaderboard DataFrame."""
    data = []
    benchmarks_dir = "benchmarks"
    # MMLU sub-tasks rolled up into three named category averages; anything
    # not listed here is swept into "mmlu_other" below.
    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }
    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]
    # Every sub-task not already assigned to a category becomes part of "mmlu_other".
    categorized = sum(mmlu_categories.values(), [])
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorized]
    for filename in os.listdir(benchmarks_dir):
        if filename.endswith(".json") and filename.startswith("results_"):
            filepath = os.path.join(benchmarks_dir, filename)
            with open(filepath, 'r') as f:
                content = json.load(f)
            # Prefer the model_name recorded in the file; fall back to the filename.
            model_name = content.get("model_name")
            if not model_name:
                model_name = os.path.splitext(filename)[0]
            model_name = os.path.basename(model_name.rstrip('/'))
            results = content.get("results", {})
            ifeval_score = results.get("ifeval", {}).get("prompt_level_strict_acc,none")
            mmlu_score = results.get("mmlu", {}).get("acc,none")
            row = {"Model": model_name, "IFEval": ifeval_score, "MMLU": mmlu_score}
            for score_name in all_mmlu_scores:
                row[score_name] = results.get(score_name, {}).get("acc,none")
            # Each category column is the mean of its sub-task scores, ignoring
            # sub-tasks that are missing from this results file.
            for category, scores in mmlu_categories.items():
                category_scores = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
                category_scores = [s for s in category_scores if pd.notna(s)]
                row[category] = sum(category_scores) / len(category_scores) if category_scores else np.nan
            data.append(row)
    df_raw = pd.DataFrame(data)
    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
    # Deduplicate models, keeping each model's highest-scoring row.
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0)
    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
    df_sorted = df_raw.sort_values(by='Total_Score', ascending=False)
    df = df_sorted.drop_duplicates(subset=['Model'], keep='first').copy()
    df = df.drop(columns=['Total_Score'])
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)
    df.fillna(0, inplace=True)
    return df
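
# For reference, the loader expects result files shaped roughly like the
# sketch below (the "acc,none" / "prompt_level_strict_acc,none" keys match
# what the code reads above; the concrete names and values are made up):
#
# benchmarks/results_example.json
# {
#     "model_name": "org/example-model",
#     "results": {
#         "ifeval": {"prompt_level_strict_acc,none": 0.42},
#         "mmlu": {"acc,none": 0.61},
#         "mmlu_college_biology": {"acc,none": 0.58}
#     }
# }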


def style_diff(df, all_data_df):
    """Return a pandas Styler tinting each column's best and worst scores.

    `all_data_df` is currently unused; it is kept so existing call sites
    do not need to change.
    """
    def highlight_max(s):
        # Green tint on the column maximum.
        s_numeric = pd.to_numeric(s, errors='coerce')
        max_val = s_numeric.max()
        return ['background-color: #68a055' if v == max_val else '' for v in s_numeric]

    def highlight_min(s):
        # Red tint on the smallest non-zero value (zeros stand in for missing scores).
        s_numeric = pd.to_numeric(s, errors='coerce')
        s_filtered = s_numeric[s_numeric > 0]
        if s_filtered.empty:
            return ['' for _ in s_numeric]
        min_val = s_filtered.min()
        return ['background-color: #d4605b' if v == min_val else '' for v in s_numeric]

    df_styler = df.style
    for col in df.columns:
        if col != 'Model':
            numeric_col = pd.to_numeric(df[col], errors='coerce')
            if not numeric_col.isnull().all():
                df_styler = df_styler.apply(highlight_max, subset=[col], axis=0)
                df_styler = df_styler.apply(highlight_min, subset=[col], axis=0)
    return df_styler
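
# Toy illustration of the highlighting rules (hedged; the two-row frame below
# is invented for this comment):
# >>> toy = pd.DataFrame({"Model": ["a", "b"], "MMLU": [0.7, 0.5]})
# >>> style_diff(toy, toy)  # "a" gets the green max tint, "b" the red min tint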


def prepare_plot_data(df, all_cols=False):
    """Sort and decorate a copy of `df` for plotting.

    all_cols=True: rank by the sum of all benchmark columns, keep the top 10,
    and add a zero-padded "Ranked_Model" label (see the note after this
    function). all_cols=False: rank by MMLU + IFEval for the scatter plot.
    """
    df_plot = df.copy()
    if not df_plot.empty:
        if all_cols:
            score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            for col in score_columns:
                df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0)
            df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1)
            df_plot = df_plot.sort_values(by='Total_Score', ascending=False).reset_index(drop=True)
            df_plot = df_plot.head(10)
            df_plot['Ranked_Model'] = [f"{i+1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
        else:
            df_plot['MMLU_IFEval_Combined'] = df_plot['MMLU'].fillna(0) + df_plot['IFEval'].fillna(0)
            df_plot = df_plot.sort_values(by='MMLU_IFEval_Combined', ascending=False).reset_index(drop=True)
    return df_plot
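
# The zero-padded labels ("01. model-a", "02. model-b", ...) make lexicographic
# order match rank order, so the bar plot's model axis lists models best-first.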


# Load everything once at startup; the table shows only the summary columns.
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)

with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Rebuild every plot and the table for the selected models (all if none)."""
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]
        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)
        # Pad the axis limits so points never sit on the plot border.
        padding_factor = 0.1
        min_padding = 0.05
        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]
            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])
        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )
        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)
        if not bar_plot_df.empty:
            # Long format: one (model, benchmark, score) row per bar segment.
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])
        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )
        benchmark_plot_update = create_benchmark_plot(df_to_plot)
        if not selected_models:
            df_to_display = display_df
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
        styled_df = style_diff(df_to_display, initial_df)
        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df
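
    # The four return values above must stay in the same order as the outputs
    # list wired up in model_selector.change(...) at the bottom of this block.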

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    # Initial render mirrors the no-selection branch of update_plots.
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)
                    padding_factor = 0.1
                    min_padding = 0.05
                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]
                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]
                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )
                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')
                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )
    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )
    # Re-render every plot and the table whenever the model selection changes.
    model_selector.change(update_plots, inputs=model_selector,
                          outputs=[scatterplot, barplot, benchmark_plot, dataframe])

if __name__ == "__main__":
    demo.launch()
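    # Note: demo.launch(server_name="0.0.0.0") would expose the app on the
    # local network; the default binds to 127.0.0.1. (A standard Gradio launch
    # option, not something this app configures.)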