# app.py
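# Gradio leaderboard app: loads benchmark result JSON files (lm-evaluation-harness
# style) from the benchmarks/ directory, aggregates MMLU sub-scores into category
# columns, and renders interactive plots plus a styled score table.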
import gradio as gr
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np
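
# Build a grouped matplotlib bar chart comparing up to the top 10 models
# (ranked by their summed scores) across the headline and aggregated MMLU columns.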
def create_benchmark_plot(df):
if df.empty:
return None
df_copy = df.copy()
score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
for col in score_columns:
df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
df_copy['Total_Score'] = df_copy[score_columns].sum(axis=1)
df_sorted = df_copy.sort_values(by='Total_Score', ascending=False)
if len(df_sorted) > 10:
top_models = df_sorted.head(10)
else:
top_models = df_sorted
benchmarks = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
models = top_models['Model'].unique()
x = np.arange(len(benchmarks))
width = 0.8 / len(models) if len(models) > 0 else 0.8
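# Each benchmark slot spans 0.8 units; split it evenly across the plotted
# models so their bars sit side by side within each group.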
fig, ax = plt.subplots(figsize=(30, 10))
all_scores = []
for i, model in enumerate(models):
model_data = top_models[top_models['Model'] == model]
scores = [model_data[benchmark].values[0] if not model_data[benchmark].empty else 0 for benchmark in benchmarks]
all_scores.extend(scores)
offset = width * i - (width * (len(models) - 1) / 2)
rects = ax.bar(x + offset, scores, width, label=model)
ax.bar_label(rects, padding=3)
ax.set_ylabel('Scores')
ax.set_xticks(x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")
ax.legend(loc='lower right')
if all_scores:
ax.set_ylim(top=max(all_scores) * 1.15)
plt.tight_layout()
return fig
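
# Read every results_*.json file in benchmarks/, pull the IFEval and MMLU headline
# scores plus per-subject MMLU accuracies, average the sub-scores into category
# columns, and return a deduplicated DataFrame sorted by total score.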
def load_leaderboard_data():
data = []
benchmarks_dir = "benchmarks"
mmlu_categories = {
"mmlu_professional": [
"mmlu_professional_accounting", "mmlu_professional_law",
"mmlu_professional_medicine", "mmlu_professional_psychology"
],
"mmlu_college": [
"mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
"mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
],
"mmlu_high_school": [
"mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
"mmlu_high_school_european_history", "mmlu_high_school_geography",
"mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
"mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
"mmlu_high_school_physics", "mmlu_high_school_psychology",
"mmlu_high_school_statistics", "mmlu_high_school_us_history",
"mmlu_high_school_world_history"
]
}
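# Every per-subject MMLU key that may appear in a results file; subjects not
# covered by the category lists above are averaged into "mmlu_other".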
all_mmlu_scores = [
"mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
"mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
"mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
"mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
"mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
"mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
"mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
"mmlu_high_school_european_history", "mmlu_high_school_geography",
"mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
"mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
"mmlu_high_school_physics", "mmlu_high_school_psychology",
"mmlu_high_school_statistics", "mmlu_high_school_us_history",
"mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
"mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
"mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
"mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
"mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
"mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
"mmlu_professional_law", "mmlu_professional_medicine",
"mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
"mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
"mmlu_virology", "mmlu_world_religions"
]
other_mmlu_scores = [s for s in all_mmlu_scores if s not in sum(mmlu_categories.values(), [])]
mmlu_categories["mmlu_other"] = other_mmlu_scores
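# Expected result-file shape (inferred from the keys read below):
# {
#   "model_name": "org/model",
#   "results": {
#     "ifeval": {"prompt_level_strict_acc,none": ...},
#     "mmlu": {"acc,none": ...},
#     "mmlu_college_biology": {"acc,none": ...},
#     ...
#   }
# }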
for filename in os.listdir(benchmarks_dir):
if filename.endswith(".json") and filename.startswith("results_"):
filepath = os.path.join(benchmarks_dir, filename)
with open(filepath, 'r') as f:
content = json.load(f)
model_name = content.get("model_name")
if not model_name:
model_name = os.path.splitext(filename)[0]
if model_name.endswith('/'):
model_name = model_name.rstrip('/')
model_name = os.path.basename(model_name)
results = content.get("results", {})
ifeval_score = results.get("ifeval", {}).get("prompt_level_strict_acc,none")
mmlu_score = results.get("mmlu", {}).get("acc,none")
row = {"Model": model_name, "IFEval": ifeval_score, "MMLU": mmlu_score}
for score_name in all_mmlu_scores:
row[score_name] = results.get(score_name, {}).get("acc,none")
for category, scores in mmlu_categories.items():
category_scores = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
category_scores = [s for s in category_scores if pd.notna(s)]
if category_scores:
row[category] = sum(category_scores) / len(category_scores)
else:
row[category] = np.nan
data.append(row)
df_raw = pd.DataFrame(data)
numeric_cols = [col for col in df_raw.columns if col != 'Model']
for col in numeric_cols:
df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
for col in score_columns:
df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0)
df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
df_sorted = df_raw.sort_values(by='Total_Score', ascending=False)
df = df_sorted.drop_duplicates(subset=['Model'], keep='first').copy()
df = df.drop(columns=['Total_Score'])
for col in numeric_cols:
df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)
df.fillna(0, inplace=True)
return df
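
# Return a pandas Styler that highlights each column's best score in green and its
# lowest non-zero score in red (the all_data_df argument is currently unused).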
def style_diff(df, all_data_df):
def highlight_max(s):
s_numeric = pd.to_numeric(s, errors='coerce')
max_val = s_numeric.max()
return ['background-color: #68a055' if v == max_val else '' for v in s_numeric]
def highlight_min(s):
s_numeric = pd.to_numeric(s, errors='coerce')
s_filtered = s_numeric[s_numeric > 0]
if s_filtered.empty:
return ['' for _ in s_numeric]
min_val = s_filtered.min()
return ['background-color: #d4605b' if v == min_val else '' for v in s_numeric]
df_styler = df.style
for col in df.columns:
if col != 'Model':
numeric_col = pd.to_numeric(df[col], errors='coerce')
if not numeric_col.isnull().all():
df_styler = df_styler.apply(highlight_max, subset=[col], axis=0)
df_styler = df_styler.apply(highlight_min, subset=[col], axis=0)
return df_styler
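
# Prepare data for plotting: with all_cols=True, keep the top 10 models ranked by
# summed benchmark scores and add a numbered "Ranked_Model" label; otherwise sort
# by the combined MMLU + IFEval score for the scatter plot.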
def prepare_plot_data(df, all_cols=False):
df_plot = df.copy()
if not df_plot.empty:
if all_cols:
score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
for col in score_columns:
df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0)
df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1)
df_plot = df_plot.sort_values(by='Total_Score', ascending=False).reset_index(drop=True)
df_plot = df_plot.head(10)
df_plot['Ranked_Model'] = [f"{i+1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
else:
df_plot['MMLU_IFEval_Combined'] = df_plot['MMLU'].fillna(0) + df_plot['IFEval'].fillna(0)
df_plot = df_plot.sort_values(by='MMLU_IFEval_Combined', ascending=False).reset_index(drop=True)
return df_plot
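
# Load the leaderboard once at startup and keep a trimmed copy for the table view.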
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
if col != 'Model':
display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)
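
# Build the Gradio UI: plots, model filter, and leaderboard table.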
with gr.Blocks() as demo:
gr.Markdown("# Model Leaderboard")
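# Rebuild the scatter plot, bar plot, benchmark-comparison plot, and styled table
# for the selected models (or for all models when nothing is selected).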
def update_plots(selected_models):
if not selected_models:
df_to_plot = initial_df
else:
df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]
scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)
padding_factor = 0.1
min_padding = 0.05
if not scatter_plot_df.empty:
x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
x_range = x_max - x_min
x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
x_lim = [x_min - x_padding, x_max + x_padding]
y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
y_range = y_max - y_min
y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
y_lim = [y_min - y_padding, y_max + y_padding]
else:
x_lim = [0, 1]
y_lim = [0, 1]
scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])
scatter_plot_update = gr.ScatterPlot(
value=scatter_plot_df,
x="MMLU",
y="IFEval",
color="Model",
title="Model Performance",
x_lim=x_lim,
y_lim=y_lim,
)
bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)
if not bar_plot_df.empty:
value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
var_name='Benchmark', value_name='Score')
else:
melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])
bar_plot_update = gr.BarPlot(
value=melted_df,
x="Score",
y="Ranked_Model",
color="Benchmark",
title="MMLU and IFEval Scores by Model",
x_title="Score",
y_title="Model",
color_legend_title="Benchmark",
vertical=False,
)
benchmark_plot_update = create_benchmark_plot(df_to_plot)
if not selected_models:
df_to_display = display_df
styled_df = style_diff(df_to_display, initial_df)
else:
df_to_display = display_df[display_df['Model'].isin(selected_models)]
styled_df = style_diff(df_to_display, initial_df)
return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df
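# Layout: an accordion holding the summary plots and the benchmark-comparison tab,
# followed by a multi-select model filter and the styled leaderboard table.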
with gr.Accordion("Plots", open=True):
with gr.Tabs():
with gr.TabItem("Summary Plots"):
with gr.Row():
scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)
padding_factor = 0.1
min_padding = 0.05
x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
x_range = x_max - x_min
x_padding = max(x_range * padding_factor, min_padding)
x_lim = [x_min - x_padding, x_max + x_padding]
y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
y_range = y_max - y_min
y_padding = max(y_range * padding_factor, min_padding)
y_lim = [y_min - y_padding, y_max + y_padding]
scatterplot = gr.ScatterPlot(
value=scatter_plot_df,
x="MMLU",
y="IFEval",
color="Model",
title="Model Performance",
x_lim=x_lim,
y_lim=y_lim,
)
bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
var_name='Benchmark', value_name='Score')
barplot = gr.BarPlot(
value=melted_df,
x="Score",
y="Ranked_Model",
color="Benchmark",
title="MMLU and IFEval Scores by Model",
x_title="Score",
y_title="Model",
color_legend_title="Benchmark",
vertical=False,
)
with gr.TabItem("Benchmark Comparison"):
with gr.Row():
benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))
model_names = initial_df["Model"].tolist()
model_selector = gr.Dropdown(
choices=model_names,
label="Select Models to Display",
multiselect=True,
info="Select one or more models to display on the plots. If none are selected, all models will be shown."
)
with gr.Row():
dataframe = gr.DataFrame(
value=style_diff(display_df, initial_df),
type="pandas",
column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
wrap=True
)
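# Re-render every plot and the table whenever the model selection changes.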
model_selector.change(update_plots, inputs=model_selector, outputs=[scatterplot, barplot, benchmark_plot, dataframe])
if __name__ == "__main__":
demo.launch()