import gradio as gr
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np
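
# Gradio leaderboard app: it loads per-model result files from benchmarks/
# (the "acc,none"-style keys read below suggest lm-evaluation-harness output),
# rolls MMLU sub-task scores up into category averages, and renders a styled
# table plus scatter, stacked-bar, and grouped-bar comparison plots.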


def create_benchmark_plot(df):
    """Grouped bar chart comparing the top models across all benchmark columns."""
    if df.empty:
        return None
    df_copy = df.copy()
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
    # Rank by the sum of all benchmark scores and keep at most the top 10 models.
    df_copy['Total_Score'] = df_copy[score_columns].sum(axis=1)
    df_sorted = df_copy.sort_values(by='Total_Score', ascending=False)
    top_models = df_sorted.head(10)  # head() also handles frames with <= 10 rows
    benchmarks = score_columns
    models = top_models['Model'].unique()
    x = np.arange(len(benchmarks))
    # Split a 0.8-wide slot at each benchmark tick evenly among the models.
    width = 0.8 / len(models) if len(models) > 0 else 0.8
    fig, ax = plt.subplots(figsize=(30, 10))
    all_scores = []
    for i, model in enumerate(models):
        model_data = top_models[top_models['Model'] == model]
        scores = [model_data[benchmark].values[0] if not model_data[benchmark].empty else 0
                  for benchmark in benchmarks]
        all_scores.extend(scores)
        # Center the group on the tick: offsets run from -w*(n-1)/2 to +w*(n-1)/2.
        offset = width * i - (width * (len(models) - 1) / 2)
        rects = ax.bar(x + offset, scores, width, label=model)
        ax.bar_label(rects, padding=3)
    ax.set_ylabel('Scores')
    ax.set_xticks(x)
    ax.set_xticklabels(benchmarks, rotation=45, ha="right")
    ax.legend(loc='lower right')
    if all_scores:
        # Leave headroom above the tallest bar so the bar labels are not clipped.
        ax.set_ylim(top=max(all_scores) * 1.15)
    plt.tight_layout()
    return fig
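
# A minimal sketch (not called by the app) of the offset math above: for n
# models sharing a 0.8-wide slot, the returned offsets keep the group of bars
# centered on each benchmark tick. _demo_grouped_bar_offsets is a hypothetical
# helper added here only for illustration.
def _demo_grouped_bar_offsets(n_models=3, slot=0.8):
    width = slot / n_models
    # e.g. n_models=3 -> [-0.2667, 0.0, 0.2667] (to 4 d.p.)
    return [width * i - width * (n_models - 1) / 2 for i in range(n_models)]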


def load_leaderboard_data():
    """Read every benchmarks/results_*.json file into one leaderboard DataFrame."""
    data = []
    benchmarks_dir = "benchmarks"
    # MMLU sub-tasks rolled up into three named category averages; anything
    # not listed here is swept into "mmlu_other" below.
    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }
    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]
    # Every sub-task not already assigned to a category becomes part of "mmlu_other".
    categorized = sum(mmlu_categories.values(), [])
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorized]
    for filename in os.listdir(benchmarks_dir):
        if filename.endswith(".json") and filename.startswith("results_"):
            filepath = os.path.join(benchmarks_dir, filename)
            with open(filepath, 'r') as f:
                content = json.load(f)
            # Prefer the model_name recorded in the file; fall back to the filename.
            model_name = content.get("model_name")
            if not model_name:
                model_name = os.path.splitext(filename)[0]
            model_name = os.path.basename(model_name.rstrip('/'))
            results = content.get("results", {})
            ifeval_score = results.get("ifeval", {}).get("prompt_level_strict_acc,none")
            mmlu_score = results.get("mmlu", {}).get("acc,none")
            row = {"Model": model_name, "IFEval": ifeval_score, "MMLU": mmlu_score}
            for score_name in all_mmlu_scores:
                row[score_name] = results.get(score_name, {}).get("acc,none")
            # Each category column is the mean of its sub-task scores, ignoring
            # sub-tasks that are missing from this results file.
            for category, scores in mmlu_categories.items():
                category_scores = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
                category_scores = [s for s in category_scores if pd.notna(s)]
                row[category] = sum(category_scores) / len(category_scores) if category_scores else np.nan
            data.append(row)
    df_raw = pd.DataFrame(data)
    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
    # Deduplicate models, keeping each model's highest-scoring row.
    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
    for col in score_columns:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce').fillna(0)
    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
    df_sorted = df_raw.sort_values(by='Total_Score', ascending=False)
    df = df_sorted.drop_duplicates(subset=['Model'], keep='first').copy()
    df = df.drop(columns=['Total_Score'])
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)
    df.fillna(0, inplace=True)
    return df
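
# For reference, the loader expects result files shaped roughly like the
# sketch below (the "acc,none" / "prompt_level_strict_acc,none" keys match
# what the code reads above; the concrete names and values are made up):
#
# benchmarks/results_example.json
# {
#     "model_name": "org/example-model",
#     "results": {
#         "ifeval": {"prompt_level_strict_acc,none": 0.42},
#         "mmlu": {"acc,none": 0.61},
#         "mmlu_college_biology": {"acc,none": 0.58}
#     }
# }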


def style_diff(df, all_data_df):
    """Return a pandas Styler tinting each column's best and worst scores.

    `all_data_df` is currently unused; it is kept so existing call sites
    do not need to change.
    """
    def highlight_max(s):
        # Green tint on the column maximum.
        s_numeric = pd.to_numeric(s, errors='coerce')
        max_val = s_numeric.max()
        return ['background-color: #68a055' if v == max_val else '' for v in s_numeric]

    def highlight_min(s):
        # Red tint on the smallest non-zero value (zeros stand in for missing scores).
        s_numeric = pd.to_numeric(s, errors='coerce')
        s_filtered = s_numeric[s_numeric > 0]
        if s_filtered.empty:
            return ['' for _ in s_numeric]
        min_val = s_filtered.min()
        return ['background-color: #d4605b' if v == min_val else '' for v in s_numeric]

    df_styler = df.style
    for col in df.columns:
        if col != 'Model':
            numeric_col = pd.to_numeric(df[col], errors='coerce')
            if not numeric_col.isnull().all():
                df_styler = df_styler.apply(highlight_max, subset=[col], axis=0)
                df_styler = df_styler.apply(highlight_min, subset=[col], axis=0)
    return df_styler
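
# Toy illustration of the highlighting rules (hedged; the two-row frame below
# is invented for this comment):
# >>> toy = pd.DataFrame({"Model": ["a", "b"], "MMLU": [0.7, 0.5]})
# >>> style_diff(toy, toy)  # "a" gets the green max tint, "b" the red min tint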


def prepare_plot_data(df, all_cols=False):
    """Sort and decorate a copy of `df` for plotting.

    all_cols=True: rank by the sum of all benchmark columns, keep the top 10,
    and add a zero-padded "Ranked_Model" label (see the note after this
    function). all_cols=False: rank by MMLU + IFEval for the scatter plot.
    """
    df_plot = df.copy()
    if not df_plot.empty:
        if all_cols:
            score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            for col in score_columns:
                df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0)
            df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1)
            df_plot = df_plot.sort_values(by='Total_Score', ascending=False).reset_index(drop=True)
            df_plot = df_plot.head(10)
            df_plot['Ranked_Model'] = [f"{i+1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
        else:
            df_plot['MMLU_IFEval_Combined'] = df_plot['MMLU'].fillna(0) + df_plot['IFEval'].fillna(0)
            df_plot = df_plot.sort_values(by='MMLU_IFEval_Combined', ascending=False).reset_index(drop=True)
    return df_plot
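
# The zero-padded labels ("01. model-a", "02. model-b", ...) make lexicographic
# order match rank order, so the bar plot's model axis lists models best-first.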


# Load everything once at startup; the table shows only the summary columns.
initial_df = load_leaderboard_data()
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df[display_cols].copy()
for col in display_df.columns:
    if col != 'Model':
        display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)

with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Rebuild every plot and the table for the selected models (all if none)."""
        if not selected_models:
            df_to_plot = initial_df
        else:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]
        scatter_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=False)
        # Pad the axis limits so points never sit on the plot border.
        padding_factor = 0.1
        min_padding = 0.05
        if not scatter_plot_df.empty:
            x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
            x_range = x_max - x_min
            x_padding = max(x_range * padding_factor, min_padding) if x_range > 0 else min_padding
            x_lim = [x_min - x_padding, x_max + x_padding]
            y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
            y_range = y_max - y_min
            y_padding = max(y_range * padding_factor, min_padding) if y_range > 0 else min_padding
            y_lim = [y_min - y_padding, y_max + y_padding]
        else:
            x_lim = [0, 1]
            y_lim = [0, 1]
            scatter_plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])
        scatter_plot_update = gr.ScatterPlot(
            value=scatter_plot_df,
            x="MMLU",
            y="IFEval",
            color="Model",
            title="Model Performance",
            x_lim=x_lim,
            y_lim=y_lim,
        )
        bar_plot_df = prepare_plot_data(df_to_plot.copy(), all_cols=True)
        if not bar_plot_df.empty:
            # Long format: one (model, benchmark, score) row per bar segment.
            value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
            melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                         var_name='Benchmark', value_name='Score')
        else:
            melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])
        bar_plot_update = gr.BarPlot(
            value=melted_df,
            x="Score",
            y="Ranked_Model",
            color="Benchmark",
            title="MMLU and IFEval Scores by Model",
            x_title="Score",
            y_title="Model",
            color_legend_title="Benchmark",
            vertical=False,
        )
        benchmark_plot_update = create_benchmark_plot(df_to_plot)
        if not selected_models:
            df_to_display = display_df
        else:
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
        styled_df = style_diff(df_to_display, initial_df)
        return scatter_plot_update, bar_plot_update, benchmark_plot_update, styled_df
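
    # The four return values above must stay in the same order as the outputs
    # list wired up in model_selector.change(...) at the bottom of this block.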

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    # Initial render mirrors the no-selection branch of update_plots.
                    scatter_plot_df = prepare_plot_data(initial_df.copy(), all_cols=False)
                    padding_factor = 0.1
                    min_padding = 0.05
                    x_min, x_max = scatter_plot_df['MMLU'].min(), scatter_plot_df['MMLU'].max()
                    x_range = x_max - x_min
                    x_padding = max(x_range * padding_factor, min_padding)
                    x_lim = [x_min - x_padding, x_max + x_padding]
                    y_min, y_max = scatter_plot_df['IFEval'].min(), scatter_plot_df['IFEval'].max()
                    y_range = y_max - y_min
                    y_padding = max(y_range * padding_factor, min_padding)
                    y_lim = [y_min - y_padding, y_max + y_padding]
                    scatterplot = gr.ScatterPlot(
                        value=scatter_plot_df,
                        x="MMLU",
                        y="IFEval",
                        color="Model",
                        title="Model Performance",
                        x_lim=x_lim,
                        y_lim=y_lim,
                    )
                    bar_plot_df = prepare_plot_data(initial_df.copy(), all_cols=True)
                    value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
                    melted_df = bar_plot_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                                 var_name='Benchmark', value_name='Score')
                    barplot = gr.BarPlot(
                        value=melted_df,
                        x="Score",
                        y="Ranked_Model",
                        color="Benchmark",
                        title="MMLU and IFEval Scores by Model",
                        x_title="Score",
                        y_title="Model",
                        color_legend_title="Benchmark",
                        vertical=False,
                    )
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )
    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )
    # Re-render every plot and the table whenever the model selection changes.
    model_selector.change(update_plots, inputs=model_selector,
                          outputs=[scatterplot, barplot, benchmark_plot, dataframe])

if __name__ == "__main__":
    demo.launch()
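    # Note: demo.launch(server_name="0.0.0.0") would expose the app on the
    # local network; the default binds to 127.0.0.1. (A standard Gradio launch
    # option, not something this app configures.)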