code

File size: 15,102 Bytes

ae48413

import gradio as gr
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np

def create_benchmark_plot(df):
    """Draw a grouped bar chart comparing the best models across benchmarks.

    Models are ranked by the sum of their benchmark scores (missing or
    non-numeric scores count as 0) and at most the ten best are plotted:
    one group of bars per benchmark, one bar per model.

    Args:
        df: DataFrame with a 'Model' column and one column for each name in
            ``benchmarks`` below. It is not modified.

    Returns:
        The matplotlib Figure, or None when ``df`` is empty.
    """
    if df.empty:
        return None

    benchmarks = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']

    df_copy = df.copy()
    for col in benchmarks:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce').fillna(0)
    df_copy['Total_Score'] = df_copy[benchmarks].sum(axis=1)

    # head(10) already copes with frames shorter than ten rows.
    top_models = df_copy.sort_values(by='Total_Score', ascending=False).head(10)
    models = top_models['Model'].unique()

    x = np.arange(len(benchmarks))
    # Share 80% of each benchmark slot between the models' bars.
    width = 0.8 / len(models) if len(models) > 0 else 0.8

    fig, ax = plt.subplots(figsize=(30, 10))

    all_scores = []
    for i, model in enumerate(models):
        model_data = top_models[top_models['Model'] == model]
        # The first row wins if a model appears more than once. The emptiness
        # guard matters for a NaN model name, which never compares equal.
        scores = [
            model_data[benchmark].values[0] if not model_data[benchmark].empty else 0
            for benchmark in benchmarks
        ]
        all_scores.extend(scores)
        # Centre the whole group of bars on the benchmark's tick.
        offset = width * i - (width * (len(models) - 1) / 2)
        rects = ax.bar(x + offset, scores, width, label=model)
        ax.bar_label(rects, padding=3)

    ax.set_ylabel('Scores')
    ax.set_xticks(x)
    ax.set_xticklabels(benchmarks, rotation=45, ha="right")
    ax.legend(loc='lower right')

    # Leave headroom for the bar labels. With no positive score, scaling the
    # maximum would collapse the axis, so keep matplotlib's automatic limits.
    peak = max(all_scores, default=0)
    if peak > 0:
        ax.set_ylim(top=peak * 1.15)

    fig.tight_layout()

    # pyplot keeps every figure it creates alive until it is closed, and this
    # function runs again on each model-selection change. Deregister the
    # figure here; the Figure object itself stays usable by the caller.
    plt.close(fig)

    return fig

def load_leaderboard_data():
    """Load per-model benchmark scores from ``benchmarks/results_*.json``.

    Each result file contributes one row: the model name (taken from the
    file's "model_name", falling back to the file name without extension,
    and reduced to its last path component), the IFEval and overall MMLU
    scores, every individual MMLU score listed in ``all_mmlu_scores``, and
    the mean of the available scores in each MMLU category.

    When several files describe the same model, the row with the highest
    sum of the six summary scores is kept.

    Returns:
        DataFrame with a 'Model' column followed by numeric score columns.
        Scores are rounded to 4 decimals and missing scores are 0. When no
        result file is found the frame is empty but has the same columns.

    Raises:
        FileNotFoundError: if the ``benchmarks`` directory does not exist.
        json.JSONDecodeError: if a result file is not valid JSON.
    """
    benchmarks_dir = "benchmarks"

    mmlu_categories = {
        "mmlu_professional": [
            "mmlu_professional_accounting", "mmlu_professional_law",
            "mmlu_professional_medicine", "mmlu_professional_psychology"
        ],
        "mmlu_college": [
            "mmlu_college_biology", "mmlu_college_chemistry", "mmlu_college_computer_science",
            "mmlu_college_mathematics", "mmlu_college_medicine", "mmlu_college_physics"
        ],
        "mmlu_high_school": [
            "mmlu_high_school_biology", "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
            "mmlu_high_school_european_history", "mmlu_high_school_geography",
            "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
            "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
            "mmlu_high_school_physics", "mmlu_high_school_psychology",
            "mmlu_high_school_statistics", "mmlu_high_school_us_history",
            "mmlu_high_school_world_history"
        ]
    }

    all_mmlu_scores = [
        "mmlu_abstract_algebra", "mmlu_anatomy", "mmlu_astronomy", "mmlu_business_ethics",
        "mmlu_clinical_knowledge", "mmlu_college_biology", "mmlu_college_chemistry",
        "mmlu_college_computer_science", "mmlu_college_mathematics", "mmlu_college_medicine",
        "mmlu_college_physics", "mmlu_computer_security", "mmlu_conceptual_physics",
        "mmlu_econometrics", "mmlu_electrical_engineering", "mmlu_elementary_mathematics",
        "mmlu_formal_logic", "mmlu_global_facts", "mmlu_high_school_biology",
        "mmlu_high_school_chemistry", "mmlu_high_school_computer_science",
        "mmlu_high_school_european_history", "mmlu_high_school_geography",
        "mmlu_high_school_government_and_politics", "mmlu_high_school_macroeconomics",
        "mmlu_high_school_mathematics", "mmlu_high_school_microeconomics",
        "mmlu_high_school_physics", "mmlu_high_school_psychology",
        "mmlu_high_school_statistics", "mmlu_high_school_us_history",
        "mmlu_high_school_world_history", "mmlu_human_aging", "mmlu_human_sexuality",
        "mmlu_humanities", "mmlu_international_law", "mmlu_jurisprudence",
        "mmlu_logical_fallacies", "mmlu_machine_learning", "mmlu_management",
        "mmlu_marketing", "mmlu_medical_genetics", "mmlu_miscellaneous",
        "mmlu_moral_disputes", "mmlu_moral_scenarios", "mmlu_nutrition", "mmlu_other",
        "mmlu_philosophy", "mmlu_prehistory", "mmlu_professional_accounting",
        "mmlu_professional_law", "mmlu_professional_medicine",
        "mmlu_professional_psychology", "mmlu_public_relations", "mmlu_security_studies",
        "mmlu_social_sciences", "mmlu_sociology", "mmlu_stem", "mmlu_us_foreign_policy",
        "mmlu_virology", "mmlu_world_religions"
    ]

    # Everything not claimed by a named category is averaged into "mmlu_other".
    # That includes "mmlu_humanities", "mmlu_other", "mmlu_social_sciences" and
    # "mmlu_stem" from the list above; the raw "mmlu_other" result is one of
    # the inputs of the average that then replaces it in the row.
    # NOTE(review): those four look like aggregate group scores rather than
    # individual subjects - confirm that averaging them in is intended.
    categorised = {name for names in mmlu_categories.values() for name in names}
    mmlu_categories["mmlu_other"] = [s for s in all_mmlu_scores if s not in categorised]

    score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
    # Same column order a populated row produces; also used for the empty frame.
    columns = [
        "Model", "IFEval", "MMLU", *all_mmlu_scores,
        *(c for c in mmlu_categories if c not in all_mmlu_scores),
    ]

    data = []
    # Sorted so that the result does not depend on directory listing order.
    for filename in sorted(os.listdir(benchmarks_dir)):
        if not (filename.startswith("results_") and filename.endswith(".json")):
            continue

        filepath = os.path.join(benchmarks_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = json.load(f)

        model_name = content.get("model_name") or os.path.splitext(filename)[0]
        # Model names may be paths, possibly with a trailing slash; keep only
        # the last component.
        model_name = os.path.basename(model_name.rstrip('/'))

        results = content.get("results", {})
        row = {
            "Model": model_name,
            "IFEval": results.get("ifeval", {}).get("prompt_level_strict_acc,none"),
            "MMLU": results.get("mmlu", {}).get("acc,none"),
        }
        for score_name in all_mmlu_scores:
            row[score_name] = results.get(score_name, {}).get("acc,none")

        for category, scores in mmlu_categories.items():
            values = [pd.to_numeric(row.get(s), errors='coerce') for s in scores]
            values = [v for v in values if pd.notna(v)]
            row[category] = sum(values) / len(values) if values else np.nan

        data.append(row)

    if not data:
        # No result files: return the expected schema instead of failing
        # later on a missing score column.
        return pd.DataFrame(columns=columns)

    df_raw = pd.DataFrame(data, columns=columns)

    numeric_cols = [col for col in df_raw.columns if col != 'Model']
    for col in numeric_cols:
        df_raw[col] = pd.to_numeric(df_raw[col], errors='coerce')
    for col in score_columns:
        df_raw[col] = df_raw[col].fillna(0)

    # Rank by the summary scores so that, for duplicated model names, the
    # best-scoring row is the one that survives drop_duplicates.
    df_raw['Total_Score'] = df_raw[score_columns].sum(axis=1)
    df = (
        df_raw.sort_values(by='Total_Score', ascending=False, kind='mergesort')
        .drop_duplicates(subset=['Model'], keep='first')
        .drop(columns=['Total_Score'])
        .copy()
    )

    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: round(x, 4) if pd.notna(x) else x)

    return df.fillna(0)

def style_diff(df, all_data_df):
    """Return a Styler for ``df`` that colours the extremes of each column.

    In every column except 'Model' that holds at least one numeric value,
    cells equal to the column maximum get a green background and cells
    equal to the smallest value greater than 0 get a red background.

    Args:
        df: DataFrame to style.
        all_data_df: Unused; kept so existing callers keep working.

    Returns:
        The pandas Styler built from ``df``.
    """
    max_css = 'background-color: #68a055'
    min_css = 'background-color: #d4605b'

    def highlight_extremes(s):
        values = pd.to_numeric(s, errors='coerce')
        max_val = values.max()
        # Zeros are excluded from the minimum, so a zero is never marked as
        # the worst value.
        positive = values[values > 0]
        min_val = positive.min() if not positive.empty else None

        styles = []
        for v in values:
            if v == max_val:
                # The maximum takes precedence: with a single row, or when
                # all values are equal, the best value is also the smallest
                # and must not end up painted as the worst one.
                styles.append(max_css)
            elif min_val is not None and v == min_val:
                styles.append(min_css)
            else:
                styles.append('')
        return styles

    df_styler = df.style
    for col in df.columns:
        if col == 'Model':
            continue
        # Columns without a single numeric value have no extremes to mark.
        if pd.to_numeric(df[col], errors='coerce').isnull().all():
            continue
        df_styler = df_styler.apply(highlight_extremes, subset=[col], axis=0)
    return df_styler

def prepare_plot_data(df, all_cols=False):
    """Return a sorted copy of ``df`` with the columns the plots rely on.

    Args:
        df: Leaderboard DataFrame with a 'Model' column and score columns.
            It is not modified.
        all_cols: When True, rank by the sum of the six summary scores, keep
            the ten best rows and add 'Total_Score' and 'Ranked_Model'
            ("01. name", "02. name", ...). When False, sort by
            MMLU + IFEval and add 'MMLU_IFEval_Combined'.

    Returns:
        The prepared DataFrame, best rows first, with a fresh 0..n-1 index.
        For input without rows the derived columns are still present, so
        callers can rely on the same schema.
    """
    df_plot = df.copy()

    if df_plot.empty:
        if len(df_plot.index) == 0:
            derived = (
                {'Total_Score': float, 'Ranked_Model': object}
                if all_cols
                else {'MMLU_IFEval_Combined': float}
            )
            for name, dtype in derived.items():
                df_plot[name] = pd.Series(dtype=dtype)
        return df_plot

    if all_cols:
        score_columns = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
        for col in score_columns:
            df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce').fillna(0)
        df_plot['Total_Score'] = df_plot[score_columns].sum(axis=1)
        # A stable sort keeps the input order among equal totals; copy() so
        # the new column is not assigned onto a slice of the sorted frame.
        df_plot = (
            df_plot.sort_values(by='Total_Score', ascending=False, kind='mergesort')
            .reset_index(drop=True)
            .head(10)
            .copy()
        )
        df_plot['Ranked_Model'] = [f"{i+1:02d}. {model}" for i, model in enumerate(df_plot['Model'])]
    else:
        # Coerce as in the all_cols branch so that non-numeric cells count
        # as 0 instead of breaking the addition.
        mmlu = pd.to_numeric(df_plot['MMLU'], errors='coerce').fillna(0)
        ifeval = pd.to_numeric(df_plot['IFEval'], errors='coerce').fillna(0)
        df_plot['MMLU_IFEval_Combined'] = mmlu + ifeval
        df_plot = df_plot.sort_values(
            by='MMLU_IFEval_Combined', ascending=False, kind='mergesort'
        ).reset_index(drop=True)

    return df_plot

# Load the leaderboard once at import time; every plot and table below is
# derived from this frame.
initial_df = load_leaderboard_data()

# The table shows the model name plus the six summary scores only.
display_cols = ['Model', 'IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
display_df = initial_df.loc[:, display_cols].copy()
for col in display_cols:
    if col == 'Model':
        continue
    # Force every score column to numbers, with 0 for anything unparseable.
    display_df[col] = pd.to_numeric(display_df[col], errors='coerce').fillna(0)

def _axis_limits(values, padding_factor=0.1, min_padding=0.05):
    """Return ``[low, high]`` axis limits padded around the range of ``values``.

    The padding is ``padding_factor`` times the value range, but at least
    ``min_padding``. Falls back to ``[0, 1]`` when ``values`` has no usable
    minimum/maximum (for example when it is empty).
    """
    low, high = values.min(), values.max()
    if pd.isna(low) or pd.isna(high):
        return [0, 1]
    padding = max((high - low) * padding_factor, min_padding)
    return [low - padding, high + padding]


def _scatter_plot_kwargs(df):
    """Build the gr.ScatterPlot arguments (MMLU vs. IFEval) for ``df``."""
    plot_df = prepare_plot_data(df.copy(), all_cols=False)
    if plot_df.empty:
        plot_df = pd.DataFrame(columns=['Model', 'MMLU', 'IFEval', 'MMLU_IFEval_Combined'])
    return dict(
        value=plot_df,
        x="MMLU",
        y="IFEval",
        color="Model",
        title="Model Performance",
        x_lim=_axis_limits(plot_df['MMLU']),
        y_lim=_axis_limits(plot_df['IFEval']),
    )


def _bar_plot_kwargs(df):
    """Build the gr.BarPlot arguments (scores of the ranked models) for ``df``."""
    ranked_df = prepare_plot_data(df.copy(), all_cols=True)
    if ranked_df.empty:
        # Nothing to rank: hand the plot an empty frame with the columns it reads.
        melted_df = pd.DataFrame(columns=['Ranked_Model', 'Benchmark', 'Score'])
    else:
        value_vars = ['IFEval', 'MMLU', 'mmlu_professional', 'mmlu_college', 'mmlu_high_school', 'mmlu_other']
        # Long format: one row per (model, benchmark) pair.
        melted_df = ranked_df.melt(id_vars='Ranked_Model', value_vars=value_vars,
                                   var_name='Benchmark', value_name='Score')
    return dict(
        value=melted_df,
        x="Score",
        y="Ranked_Model",
        color="Benchmark",
        title="MMLU and IFEval Scores by Model",
        x_title="Score",
        y_title="Model",
        color_legend_title="Benchmark",
        vertical=False,
    )


with gr.Blocks() as demo:
    gr.Markdown("# Model Leaderboard")

    def update_plots(selected_models):
        """Rebuild the plots and the table for the selected models.

        Args:
            selected_models: Model names chosen in the dropdown; an empty
                selection means "show every model".

        Returns:
            Tuple of (scatter plot, bar plot, benchmark figure, styled
            table), in the order of the ``outputs`` wired to the dropdown.
        """
        if selected_models:
            df_to_plot = initial_df[initial_df['Model'].isin(selected_models)]
            df_to_display = display_df[display_df['Model'].isin(selected_models)]
        else:
            df_to_plot = initial_df
            df_to_display = display_df

        return (
            gr.ScatterPlot(**_scatter_plot_kwargs(df_to_plot)),
            gr.BarPlot(**_bar_plot_kwargs(df_to_plot)),
            create_benchmark_plot(df_to_plot),
            style_diff(df_to_display, initial_df),
        )

    with gr.Accordion("Plots", open=True):
        with gr.Tabs():
            with gr.TabItem("Summary Plots"):
                with gr.Row():
                    # Built with the same helpers as update_plots so the
                    # first render and later updates cannot drift apart.
                    scatterplot = gr.ScatterPlot(**_scatter_plot_kwargs(initial_df))
                    barplot = gr.BarPlot(**_bar_plot_kwargs(initial_df))
            with gr.TabItem("Benchmark Comparison"):
                with gr.Row():
                    benchmark_plot = gr.Plot(value=create_benchmark_plot(initial_df))

    model_names = initial_df["Model"].tolist()
    model_selector = gr.Dropdown(
        choices=model_names,
        label="Select Models to Display",
        multiselect=True,
        info="Select one or more models to display on the plots. If none are selected, all models will be shown."
    )

    with gr.Row():
        dataframe = gr.DataFrame(
            value=style_diff(display_df, initial_df),
            type="pandas",
            column_widths=["30%", "10%", "10%", "12%", "10%", "10%", "10%"],
            wrap=True
        )

    model_selector.change(update_plots, inputs=model_selector, outputs=[scatterplot, barplot, benchmark_plot, dataframe])

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()