import logging
import os
import re
import sys
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from datasets.data_files import EmptyDatasetError
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from huggingface_hub import HfApi
from src import about
from src.display.css_html_js import custom_css
from src.plots import plot_cost_efficiency, plot_parameter_efficiency
from src.schema import AutoEvalColumn, EvalResult, fields
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
level=logging.INFO,
)
# 1. Initialization
_hf_token = os.environ.get("HF_TOKEN")
if not _hf_token:
raise ValueError("HF_TOKEN not set!")
api = HfApi(token=_hf_token)
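# The Space to restart on a schedule, plus the datasets that hold the core
# results and external submissions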
REPO_ID = "UD-Filipino/filbench-leaderboard"
REPO_RESULTS = "UD-Filipino/filbench-results"
SUBMISSION_RESULTS = "UD-Filipino/filbench-results-submission"
def restart_space():
api.restart_space(repo_id=REPO_ID)
# 2. Load and populate leaderboard data
def get_results(
    source: str, aggregate: bool = False, submissions: str | None = None
) -> tuple[pd.DataFrame, list]:
"""Load results from a given source and return a DataFrame with the relevant columns.
If `aggregate` is True, it returns the aggregated results.
source (str): The source dataset to load results from.
aggregate (bool): Whether to return aggregated results or not.
submissions (str, optional): The submissions dataset to load results from.
RETURNS (tuple[pd.DataFrame, list]): A tuple containing the DataFrame with results and a list of master columns.
"""
results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
raw_data = [EvalResult.init_from_dict(result) for result in results]
if submissions:
try:
submission_results = (
load_dataset(
submissions, split="train", download_mode="force_redownload"
)
.to_pandas()
.to_dict(orient="records")
)
except EmptyDatasetError:
logging.info("Empty dataset for submissions, skipping...")
submission_results = []
if len(submission_results) == 0:
logging.info("No external submissions found!")
else:
logging.info(f"Found {len(submission_results)} submission/s!")
raw_data += [
EvalResult.init_from_dict(result, is_submission=True)
for result in submission_results
]
all_data_json = [v.to_dict() for v in raw_data]
df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
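    # Note: despite its name, "Incomplete" is True when a row has *no* missing
    # scores; the "Hide incomplete evaluations" filter keeps rows where it is True.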
df["Incomplete"] = ~df.isna().any(axis=1)
master_columns = []
for col in fields(AutoEvalColumn):
if col.meta:
master_columns.append(col.name)
if aggregate:
if col.aggregate:
master_columns.append(col.name)
else:
if not col.aggregate:
master_columns.append(col.name)
cols = [
c.name
for c in fields(AutoEvalColumn)
if not c.hidden and c.name in master_columns
]
cols.append("Incomplete")
df = df[cols].round(decimals=2)
return df, master_columns
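# Illustrative usage (a sketch, not executed here): load the detailed,
# per-benchmark view together with any external submissions.
#   df, master_columns = get_results(
#       source=REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
#   )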
def init_leaderboard(
    source: str, aggregate: bool = False, submissions: str | None = None
) -> Leaderboard:
df, master_columns = get_results(
source=source, aggregate=aggregate, submissions=submissions
)
return Leaderboard(
value=df,
datatype=[c.type for c in fields(AutoEvalColumn) if c.name in master_columns],
select_columns=SelectColumns(
default_selection=[
c.name
for c in fields(AutoEvalColumn)
if c.displayed_by_default and c.name in master_columns
],
cant_deselect=[
c.name
for c in fields(AutoEvalColumn)
if c.never_hidden and c.name in master_columns
],
label="Select Columns to Display:",
),
filter_columns=[
# fmt: off
ColumnFilter("Incomplete", type="boolean", label="Hide incomplete evaluations", default=True),
ColumnFilter("Submission", type="boolean", label="Show only submitted results", default=False),
# ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model type"),
ColumnFilter(AutoEvalColumn.multilingual.name, type="checkboxgroup", label="Multilinguality"),
ColumnFilter(AutoEvalColumn.param_size.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)", default=[-1, 83]),
# fmt: on
],
search_columns=[AutoEvalColumn.model.name],
hide_columns=[
c.name
for c in fields(AutoEvalColumn)
if c.hidden and c.name in master_columns
],
interactive=False,
)
def get_clean_df() -> pd.DataFrame:
df, _ = get_results(
source=REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
)
df_agg, _ = get_results(
source=REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
)
# Cleanup
    def extract_names(html_string):
        # Pull the link text out of an anchor tag, e.g. "<a ...>name</a>" -> "name"
        match = re.search(r"<a[^>]*>(.*?)</a>", html_string)
        if match:
            return match.group(1)
        # Fall back to the raw value so .apply() never produces None
        return html_string
def remove_emojis(string):
emoji_pattern = re.compile(
"["
"\U0001f600-\U0001f64f" # emoticons
"\U0001f300-\U0001f5ff" # symbols & pictographs
"\U0001f680-\U0001f6ff" # transport & map symbols
"\U0001f700-\U0001f77f" # alchemical symbols
"\U0001f780-\U0001f7ff" # Geometric Shapes Extended
"\U0001f800-\U0001f8ff" # Supplemental Arrows-C
"\U0001f900-\U0001f9ff" # Supplemental Symbols and Pictographs
"\U0001fa00-\U0001fa6f" # Chess Symbols
"\U0001fa70-\U0001faff" # Symbols and Pictographs Extended-A
"\U00002702-\U000027b0" # Dingbats
"\U000024c2-\U0001f251"
"]+",
flags=re.UNICODE,
)
return emoji_pattern.sub(r"", string).strip()
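    # Illustrative behavior: remove_emojis("πŸ… FilBench") -> "FilBench", and
    # extract_names('<a href="...">model-name</a>') -> "model-name"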
df["Model"] = df["Model"].apply(extract_names)
df = df.rename(columns={col: remove_emojis(col).strip() for col in df.columns})
df["Multilingual"] = df["Multilingual"].apply(remove_emojis)
df["Model Type"] = df["Model Type"].apply(remove_emojis)
df = df.reset_index(drop=True)
# Cleanup the aggregated dataset
df_agg["Model"] = df_agg["Model"].apply(extract_names)
df_agg = df_agg.rename(
columns={col: remove_emojis(col).strip() for col in df_agg.columns}
)
df_agg = df_agg.reset_index(drop=True)
df_agg = df_agg[
[
"Model",
"Cultural Knowledge",
"Classical NLP",
"Reading Comprehension",
"Generation",
]
]
df_agg = df_agg.rename(
columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
)
df_merge = df.merge(df_agg, on="Model")
return df_merge
def download_results():
df = get_clean_df()
filepath = "filbench_results.csv"
df.to_csv(filepath, index=False)
return filepath
# 3. Actual setup of the HF Space
demo = gr.Blocks(css=custom_css)
with demo:
with gr.Column(scale=6):
num_models = len(
get_results(REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS)[0]
)
gr.Markdown(about.TOP_TEXT.format(str(num_models)))
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem(
"πŸ… FilBench Leaderboard", elem_id="llm-benchmark-tab-table", id=0
):
leaderboard = init_leaderboard(
REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
)
with gr.TabItem(
"πŸ” FilBench - Detailed", elem_id="llm-benchmark-tab-table", id=1
):
leaderboard = init_leaderboard(
REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
)
with gr.TabItem("πŸ“Š Analysis", id=2):
df = get_clean_df()
with gr.Row():
with gr.Column():
gr.Markdown("## Parameter-Efficiency Plot")
plot_parameter_efficiency(df)
gr.Markdown(
"Model performance on FilBench with respect to their parameter size. "
"For mixture-of-experts models, we plot their full parameter count. "
"In general, we find that model size and performance are positively correlated."
)
with gr.Column():
gr.Markdown("## Cost-Efficiency Plot")
plot_cost_efficiency(df)
gr.Markdown(
"Model performance on FilBench with respect to their per-token output cost ($/1M tokens). "
"We use the token-pricing as published in [OpenRouter](https://openrouter.ai/models). "
"For models not in OpenRouter, we either exlude them from the chart or use the cost of the base model it was finetuned from."
)
with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=3):
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.Row():
download_button = gr.DownloadButton("Download results (CSV)")
download_button.click(download_results, outputs=download_button)
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=about.CITATION_BUTTON_TEXT,
label=about.CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
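# Restart the Space every hour so freshly pushed results (including new
# external submissions) are reloaded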
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()