Spaces:
Running
Running
Commit
·
15fe18d
1
Parent(s):
7ffe204
Add option to download file
Browse files- .gitignore +1 -0
- app.py +72 -2
.gitignore
CHANGED
|
@@ -11,3 +11,4 @@ eval-results/
|
|
| 11 |
eval-queue-bk/
|
| 12 |
eval-results-bk/
|
| 13 |
logs/
|
|
|
|
|
|
| 11 |
eval-queue-bk/
|
| 12 |
eval-results-bk/
|
| 13 |
logs/
|
| 14 |
+
filbench_results.csv
|
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
|
@@ -27,7 +28,7 @@ def restart_space():
|
|
| 27 |
|
| 28 |
|
| 29 |
# 2. Load and populate leaderboard data
|
| 30 |
-
def
|
| 31 |
results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
|
| 32 |
raw_data = [EvalResult.init_from_dict(result) for result in results]
|
| 33 |
all_data_json = [v.to_dict() for v in raw_data]
|
|
@@ -35,7 +36,6 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
|
| 35 |
df = pd.DataFrame.from_records(all_data_json)
|
| 36 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
| 37 |
df["Incomplete"] = ~df.isna().any(axis=1)
|
| 38 |
-
|
| 39 |
master_columns = []
|
| 40 |
for col in fields(AutoEvalColumn):
|
| 41 |
if col.meta:
|
|
@@ -54,6 +54,11 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
|
| 54 |
]
|
| 55 |
cols.append("Incomplete")
|
| 56 |
df = df[cols].round(decimals=2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
return Leaderboard(
|
| 59 |
value=df,
|
|
@@ -90,6 +95,68 @@ def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
|
| 90 |
)
|
| 91 |
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
# 3. Actual setup of the HF Space
|
| 94 |
demo = gr.Blocks(css=custom_css)
|
| 95 |
with demo:
|
|
@@ -111,6 +178,9 @@ with demo:
|
|
| 111 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 112 |
|
| 113 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
| 114 |
with gr.Accordion("π Citation", open=False):
|
| 115 |
citation_button = gr.Textbox(
|
| 116 |
value=about.CITATION_BUTTON_TEXT,
|
|
|
|
| 1 |
import os
|
| 2 |
+
import re
|
| 3 |
|
| 4 |
import gradio as gr
|
| 5 |
import pandas as pd
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
# 2. Load and populate leaderboard data
|
| 31 |
+
def get_results(source: str, aggregate: bool = False) -> pd.DataFrame:
|
| 32 |
results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
|
| 33 |
raw_data = [EvalResult.init_from_dict(result) for result in results]
|
| 34 |
all_data_json = [v.to_dict() for v in raw_data]
|
|
|
|
| 36 |
df = pd.DataFrame.from_records(all_data_json)
|
| 37 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
| 38 |
df["Incomplete"] = ~df.isna().any(axis=1)
|
|
|
|
| 39 |
master_columns = []
|
| 40 |
for col in fields(AutoEvalColumn):
|
| 41 |
if col.meta:
|
|
|
|
| 54 |
]
|
| 55 |
cols.append("Incomplete")
|
| 56 |
df = df[cols].round(decimals=2)
|
| 57 |
+
return df, master_columns
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def init_leaderboard(source: str, aggregate: bool = False) -> Leaderboard:
|
| 61 |
+
df, master_columns = get_results(source=source, aggregate=aggregate)
|
| 62 |
|
| 63 |
return Leaderboard(
|
| 64 |
value=df,
|
|
|
|
| 95 |
)
|
| 96 |
|
| 97 |
|
| 98 |
+
def download_results() -> str:
    """Export the full and aggregated leaderboard results to a CSV file.

    Results are loaded twice from ``REPO_RESULTS`` via ``get_results``: once
    with ``aggregate=False`` and once with ``aggregate=True``. Both frames
    are cleaned (HTML anchors in the "Model" column reduced to their link
    text, emojis removed from column names), the aggregated category columns
    are prefixed with ``agg_``, and the two frames are joined on "Model".

    Returns:
        The path of the CSV file that was written ("filbench_results.csv",
        relative to the current working directory).
    """
    df, _ = get_results(source=REPO_RESULTS, aggregate=False)
    df_agg, _ = get_results(source=REPO_RESULTS, aggregate=True)

    # Compile both patterns once per export instead of once per cell.
    anchor_pattern = re.compile(r"<a[^>]*>(.*?)</a>")
    # NOTE(review): the final range (U+24C2-U+1F251) is very broad and also
    # strips non-emoji characters in that span (e.g. CJK text). It is kept
    # unchanged so the exported column names stay the same -- confirm whether
    # it should be narrowed.
    emoji_pattern = re.compile(
        "["
        "\U0001f600-\U0001f64f"  # emoticons
        "\U0001f300-\U0001f5ff"  # symbols & pictographs
        "\U0001f680-\U0001f6ff"  # transport & map symbols
        "\U0001f700-\U0001f77f"  # alchemical symbols
        "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
        "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
        "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
        "\U0001fa00-\U0001fa6f"  # Chess Symbols
        "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027b0"  # Dingbats
        "\U000024c2-\U0001f251"
        "]+",
        flags=re.UNICODE,
    )

    def extract_names(html_string):
        """Return the text inside the first ``<a>`` tag, or the input as-is."""
        if not isinstance(html_string, str):
            # Leave missing / non-text cells untouched instead of raising.
            return html_string
        match = anchor_pattern.search(html_string)
        # Fall back to the raw value: returning None here would blank the
        # model name and make unrelated rows match each other in the merge.
        return match.group(1) if match else html_string

    def remove_emojis(string):
        """Strip every character matched by ``emoji_pattern`` from a string."""
        if not isinstance(string, str):
            # Leave missing / non-text cells untouched instead of raising.
            return string
        return emoji_pattern.sub("", string)

    # Cleanup the full dataset
    df["Model"] = df["Model"].apply(extract_names)
    df = df.rename(columns={col: remove_emojis(col).strip() for col in df.columns})
    df["Multilingual"] = df["Multilingual"].apply(remove_emojis)
    df["Model Type"] = df["Model Type"].apply(remove_emojis)
    df = df.reset_index(drop=True)

    # Cleanup the aggregated dataset
    df_agg["Model"] = df_agg["Model"].apply(extract_names)
    df_agg = df_agg.rename(
        columns={col: remove_emojis(col).strip() for col in df_agg.columns}
    )
    df_agg = df_agg.reset_index(drop=True)
    # Keep only the join key and the four aggregated category scores.
    df_agg = df_agg[
        [
            "Model",
            "Cultural Knowledge",
            "Classical NLP",
            "Reading Comprehension",
            "Generation",
        ]
    ]
    # Prefix the category columns so they are distinguishable after the merge.
    df_agg = df_agg.rename(
        columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
    )

    # Combine the full and aggregated results
    df_merge = df.merge(df_agg, on="Model")
    filepath = "filbench_results.csv"
    df_merge.to_csv(filepath, index=False)
    return filepath
|
| 158 |
+
|
| 159 |
+
|
| 160 |
# 3. Actual setup of the HF Space
|
| 161 |
demo = gr.Blocks(css=custom_css)
|
| 162 |
with demo:
|
|
|
|
| 178 |
gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
| 179 |
|
| 180 |
with gr.Row():
|
| 181 |
+
download_button = gr.DownloadButton("Download results (CSV)")
|
| 182 |
+
download_button.click(download_results, outputs=download_button)
|
| 183 |
+
|
| 184 |
with gr.Accordion("π Citation", open=False):
|
| 185 |
citation_button = gr.Textbox(
|
| 186 |
value=about.CITATION_BUTTON_TEXT,
|