Spaces:
Running
Running
Commit
·
b5b19aa
1
Parent(s):
814a536
Apply pre-commit and all fixes
Browse files
- README.md +2 -2
- app.py +7 -29
- requirements.txt +3 -3
- src/about.py +1 -1
- src/display/css_html_js.py +2 -2
- src/display/utils.py +0 -2
- src/leaderboard/read_evals.py +0 -1
- src/submission/check_validity.py +3 -5
- src/submission/submit.py +2 -6
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# HF Leaderboard Backend
|
| 2 |
|
| 3 |
-
This is a fork of the [leaderboard demo from HuggingFace](https://huggingface.co/demo-leaderboard-backend) with some additional scripts for parsing the results from our evaluation runs.
|
| 4 |
|
| 5 |
## Set-up and installation
|
| 6 |
|
|
@@ -21,4 +21,4 @@ If you want to update the HuggingFace space, you should add a remote pointing to
|
|
| 21 |
```sh
|
| 22 |
git remote add hf https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard
|
| 23 |
git push hf main
|
| 24 |
-
```
|
|
|
|
| 1 |
# HF Leaderboard Backend
|
| 2 |
|
| 3 |
+
This is a fork of the [leaderboard demo from HuggingFace](https://huggingface.co/demo-leaderboard-backend) with some additional scripts for parsing the results from our evaluation runs.
|
| 4 |
|
| 5 |
## Set-up and installation
|
| 6 |
|
|
|
|
| 21 |
```sh
|
| 22 |
git remote add hf https://huggingface.co/spaces/UD-Filipino/filbench-leaderboard
|
| 23 |
git push hf main
|
| 24 |
+
```
|
app.py
CHANGED
|
@@ -1,38 +1,16 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import pandas as pd
|
| 3 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 4 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
| 5 |
from huggingface_hub import snapshot_download
|
| 6 |
|
| 7 |
-
from src.about import (
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
EVALUATION_QUEUE_TEXT,
|
| 11 |
-
INTRODUCTION_TEXT,
|
| 12 |
-
LLM_BENCHMARKS_TEXT,
|
| 13 |
-
TITLE,
|
| 14 |
-
)
|
| 15 |
from src.display.css_html_js import custom_css
|
| 16 |
-
from src.display.utils import (
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
EVAL_TYPES,
|
| 21 |
-
AutoEvalColumn,
|
| 22 |
-
ModelType,
|
| 23 |
-
Precision,
|
| 24 |
-
WeightType,
|
| 25 |
-
fields,
|
| 26 |
-
)
|
| 27 |
-
from src.envs import (
|
| 28 |
-
API,
|
| 29 |
-
EVAL_REQUESTS_PATH,
|
| 30 |
-
EVAL_RESULTS_PATH,
|
| 31 |
-
QUEUE_REPO,
|
| 32 |
-
REPO_ID,
|
| 33 |
-
RESULTS_REPO,
|
| 34 |
-
TOKEN,
|
| 35 |
-
)
|
| 36 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 37 |
from src.submission.submit import add_new_eval
|
| 38 |
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
| 2 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 3 |
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
|
| 4 |
from huggingface_hub import snapshot_download
|
| 5 |
|
| 6 |
+
from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
|
| 7 |
+
from src.about import EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT
|
| 8 |
+
from src.about import TITLE
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from src.display.css_html_js import custom_css
|
| 10 |
+
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES
|
| 11 |
+
from src.display.utils import AutoEvalColumn, ModelType, Precision, WeightType, fields
|
| 12 |
+
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID
|
| 13 |
+
from src.envs import RESULTS_REPO, TOKEN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
| 15 |
from src.submission.submit import add_new_eval
|
| 16 |
|
requirements.txt
CHANGED
|
@@ -3,14 +3,14 @@ black
|
|
| 3 |
datasets
|
| 4 |
gradio
|
| 5 |
gradio[oauth]
|
| 6 |
-
gradio_leaderboard==0.0.13
|
| 7 |
gradio_client
|
|
|
|
| 8 |
huggingface-hub>=0.18.0
|
| 9 |
matplotlib
|
| 10 |
numpy
|
| 11 |
pandas
|
| 12 |
python-dateutil
|
|
|
|
|
|
|
| 13 |
tqdm
|
| 14 |
transformers
|
| 15 |
-
tokenizers>=0.15.0
|
| 16 |
-
sentencepiece
|
|
|
|
| 3 |
datasets
|
| 4 |
gradio
|
| 5 |
gradio[oauth]
|
|
|
|
| 6 |
gradio_client
|
| 7 |
+
gradio_leaderboard==0.0.13
|
| 8 |
huggingface-hub>=0.18.0
|
| 9 |
matplotlib
|
| 10 |
numpy
|
| 11 |
pandas
|
| 12 |
python-dateutil
|
| 13 |
+
sentencepiece
|
| 14 |
+
tokenizers>=0.15.0
|
| 15 |
tqdm
|
| 16 |
transformers
|
|
|
|
|
|
src/about.py
CHANGED
|
@@ -30,7 +30,7 @@ Intro text
|
|
| 30 |
"""
|
| 31 |
|
| 32 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 33 |
-
LLM_BENCHMARKS_TEXT =
|
| 34 |
## How it works
|
| 35 |
|
| 36 |
## Reproducibility
|
|
|
|
| 30 |
"""
|
| 31 |
|
| 32 |
# Which evaluations are you running? how can people reproduce what you have?
|
| 33 |
+
LLM_BENCHMARKS_TEXT = """
|
| 34 |
## How it works
|
| 35 |
|
| 36 |
## Reproducibility
|
src/display/css_html_js.py
CHANGED
|
@@ -33,7 +33,7 @@ custom_css = """
|
|
| 33 |
background: none;
|
| 34 |
border: none;
|
| 35 |
}
|
| 36 |
-
|
| 37 |
#search-bar {
|
| 38 |
padding: 0px;
|
| 39 |
}
|
|
@@ -77,7 +77,7 @@ custom_css = """
|
|
| 77 |
#filter_type label > .wrap{
|
| 78 |
width: 103px;
|
| 79 |
}
|
| 80 |
-
#filter_type label > .wrap .wrap-inner{
|
| 81 |
padding: 2px;
|
| 82 |
}
|
| 83 |
#filter_type label > .wrap .wrap-inner input{
|
|
|
|
| 33 |
background: none;
|
| 34 |
border: none;
|
| 35 |
}
|
| 36 |
+
|
| 37 |
#search-bar {
|
| 38 |
padding: 0px;
|
| 39 |
}
|
|
|
|
| 77 |
#filter_type label > .wrap{
|
| 78 |
width: 103px;
|
| 79 |
}
|
| 80 |
+
#filter_type label > .wrap .wrap-inner{
|
| 81 |
padding: 2px;
|
| 82 |
}
|
| 83 |
#filter_type label > .wrap .wrap-inner input{
|
src/display/utils.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
-
import pandas as pd
|
| 5 |
-
|
| 6 |
from src.about import Tasks
|
| 7 |
|
| 8 |
|
|
|
|
| 1 |
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
|
|
|
|
|
|
| 4 |
from src.about import Tasks
|
| 5 |
|
| 6 |
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import glob
|
| 2 |
import json
|
| 3 |
-
import math
|
| 4 |
import os
|
| 5 |
from dataclasses import dataclass
|
| 6 |
|
|
|
|
| 1 |
import glob
|
| 2 |
import json
|
|
|
|
| 3 |
import os
|
| 4 |
from dataclasses import dataclass
|
| 5 |
|
src/submission/check_validity.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
-
import re
|
| 4 |
from collections import defaultdict
|
| 5 |
-
from datetime import datetime, timedelta, timezone
|
| 6 |
|
| 7 |
import huggingface_hub
|
| 8 |
from huggingface_hub import ModelCard
|
|
@@ -53,7 +51,7 @@ def is_model_on_hub(
|
|
| 53 |
)
|
| 54 |
if test_tokenizer:
|
| 55 |
try:
|
| 56 |
-
|
| 57 |
model_name,
|
| 58 |
revision=revision,
|
| 59 |
trust_remote_code=trust_remote_code,
|
|
@@ -68,7 +66,7 @@ def is_model_on_hub(
|
|
| 68 |
except Exception as e:
|
| 69 |
return (
|
| 70 |
False,
|
| 71 |
-
"'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
|
| 72 |
None,
|
| 73 |
)
|
| 74 |
return True, None, config
|
|
@@ -81,7 +79,7 @@ def is_model_on_hub(
|
|
| 81 |
)
|
| 82 |
|
| 83 |
except Exception as e:
|
| 84 |
-
return False, "was not found on hub!", None
|
| 85 |
|
| 86 |
|
| 87 |
def get_model_size(model_info: ModelInfo, precision: str):
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
|
|
|
| 3 |
from collections import defaultdict
|
|
|
|
| 4 |
|
| 5 |
import huggingface_hub
|
| 6 |
from huggingface_hub import ModelCard
|
|
|
|
| 51 |
)
|
| 52 |
if test_tokenizer:
|
| 53 |
try:
|
| 54 |
+
AutoTokenizer.from_pretrained(
|
| 55 |
model_name,
|
| 56 |
revision=revision,
|
| 57 |
trust_remote_code=trust_remote_code,
|
|
|
|
| 66 |
except Exception as e:
|
| 67 |
return (
|
| 68 |
False,
|
| 69 |
+
f"'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?: {e}",
|
| 70 |
None,
|
| 71 |
)
|
| 72 |
return True, None, config
|
|
|
|
| 79 |
)
|
| 80 |
|
| 81 |
except Exception as e:
|
| 82 |
+
return False, f"was not found on hub! {e}", None
|
| 83 |
|
| 84 |
|
| 85 |
def get_model_size(model_info: ModelInfo, precision: str):
|
src/submission/submit.py
CHANGED
|
@@ -4,12 +4,8 @@ from datetime import datetime, timezone
|
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
|
| 7 |
-
from src.submission.check_validity import (
|
| 8 |
-
|
| 9 |
-
check_model_card,
|
| 10 |
-
get_model_size,
|
| 11 |
-
is_model_on_hub,
|
| 12 |
-
)
|
| 13 |
|
| 14 |
REQUESTED_MODELS = None
|
| 15 |
USERS_TO_SUBMISSION_DATES = None
|
|
|
|
| 4 |
|
| 5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
| 6 |
from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
|
| 7 |
+
from src.submission.check_validity import already_submitted_models, check_model_card
|
| 8 |
+
from src.submission.check_validity import get_model_size, is_model_on_hub
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
REQUESTED_MODELS = None
|
| 11 |
USERS_TO_SUBMISSION_DATES = None
|