nam pham committed · 64d96d3 · 1 parent: ad042b1
feat: download and upload file

Browse files:
- app.py +136 -39
- data/annotated_data.json +0 -0
app.py
CHANGED
@@ -247,9 +247,10 @@ def merge_entities(entities):
         merged.append(current)
     return merged
 
-def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool):
+def annotate_text(
+    model, text, labels: List[str], threshold: float, nested_ner: bool
+) -> Dict:
     labels = [label.strip() for label in labels]
-    entities = model.predict_entities(text, labels, flat_ner=not nested_ner, threshold=threshold)
     r = {
         "text": text,
         "entities": [
@@ -260,7 +261,9 @@ def annotate_text(model: GLiNER, text, labels: List[str], threshold: float, nested_ner: bool):
                 "end": entity["end"],
                 "score": 0,
             }
-            for entity in entities
+            for entity in model.predict_entities(
+                text, labels, flat_ner=not nested_ner, threshold=threshold
+            )
         ],
     }
     r["entities"] = merge_entities(r["entities"])
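For orientation (not part of the commit): the reworked annotate_text inlines GLiNER's predict_entities into the comprehension instead of materializing an intermediate entities list. A minimal usage sketch, assuming the gliner package and a public checkpoint; the text, labels, and checkpoint name are illustrative:

    # Illustrative sketch, not the commit's code: assumes the gliner package
    # and a downloadable checkpoint; inputs are made up for the example.
    from gliner import GLiNER

    model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
    result = annotate_text(
        model,
        "Ada Lovelace worked with Charles Babbage in London.",
        labels=["person", "location"],
        threshold=0.5,
        nested_ner=False,
    )
    print(result["text"])
    print(result["entities"])  # merged spans with character offsets; "score" is forced to 0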
@@ -311,25 +314,23 @@ class AutoAnnotator:
         self.stat["current"] = -1  # Reset current progress
 
         # Process texts in batches
-        batch_size = 32  # Adjust based on your GPU memory
         processed_data = []
 
-        for i in range(0, len(data), batch_size):
-            batch_texts = data[i:i + batch_size]
+        for i, text in enumerate(data):
             if isinstance(prompt, list):
                 prompt_text = random.choice(prompt)
             else:
                 prompt_text = prompt
 
-            # Add prompt to …
-            …
+            # Add prompt to text
+            text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
 
-            # Process …
-            …
-            processed_data.…
+            # Process single text
+            result = annotate_text(self.model, text_with_prompt, labels, threshold, nested_ner)
+            processed_data.append(result)
 
             # Update progress
-            self.stat["current"] = …
+            self.stat["current"] = i + 1
 
         self.annotated_data = processed_data
         return self.annotated_data
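The switch from fixed 32-text batches to enumerate(data) gives the progress counter per-item granularity. A small sketch of turning such a counter into a fraction for a progress readout; only the "current" key appears in this hunk, so the "total" key is an assumption:

    # Sketch only: "current" mirrors the counter above; "total" is assumed.
    stat = {"current": -1, "total": 0}

    def progress_fraction(stat: dict) -> float:
        # -1 means annotation has not started yet
        if stat["total"] <= 0 or stat["current"] < 0:
            return 0.0
        return min(stat["current"] / stat["total"], 1.0)

    stat["total"] = 8
    stat["current"] = 2
    print(f"{progress_fraction(stat):.0%}")  # 25%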
@@ -338,22 +339,93 @@ class AutoAnnotator:
 annotator = None
 sentences = []
 
+def process_text_for_gliner(text: str, max_tokens: int = 384, overlap: int = 50) -> List[str]:
+    """
+    Process text for GLiNER by splitting long texts into overlapping chunks.
+    Preserves sentence boundaries and context when possible.
+
+    Args:
+        text: The input text to process
+        max_tokens: Maximum number of tokens per chunk
+        overlap: Number of tokens to overlap between chunks
+
+    Returns:
+        List of text chunks suitable for GLiNER
+    """
+    # First split into sentences to preserve natural boundaries
+    sentences = re.split(r'(?<=[.!?])\s+', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        # Tokenize the sentence
+        sentence_tokens = tokenize_text(sentence)
+        sentence_length = len(sentence_tokens)
+
+        # If a single sentence is too long, split it
+        if sentence_length > max_tokens:
+            # If we have accumulated tokens, add them as a chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+                current_chunk = []
+                current_length = 0
+
+            # Split the long sentence into smaller chunks
+            start = 0
+            while start < sentence_length:
+                end = min(start + max_tokens, sentence_length)
+                chunk_tokens = sentence_tokens[start:end]
+                chunks.append(" ".join(chunk_tokens))
+                start = end - overlap if end < sentence_length else end
+
+        # If adding this sentence would exceed max_tokens, start a new chunk
+        elif current_length + sentence_length > max_tokens:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = sentence_tokens
+            current_length = sentence_length
+        else:
+            current_chunk.extend(sentence_tokens)
+            current_length += sentence_length
+
+    # Add any remaining tokens as the final chunk
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
+    return chunks
+
 def process_uploaded_file(file_obj):
     if file_obj is None:
         return "Please upload a file first!"
 
     try:
         # Read the uploaded file
-        …
-        …
-        …
+        global sentences
+        if file_obj.name.endswith('.csv'):
+            import pandas as pd
+            df = pd.read_csv(file_obj.name)
+            sentences = df['Nội dung'].dropna().tolist()
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
+        else:
+            # Read the file content directly from the file object
+            content = file_obj.read().decode('utf-8')
+            raw_sentences = [line.strip() for line in content.splitlines() if line.strip()]
+            # Process each sentence and flatten the list
+            processed_sentences = []
+            for sentence in raw_sentences:
+                processed_sentences.extend(process_text_for_gliner(sentence))
+            sentences = processed_sentences
         return f"Successfully loaded {len(sentences)} sentences from file!"
     except Exception as e:
         return f"Error reading file: {str(e)}"
 
 def is_valid_repo_name(repo_name):
     # Hugging Face repo names must not contain slashes or spaces
-    return bool(re.match(r'^[A-Za-z0-9_…
+    return bool(re.match(r'^[A-Za-z0-9_./-]+$', repo_name))
 
 def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
     """Create a new repository on Hugging Face Hub"""
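A note on the new chunker (not part of the commit): it depends on a tokenize_text helper defined elsewhere in app.py, and because chunks are re-joined with spaces, downstream entity offsets are chunk-local rather than document-local. With a stand-in whitespace tokenizer it can be exercised in isolation:

    # Stand-alone check of process_text_for_gliner. tokenize_text is defined
    # elsewhere in app.py; a whitespace split stands in for it here.
    from typing import List

    def tokenize_text(text: str) -> List[str]:  # stand-in, not the app's tokenizer
        return text.split()

    short = process_text_for_gliner("One short sentence.")
    long = process_text_for_gliner("A filler sentence. " * 300)  # ~900 tokens
    print(len(short), len(long))  # 1 vs. 3: the long input splits at the 384-token cap

The 50-token overlap only kicks in when a single sentence exceeds max_tokens; accumulated whole sentences are packed into chunks without overlap.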
@@ -443,7 +515,7 @@ def convert_hf_dataset_to_ner_format(dataset):
 
     return converted_data
 
-def load_from_huggingface(dataset_name: str, split: str = "…
+def load_from_huggingface(dataset_name: str, split: str = "all"):
     """Load dataset from Hugging Face Hub"""
     try:
         dataset = load_dataset(dataset_name, split=split)
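For reference, the underlying datasets call is a one-liner; note that the new default split="all" only resolves on datasets that actually define a split named "all" — most Hub datasets expose train/validation/test, which is what the UI dropdown offers. A hedged sketch:

    # Illustrative: the plain datasets call behind load_from_huggingface.
    # "conll2003" mirrors the UI placeholder; older script-based datasets may
    # additionally need trust_remote_code=True depending on the installed version.
    from datasets import load_dataset

    dataset = load_dataset("conll2003", split="train")
    print(dataset.column_names)  # e.g. ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']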
@@ -797,17 +869,21 @@ with gr.Blocks() as demo:
         )
         local_status = gr.Textbox(label="Local File Status", visible=False)
 
-        …
+        with gr.Group(visible=False) as hf_inputs:
+            with gr.Row():
+                dataset_name = gr.Textbox(
+                    label="Hugging Face Dataset Name",
+                    placeholder="Enter dataset name (e.g., conll2003)",
+                    scale=3
+                )
+                dataset_split = gr.Dropdown(
+                    choices=["train", "validation", "test"],
+                    value="train",
+                    label="Dataset Split",
+                    scale=2
+                )
+                load_dataset_btn = gr.Button("Load Dataset", scale=1)
+            hf_status = gr.Textbox(label="Dataset Loading Status")
 
         bar = gr.Slider(
             minimum=0,
@@ -827,7 +903,7 @@ with gr.Blocks() as demo:
         save_btn = gr.Button("Save validated dataset")
 
         # Add Hugging Face upload section
-        with gr.Group():
+        with gr.Group(visible=False) as hf_upload_group:
             gr.Markdown("### Upload to Hugging Face")
             hf_repo_name = gr.Textbox(
                 label="Repository Name",
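create_hf_repo's body falls outside the diff's context lines; for orientation, a hypothetical implementation over huggingface_hub (an assumption, not the commit's actual code):

    # Hypothetical body for create_hf_repo; the commit only shows the signature.
    from huggingface_hub import create_repo

    def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False):
        """Create a new repository on Hugging Face Hub"""
        url = create_repo(repo_id=repo_name, repo_type=repo_type, private=private, exist_ok=True)
        return url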
@@ -846,6 +922,29 @@ with gr.Blocks() as demo:
         upload_to_hf_btn = gr.Button("Upload to Hugging Face")
         hf_upload_status = gr.Textbox(label="Upload Status")
 
+        with gr.Row():
+            show_hf_upload_btn = gr.Button("Show Upload Options")
+            hide_hf_upload_btn = gr.Button("Hide Upload Options", visible=False)
+
+        def toggle_hf_upload(show: bool):
+            return {
+                hf_upload_group: gr.update(visible=show),
+                show_hf_upload_btn: gr.update(visible=not show),
+                hide_hf_upload_btn: gr.update(visible=show)
+            }
+
+        show_hf_upload_btn.click(
+            fn=lambda: toggle_hf_upload(True),
+            inputs=None,
+            outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+        )
+
+        hide_hf_upload_btn.click(
+            fn=lambda: toggle_hf_upload(False),
+            inputs=None,
+            outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
+        )
+
         inp_box = gr.HighlightedText(value=None, interactive=True)
 
         def toggle_local_inputs():
@@ -853,8 +952,7 @@ with gr.Blocks() as demo:
                 local_file: gr.update(visible=True),
                 file_format: gr.update(visible=True),
                 local_status: gr.update(visible=True),
-                dataset_name: gr.update(visible=False),
-                dataset_split: gr.update(visible=False)
+                hf_inputs: gr.update(visible=False)
             }
 
         def toggle_hf_inputs():
@@ -862,20 +960,19 @@ with gr.Blocks() as demo:
                 local_file: gr.update(visible=False),
                 file_format: gr.update(visible=False),
                 local_status: gr.update(visible=False),
-                dataset_name: gr.update(visible=True),
-                dataset_split: gr.update(visible=True)
+                hf_inputs: gr.update(visible=True)
             }
 
         load_local_btn.click(
             fn=toggle_local_inputs,
             inputs=None,
-            outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+            outputs=[local_file, file_format, local_status, hf_inputs]
         )
 
         load_hf_btn.click(
             fn=toggle_hf_inputs,
             inputs=None,
-            outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+            outputs=[local_file, file_format, local_status, hf_inputs]
        )
 
         def process_and_load_local(file_obj, format):
@@ -893,13 +990,13 @@ with gr.Blocks() as demo:
         def load_hf_dataset(name, split):
             status = load_from_huggingface(name, split)
             if "Successfully" in status:
-                return load_dataset()
-            return [status], 0, 0
+                return load_dataset(), status
+            return [status], 0, 0, status
 
-        …
+        load_dataset_btn.click(
             fn=load_hf_dataset,
             inputs=[dataset_name, dataset_split],
-            outputs=[inp_box, bar]
+            outputs=[inp_box, bar, hf_status]
         )
 
         apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
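The toggle handlers above return dicts mapping components to gr.update(...), which Gradio applies only to the components listed in outputs. A self-contained toy of the same pattern (not from the commit):

    # Minimal, runnable toy of the visibility-toggle pattern used in this commit.
    import gradio as gr

    with gr.Blocks() as demo:
        box = gr.Textbox(label="Details", visible=False)
        show_btn = gr.Button("Show")
        hide_btn = gr.Button("Hide")

        show_btn.click(fn=lambda: {box: gr.update(visible=True)}, inputs=None, outputs=[box])
        hide_btn.click(fn=lambda: {box: gr.update(visible=False)}, inputs=None, outputs=[box])

    if __name__ == "__main__":
        demo.launch()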
data/annotated_data.json
CHANGED
The diff for this file is too large to render. See raw diff.