nam pham committed on
Commit · ffa19f8
1 Parent(s): 345f1ee
fix: upload to huggingface

Changed files:
- app.py +30 -15
- data/annotated_data.json +0 -0
app.py
CHANGED
@@ -295,10 +295,16 @@ def batch_annotate_text(model: GLiNER, texts: List[str], labels: List[str], thre
 
 class AutoAnnotator:
     def __init__(
-        self, model: str = "…
-        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        self, model: str = "BookingCare/gliner-multi-healthcare",
+        # device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+        device = torch.device('cpu')
     ) -> None:
 
+        # Set PyTorch memory management settings
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
         self.model = GLiNER.from_pretrained(model).to(device)
         self.annotated_data = []
         self.stat = {
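A note on the memory settings added above: PyTorch reads PYTORCH_CUDA_ALLOC_CONF when the CUDA caching allocator is first initialized, so setting it inside __init__, after CUDA has already been touched, may be too late to take effect. A minimal sketch of the safer ordering, setting the variable at module import time before any CUDA work:

import os

# Must be set before the CUDA caching allocator initializes,
# i.e. before the first CUDA allocation (ideally before importing torch).
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

# Pick the device once and reuse it everywhere.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")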
@@ -315,22 +321,31 @@ class AutoAnnotator:
 
         # Process texts in batches
         processed_data = []
+        batch_size = 8  # Reduced batch size to prevent OOM errors
 
-        for i …
-        …
-        …
-        …
-        …
+        for i in range(0, len(data), batch_size):
+            batch_texts = data[i:i + batch_size]
+            batch_with_prompts = []
+
+            # Add prompts to batch texts
+            for text in batch_texts:
+                if isinstance(prompt, list):
+                    prompt_text = random.choice(prompt)
+                else:
+                    prompt_text = prompt
+                text_with_prompt = f"{prompt_text}\n{text}" if prompt_text else text
+                batch_with_prompts.append(text_with_prompt)
 
-        # …
-        …
+            # Process batch
+            batch_results = batch_annotate_text(self.model, batch_with_prompts, labels, threshold, nested_ner)
+            processed_data.extend(batch_results)
 
-        # …
-        …
-        …
+            # Clear CUDA cache after each batch
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
             # Update progress
-            self.stat["current"] = i + …
+            self.stat["current"] = min(i + batch_size, len(data))
 
         self.annotated_data = processed_data
         return self.annotated_data
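For context, batch_annotate_text is defined earlier in app.py (its signature is partially visible in the first hunk header). A hedged sketch of what such a helper might look like, assuming the gliner package's batch_predict_entities method and a one-dict-per-text output shape; the app's actual implementation may differ:

from typing import Any, Dict, List

from gliner import GLiNER

def batch_annotate_text(
    model: GLiNER,
    texts: List[str],
    labels: List[str],
    threshold: float = 0.5,
    nested_ner: bool = False,
) -> List[Dict[str, Any]]:
    # flat_ner=True forbids overlapping spans; nested NER allows them.
    predictions = model.batch_predict_entities(
        texts, labels, flat_ner=not nested_ner, threshold=threshold
    )
    return [
        {"text": text, "entities": entities}
        for text, entities in zip(texts, predictions)
    ]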
@@ -339,7 +354,7 @@ class AutoAnnotator:
 annotator = None
 sentences = []
 
-def process_text_for_gliner(text: str, max_tokens: int = …
+def process_text_for_gliner(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
     """
     Process text for GLiNER by splitting long texts into overlapping chunks.
     Preserves sentence boundaries and context when possible.
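The body of process_text_for_gliner is unchanged, so the diff only shows its new signature. As a rough sketch of the overlapping-chunk idea its docstring describes (hypothetical helper name, whitespace tokens as a stand-in for real tokenization, and no sentence-boundary handling):

from typing import List

def chunk_with_overlap(text: str, max_tokens: int = 256, overlap: int = 32) -> List[str]:
    # Whitespace "tokens" as a rough proxy for model tokens.
    tokens = text.split()
    if len(tokens) <= max_tokens:
        return [text]
    chunks = []
    step = max_tokens - overlap  # each chunk re-reads `overlap` tokens of context
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + max_tokens]))
        if start + max_tokens >= len(tokens):
            break
    return chunks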
@@ -442,7 +457,7 @@ def create_hf_repo(repo_name: str, repo_type: str = "dataset", private: bool = False
             exist_ok=True,
             token=HF_TOKEN
         )
-        return
+        return repo_name
     except Exception as e:
         raise Exception(f"Error creating repository: {str(e)}")
 
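Returning repo_name instead of None lets the caller chain repository creation into the upload step that this commit fixes. A sketch of that flow with huggingface_hub's upload_file; the repo id below is a placeholder:

from huggingface_hub import upload_file

repo_id = create_hf_repo("my-user/annotated-data")  # placeholder repo id
upload_file(
    path_or_fileobj="data/annotated_data.json",  # local file to push
    path_in_repo="annotated_data.json",          # destination inside the repo
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN,  # same token used when creating the repo
)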
data/annotated_data.json
CHANGED
The diff for this file is too large to render. See raw diff.