# AMWAL_ArFinNER / amwal.py
# Muhsabrys's picture
# Update amwal.py
# 345c58c verified
import re
import os
import spacy
from huggingface_hub import snapshot_download
# -----------------------
# Arabic normalization
# -----------------------
# Characters deleted outright: tanween/harakat/shadda/sukun (U+064B-U+0652),
# dagger alef (U+0670) and tatweel/kashida (U+0640).
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one letter foldings, applied in a single pass with str.translate.
# None of the replacement letters is itself a key, so a single pass gives the
# same result as applying the substitutions one after another.
_LETTER_FOLDS = str.maketrans(
    {
        # Alef carrying a hamza or madda -> bare alef
        "إ": "ا",
        "أ": "ا",
        "آ": "ا",
        # Hamza seated on waw / yeh -> standalone hamza
        "ؤ": "ء",
        "ئ": "ء",
        # Teh marbuta -> heh
        "ة": "ه",
        # Alef maksura -> yeh
        "ى": "ي",
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic orthographic variation folded away.

    Two steps are applied:

    1. Diacritics, the dagger alef and the tatweel are deleted.
    2. Hamza-carrying alefs become a bare alef, hamza on waw/yeh becomes a
       standalone hamza, teh marbuta becomes heh, and alef maksura becomes
       yeh.

    All other characters are left untouched. Because step 1 deletes
    characters, the result can be shorter than the input, so character
    offsets into the result do not necessarily line up with the input.
    """
    without_marks = _DIACRITICS.sub("", text)
    return without_marks.translate(_LETTER_FOLDS)
# -----------------------
# Loader
# -----------------------
# Most recently loaded pipeline; kept as a module attribute for backward
# compatibility with code that inspects it.
_MODEL = None

# Loaded pipelines keyed by where they came from, so that asking for a
# different repo or local directory does not silently return a stale model.
_MODEL_CACHE = {}


def load_ner(
    repo_id="Muhsabrys/AMWAL_ArFinNER",
    local_path=None,
):
    """Load the AMWAL NER pipeline and return a tagging function.

    The spaCy model is read from ``<base>/spacy_model/model-best`` where
    ``<base>`` is:

    - ``local_path`` when it is given (development / testing), or
    - the directory returned by ``snapshot_download(repo_id=repo_id)``
      (the default, which fetches from Hugging Face).

    Each distinct source is loaded once and then reused on later calls.

    Args:
        repo_id: Hugging Face repository to download when ``local_path`` is
            not given.
        local_path: Optional directory containing ``spacy_model/model-best``.
            When set, ``repo_id`` is ignored and nothing is downloaded.

    Returns:
        A function ``ner(text)`` that normalizes ``text`` with
        ``normalize_arabic``, runs the pipeline on the normalized text and
        returns a dict with keys ``"raw_text"``, ``"normalized_text"`` and
        ``"entities"``. Each entity is a dict with ``"text"``, ``"label"``,
        ``"start"`` and ``"end"``.

        NOTE: ``"start"``/``"end"`` are character offsets into
        ``"normalized_text"``. Normalization deletes characters, so they do
        not necessarily index the same span in ``"raw_text"``.
    """
    global _MODEL

    if local_path is not None:
        cache_key = ("local", os.fspath(local_path))
    else:
        cache_key = ("hub", repo_id)

    model = _MODEL_CACHE.get(cache_key)
    if model is None:
        if local_path is not None:
            base_dir = local_path
        else:
            base_dir = snapshot_download(repo_id=repo_id)
        model = spacy.load(os.path.join(base_dir, "spacy_model", "model-best"))
        _MODEL_CACHE[cache_key] = model
    _MODEL = model

    def ner(text: str):
        """Tag ``text`` and return the raw text, normalized text and entities."""
        text_norm = normalize_arabic(text)
        # Bind to the pipeline selected for this load_ner() call rather than
        # the module global, so a later load of another model cannot change
        # what this function runs.
        doc = model(text_norm)
        entities = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]
        return {
            "raw_text": text,
            "normalized_text": text_norm,
            "entities": entities,
        }

    return ner