| | import re |
| | import os |
| | import spacy |
| | from huggingface_hub import snapshot_download |
| |
|
| | |
| | |
| | |
| |
|
# Marks removed outright before tagging: tanween/harakat/shadda/sukun
# (U+064B-U+0652), superscript alef (U+0670) and tatweel/kashida (U+0640).
_DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0640]")

# One-to-one letter folds applied after the marks are stripped.  Built once at
# import time so normalization is a single C-level pass instead of one regex
# substitution per letter family.
_LETTER_FOLDS = str.maketrans(
    {
        # Alef carrying a hamza or madda -> bare alef.
        "إ": "ا",
        "أ": "ا",
        "آ": "ا",
        # Hamza seated on waw / yeh -> standalone hamza.
        "ؤ": "ء",
        "ئ": "ء",
        # Teh marbuta -> heh.
        "ة": "ه",
        # Alef maksura -> yeh.
        "ى": "ي",
    }
)


def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic orthographic variation folded away.

    Two steps are applied, in this order:

    1. Every character matched by ``_DIACRITICS`` is deleted.
    2. Alef variants, seated hamzas, teh marbuta and alef maksura are mapped
       to their folded forms via ``_LETTER_FOLDS``.

    All other characters (including non-Arabic text) pass through unchanged.
    Because step 1 deletes characters, the result can be shorter than the
    input, so character offsets into the result do not generally line up with
    offsets into *text*.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    return _DIACRITICS.sub("", text).translate(_LETTER_FOLDS)
| |
|
| | |
| | |
| | |
# Most recently loaded spaCy pipeline; stays None until load_ner() is called.
_MODEL = None

# ("local", path) or ("hub", repo_id) describing where load_ner() loaded
# _MODEL from.  Stays None when _MODEL was never loaded by load_ner() -- for
# example when it was assigned from outside this function -- and in that case
# the existing pipeline is reused as-is.
_MODEL_SOURCE = None


def load_ner(
    repo_id="Muhsabrys/AMWAL_ArFinNER",
    local_path=None,
):
    """Load the AMWAL NER pipeline and return a tagging function.

    The pipeline is read from ``<root>/spacy_model/model-best`` where
    ``<root>`` is:

    - ``local_path`` when it is given (development / testing), or
    - the directory returned by ``snapshot_download(repo_id=repo_id)``
      otherwise (the default).

    The loaded pipeline is cached at module level, so repeated calls with the
    same source do not reload it.  Requesting a *different* source than the
    cached one reloads the pipeline instead of silently handing back the
    previously loaded model.  Taggers returned by earlier calls keep using the
    pipeline they were created with.

    Args:
        repo_id: Hugging Face repository to fetch when ``local_path`` is None.
        local_path: Directory containing ``spacy_model/model-best``; when not
            None it takes precedence over ``repo_id``.

    Returns:
        A function ``ner(text)`` that returns a dict with keys ``raw_text``,
        ``normalized_text`` and ``entities``.
    """
    global _MODEL, _MODEL_SOURCE

    if local_path is not None:
        requested = ("local", local_path)
    else:
        requested = ("hub", repo_id)

    # Reload only when load_ner() itself loaded the cached pipeline from
    # somewhere else; a pipeline of unknown origin is left untouched.
    cached_from_elsewhere = (
        _MODEL_SOURCE is not None and _MODEL_SOURCE != requested
    )
    if _MODEL is None or cached_from_elsewhere:
        if local_path is not None:
            root = local_path
        else:
            root = snapshot_download(repo_id=repo_id)
        _MODEL = spacy.load(os.path.join(root, "spacy_model", "model-best"))
        _MODEL_SOURCE = requested

    # Bind the pipeline into the closure so a later reload for a different
    # source cannot change what this particular tagger runs.
    nlp = _MODEL

    def ner(text: str):
        """Tag *text* and return the entities found in its normalized form.

        The text is passed through ``normalize_arabic`` before tagging.  Each
        entity's ``text``, ``start`` and ``end`` therefore refer to
        ``normalized_text``; ``start``/``end`` are character offsets into that
        string, not into ``raw_text``.
        """
        text_norm = normalize_arabic(text)
        doc = nlp(text_norm)

        return {
            "raw_text": text,
            "normalized_text": text_norm,
            "entities": [
                {
                    "text": ent.text,
                    "label": ent.label_,
                    "start": ent.start_char,
                    "end": ent.end_char,
                }
                for ent in doc.ents
            ],
        }

    return ner
| |
|