angelLino
/

nllb-es-agr-V2

+import re
+import sys
+import typing as tp
+import unicodedata
+import torch
+from sacremoses import MosesPunctNormalizer
+from sentence_splitter import SentenceSplitter
+from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
# Language codes for the two sides of the translation pair, in the
# `<iso639-3>_<script>` form (presumably the tags the NLLB tokenizer
# expects -- confirm against the model's vocabulary).
L1: str = "spa_Latn"
L2: str = "agr_Latn"

# Human-readable label -> language code.
# NOTE(review): the Awajun label ends with a trailing space; it is kept
# exactly as-is because the key is runtime text that other code may match on.
LANGUAGES: tp.Dict[str, str] = {"Spanish | spa": L1, "Awajun | agr ": L2}
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    """Build a function that substitutes every non-printing character.

    A character counts as non-printing when its Unicode general category is
    one of the "Other" categories (the equivalent of ``\\p{C}`` in Perl, see
    https://www.unicode.org/reports/tr44/#General_Category_Values).

    The translation table covers every code point up to ``sys.maxunicode``
    and is built once, when this factory is called; the returned function
    only performs a single ``str.translate`` pass.

    Args:
        replace_by: Text each non-printing character is replaced with.

    Returns:
        A callable taking a string and returning the cleaned string.
    """
    other_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    # Map code point -> replacement for every character in an "Other" category.
    translation_table = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in other_categories:
            translation_table[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char
class TextPreprocessor:
    """
    Callable text cleaner that mimics the preprocessing made for the NLLB model.

    Applied in order: Moses punctuation normalization, replacement of
    non-printing characters by a space, then Unicode NFKC normalization.

    This code is adapted from the Stopes repo of the NLLB team:
    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
    """

    def __init__(self, lang="en"):
        """Set up the punctuation normalizer for `lang` and the character replacer."""
        normalizer = MosesPunctNormalizer(lang=lang)
        # Swap each (pattern, replacement) pair for one holding a compiled
        # regex (presumably so normalize() can reuse the compiled patterns).
        compiled_substitutions = []
        for pattern, replacement in normalizer.substitutions:
            compiled_substitutions.append((re.compile(pattern), replacement))
        normalizer.substitutions = compiled_substitutions

        self.mpn = normalizer
        self.replace_nonprint = get_non_printing_char_replacer(" ")

    def __call__(self, text: str) -> str:
        """Return `text` after the three cleaning steps."""
        normalized_punct = self.mpn.normalize(text)
        printable_only = self.replace_nonprint(normalized_punct)
        # NFKC folds compatibility characters,
        # e.g. replaces 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 by Francesca
        return unicodedata.normalize("NFKC", printable_only)
def sentenize_with_fillers(text: str, splitter, fix_double_space: bool = True, ignore_errors: bool = False):
    """Apply a sentence splitter and return the sentences and all separators before and after them.

    Args:
        text: The text to split.
        splitter: Any object exposing ``split(text)`` that returns the
            sentences in order of appearance (presumably a
            ``SentenceSplitter`` -- confirm against the caller).
        fix_double_space: When true, runs of spaces in ``text`` are collapsed
            to a single space before splitting.
        ignore_errors: When true, a sentence that cannot be located in the
            text does not raise; its position is guessed as one character
            past the current offset (best effort).

    Returns:
        A ``(sentences, fillers)`` pair. ``sentences`` is whatever
        ``splitter.split`` returned; ``fillers`` holds the text found before
        each sentence plus the text after the last one, so it always has one
        more entry than ``sentences``.

    Raises:
        AssertionError: If a sentence is not found in the text and
            ``ignore_errors`` is false.
    """
    if fix_double_space:
        text = re.sub(" +", " ", text)
    sentences = splitter.split(text)
    fillers = []
    i = 0  # offset in `text` just past the previously located sentence
    for sentence in sentences:
        start_idx = text.find(sentence, i)
        if start_idx == -1:
            if not ignore_errors:
                # Raised explicitly rather than via `assert`, which is
                # stripped under `python -O` and would let the -1 index
                # silently corrupt the fillers.
                raise AssertionError(f"sent not found after {i}: `{sentence}`")
            # Best effort: pretend the sentence starts one character further.
            start_idx = i + 1
        fillers.append(text[i:start_idx])
        i = start_idx + len(sentence)
    fillers.append(text[i:])
    return sentences, fillers