|
|
import re |
|
|
import sys |
|
|
import typing as tp |
|
|
import unicodedata |
|
|
|
|
|
import torch |
|
|
from sacremoses import MosesPunctNormalizer |
|
|
from sentence_splitter import SentenceSplitter |
|
|
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer |
|
|
|
|
|
# FLORES-200 / NLLB language codes (script-qualified) fed to the tokenizer.
L1 = "spa_Latn"  # Spanish, Latin script


L2 = "agr_Latn"  # Awajun (Aguaruna), Latin script


# Human-readable UI label -> NLLB language code.
LANGUAGES = {


    "Spanish | spa": L1,


    # NOTE(review): this key has a trailing space after "agr" — looks
    # accidental, but it is a runtime dict key; confirm against the UI
    # code that looks labels up here before removing it.
    "Awajun | agr ": L2,


}
|
|
|
|
|
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    """Build a function that replaces every non-printing character with *replace_by*.

    The entire Unicode range is scanned once up front, collecting every code
    point whose category is a control/format/surrogate/private-use/unassigned
    ("C*") class into a translation table, so the returned callable is a
    single C-level ``str.translate`` pass per line.
    """
    translation_table = {}
    # "C" never actually occurs (categories are two-letter codes), but it is
    # kept in the set for parity with the upstream Stopes implementation.
    hidden_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
    for codepoint in range(sys.maxunicode + 1):
        if unicodedata.category(chr(codepoint)) in hidden_categories:
            translation_table[codepoint] = replace_by

    def replace_non_printing_char(line) -> str:
        """Swap each non-printing character in *line* for the replacement."""
        return line.translate(translation_table)

    return replace_non_printing_char
|
|
|
|
|
class TextPreprocessor:
    """
    Replicate the text cleaning applied when training the NLLB model.

    Adapted from the Stopes repo of the NLLB team:
    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
    """

    def __init__(self, lang="en"):
        # Moses punctuation normalizer for the given language; its regex
        # substitutions are pre-compiled here so each normalize() call
        # avoids recompiling the patterns.
        self.mpn = MosesPunctNormalizer(lang=lang)
        compiled = []
        for pattern, replacement in self.mpn.substitutions:
            compiled.append((re.compile(pattern), replacement))
        self.mpn.substitutions = compiled
        # Non-printing characters get collapsed into plain spaces.
        self.replace_nonprint = get_non_printing_char_replacer(" ")

    def __call__(self, text: str) -> str:
        """Return *text* with punctuation normalized, non-printing
        characters replaced, and Unicode canonicalized to NFKC."""
        cleaned = self.mpn.normalize(text)
        cleaned = self.replace_nonprint(cleaned)
        # NFKC folds compatibility forms, e.g. decomposed accents, into
        # the composed representation the tokenizer expects.
        return unicodedata.normalize("NFKC", cleaned)
|
|
|
|
|
def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
    """Apply a sentence splitter and return the sentences and all separators before and after them.

    Returns ``(sentences, fillers)`` where ``len(fillers) == len(sentences) + 1``:
    each filler is the text between consecutive sentences (plus leading and
    trailing remainders), so ``"".join(interleaved)`` reconstructs *text*.
    """
    if fix_double_space:
        # Collapse runs of spaces so the splitter output can be re-located
        # in the text with an exact substring search.
        text = re.sub(" +", " ", text)

    sentences = splitter.split(text)
    fillers = []
    cursor = 0
    for sent in sentences:
        pos = text.find(sent, cursor)
        if pos == -1 and ignore_errors:
            # Best effort: pretend the sentence starts one char past the
            # cursor so the reconstruction keeps moving forward.
            pos = cursor + 1
        # NOTE: stripped under -O; guards against a splitter that rewrites
        # its sentences so they no longer appear verbatim in the text.
        assert pos != -1, f"sent not found after {cursor}: `{sent}`"
        fillers.append(text[cursor:pos])
        cursor = pos + len(sent)

    # Whatever trails the final sentence (possibly empty).
    fillers.append(text[cursor:])
    return sentences, fillers