# nllb-es-agr-V2 / translation.py
# Hugging Face page metadata (not code): uploaded by angelLino,
# commit 2a0f456 (verified), "Create translation.py".
import re
import sys
import typing as tp
import unicodedata
import torch
from sacremoses import MosesPunctNormalizer
from sentence_splitter import SentenceSplitter
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
# NLLB (Flores-200) language codes for the two supported directions.
L1 = "spa_Latn"  # Spanish
L2 = "agr_Latn"  # Awajun (Aguaruna)

# Maps the human-readable UI label to its NLLB language code.
# NOTE: the Awajun label previously carried a stray trailing space
# ("Awajun | agr "), inconsistent with the Spanish label; normalized here.
LANGUAGES = {
    "Spanish | spa": L1,
    "Awajun | agr": L2,
}
def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    """Build a callable that replaces every non-printing character.

    All characters in the Unicode "Other" categories (Cc control, Cf format,
    Cs surrogate, Co private-use, Cn unassigned — Perl's ``p{C}`` class) are
    mapped to ``replace_by``.
    See https://www.unicode.org/reports/tr44/#General_Category_Values

    Note: building the table scans the full Unicode range, so the factory is
    comparatively expensive; the returned callable is a single C-level
    ``str.translate`` pass and is cheap to apply.
    """
    # `unicodedata.category` always returns a two-letter code, so the
    # original membership test against a bare "C" was dead; testing the
    # first letter covers exactly the same five categories.
    non_printable_map = {
        ord(c): replace_by
        for c in map(chr, range(sys.maxunicode + 1))
        if unicodedata.category(c).startswith("C")
    }

    def replace_non_printing_char(line: str) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char
class TextPreprocessor:
    """
    Mimic the text preprocessing made for the NLLB model.
    This code is adapted from the Stopes repo of the NLLB team:
    https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
    """

    def __init__(self, lang="en"):
        self.mpn = MosesPunctNormalizer(lang=lang)
        # Pre-compile the normalizer's substitution patterns so every call
        # works with compiled regexes rather than raw pattern strings.
        self.mpn.substitutions = [
            (re.compile(pattern), replacement)
            for pattern, replacement in self.mpn.substitutions
        ]
        self.replace_nonprint = get_non_printing_char_replacer(" ")

    def __call__(self, text: str) -> str:
        """Normalize punctuation, drop non-printing chars, apply NFKC."""
        normalized = self.mpn.normalize(text)
        normalized = self.replace_nonprint(normalized)
        # NFKC folds compatibility characters — e.g. "Francesca" written in
        # fancy fraktur Unicode letters — back to their plain forms.
        return unicodedata.normalize("NFKC", normalized)
def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
    """Apply a sentence splitter and return the sentences and all separators before and after them.

    Args:
        text: Input text to split.
        splitter: Object exposing ``split(text) -> list[str]`` (e.g. a
            ``sentence_splitter.SentenceSplitter``).
        fix_double_space: Collapse runs of spaces to one space first, so the
            splitter's output can be located verbatim inside ``text``.
        ignore_errors: If True, sentences that cannot be found in ``text``
            are skipped past (best effort) instead of raising.

    Returns:
        ``(sentences, fillers)`` with ``len(fillers) == len(sentences) + 1``;
        when every sentence is found, interleaving filler/sentence/filler/...
        reconstructs ``text`` exactly.

    Raises:
        AssertionError: a sentence cannot be located and ``ignore_errors``
            is False.
    """
    if fix_double_space:
        text = re.sub(" +", " ", text)
    sentences = splitter.split(text)
    fillers = []
    i = 0
    for sentence in sentences:
        start_idx = text.find(sentence, i)
        if start_idx == -1:
            if not ignore_errors:
                # Raise explicitly rather than via a bare `assert` so the
                # check still fires under `python -O` (asserts are stripped).
                raise AssertionError(f"sent not found after {i}: `{sentence}`")
            # Best-effort recovery: pretend the sentence starts one
            # character further along and continue.
            start_idx = i + 1
        fillers.append(text[i:start_idx])
        i = start_idx + len(sentence)
    # Trailing separator after the last sentence (may be empty).
    fillers.append(text[i:])
    return sentences, fillers