angelLino committed on
Commit
2a0f456
Β·
verified Β·
1 Parent(s): 794d524

Create translation.py

Browse files

Added a script that enables translation via a pipeline.

Files changed (1) hide show
  1. translation.py +69 -0
translation.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import typing as tp
4
+ import unicodedata
5
+
6
+ import torch
7
+ from sacremoses import MosesPunctNormalizer
8
+ from sentence_splitter import SentenceSplitter
9
+ from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
10
+
11
+ L1 = "spa_Latn"
12
+ L2 = "agr_Latn"
13
+ LANGUAGES = {
14
+ "Spanish | spa": L1,
15
+ "Awajun | agr ": L2,
16
+ }
17
+
18
+ def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
19
+ non_printable_map = {
20
+ ord(c): replace_by
21
+ for c in (chr(i) for i in range(sys.maxunicode + 1))
22
+ # same as \p{C} in perl
23
+ # see https://www.unicode.org/reports/tr44/#General_Category_Values
24
+ if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
25
+ }
26
+
27
+ def replace_non_printing_char(line) -> str:
28
+ return line.translate(non_printable_map)
29
+
30
+ return replace_non_printing_char
31
+
32
+ class TextPreprocessor:
33
+ """
34
+ Mimic the text preprocessing made for the NLLB model.
35
+ This code is adapted from the Stopes repo of the NLLB team:
36
+ https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214
37
+ """
38
+
39
+ def __init__(self, lang="en"):
40
+ self.mpn = MosesPunctNormalizer(lang=lang)
41
+ self.mpn.substitutions = [
42
+ (re.compile(r), sub) for r, sub in self.mpn.substitutions
43
+ ]
44
+ self.replace_nonprint = get_non_printing_char_replacer(" ")
45
+
46
+ def __call__(self, text: str) -> str:
47
+ clean = self.mpn.normalize(text)
48
+ clean = self.replace_nonprint(clean)
49
+ # replace π“•π”―π”žπ”«π” π”’π”°π” π”ž by Francesca
50
+ clean = unicodedata.normalize("NFKC", clean)
51
+ return clean
52
+
53
+ def sentenize_with_fillers(text, splitter, fix_double_space=True, ignore_errors=False):
54
+ """Apply a sentence splitter and return the sentences and all separators before and after them"""
55
+ if fix_double_space:
56
+ text = re.sub(" +", " ", text)
57
+ sentences = splitter.split(text)
58
+ fillers = []
59
+ i = 0
60
+ for sentence in sentences:
61
+ start_idx = text.find(sentence, i)
62
+ if ignore_errors and start_idx == -1:
63
+ # print(f"sent not found after {i}: `{sentence}`")
64
+ start_idx = i + 1
65
+ assert start_idx != -1, f"sent not found after {i}: `{sentence}`"
66
+ fillers.append(text[i:start_idx])
67
+ i = start_idx + len(sentence)
68
+ fillers.append(text[i:])
69
+ return sentences, fillers