Upload preprocess.sh with huggingface_hub
Browse files- preprocess.sh +58 -0
preprocess.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
#
|
| 3 |
+
# USAGE preprocess.sh langid spmodel < input > output
|
| 4 |
+
#
|
| 5 |
+
# replace SPMENCODE with your own setup!
|
| 6 |
+
#
|
| 7 |
+
# CHANGES
|
| 8 |
+
#
|
| 9 |
+
# * issue with perl code that removes control characters
|
| 10 |
+
# unicode property Other = \p{C}) seems to remove
|
| 11 |
+
# newline characters as well --> add negative lookahead
|
| 12 |
+
# to avoid removing newline characters!
|
| 13 |
+
#
|
| 14 |
+
|
| 15 |
+
SPMENCODE=`which spm_encode || echo "${PWD}/tools/marian-dev/build/spm_encode"`
|
| 16 |
+
|
| 17 |
+
## simple pre-processing steps adapted from Moses tools
|
| 18 |
+
|
| 19 |
+
sed -e 's/,/,/g' \
|
| 20 |
+
-e 's/。 */. /g' \
|
| 21 |
+
-e 's/、/,/g' \
|
| 22 |
+
-e 's/”/"/g' \
|
| 23 |
+
-e 's/“/"/g' \
|
| 24 |
+
-e 's/∶/:/g' \
|
| 25 |
+
-e 's/:/:/g' \
|
| 26 |
+
-e 's/?/\?/g' \
|
| 27 |
+
-e 's/《/"/g' \
|
| 28 |
+
-e 's/》/"/g' \
|
| 29 |
+
-e 's/)/\)/g' \
|
| 30 |
+
-e 's/!/\!/g' \
|
| 31 |
+
-e 's/(/\(/g' \
|
| 32 |
+
-e 's/;/;/g' \
|
| 33 |
+
-e 's/1/"/g' \
|
| 34 |
+
-e 's/」/"/g' \
|
| 35 |
+
-e 's/「/"/g' \
|
| 36 |
+
-e 's/0/0/g' \
|
| 37 |
+
-e 's/3/3/g' \
|
| 38 |
+
-e 's/2/2/g' \
|
| 39 |
+
-e 's/5/5/g' \
|
| 40 |
+
-e 's/6/6/g' \
|
| 41 |
+
-e 's/9/9/g' \
|
| 42 |
+
-e 's/7/7/g' \
|
| 43 |
+
-e 's/8/8/g' \
|
| 44 |
+
-e 's/4/4/g' \
|
| 45 |
+
-e 's/. */. /g' \
|
| 46 |
+
-e 's/~/\~/g' \
|
| 47 |
+
-e "s/’/\'/g" \
|
| 48 |
+
-e 's/…/\.\.\./g' \
|
| 49 |
+
-e 's/━/\-/g' \
|
| 50 |
+
-e 's/〈/\</g' \
|
| 51 |
+
-e 's/〉/\>/g' \
|
| 52 |
+
-e 's/【/\[/g' \
|
| 53 |
+
-e 's/】/\]/g' \
|
| 54 |
+
-e 's/%/\%/g' |
|
| 55 |
+
perl -C -pe 's/(?!\n)\p{C}/ /g;' |
|
| 56 |
+
perl -CIOE -pe 's/[\x{2060}\x{200B}\x{feff}]//g' |\
|
| 57 |
+
sed 's/ */ /g;s/^ *//g;s/ *$//g'
|
| 58 |
+
|