import re
import nltk
import logging
from typing import Set, Dict, Optional
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from TurkishStemmer import TurkishStemmer
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import unicodedata
import warnings

# Suppress BeautifulSoup's warning that short inputs resemble a URL or filename
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

# Fetch the NLTK resources used below; quiet=True suppresses download output
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
except Exception as e:
    print(f"Warning: Could not download NLTK data: {e}")

logging.basicConfig(level=logging.WARNING)


class TextPreprocessor:
    """
    A multilingual text preprocessor for cleaning and normalization.

    Supports several languages and provides configurable cleaning operations
    such as HTML stripping, URL/email removal, stop word filtering, and stemming.
    """

    SUPPORTED_LANGUAGES = {'en', 'es', 'fr', 'it', 'pt', 'ru', 'tr'}

    # English contractions mapped to their expanded forms (applied case-insensitively)
    CONTRACTIONS = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot",
        "couldn't": "could not", "didn't": "did not", "doesn't": "does not",
        "don't": "do not", "hadn't": "had not", "hasn't": "has not",
        "haven't": "have not", "he'd": "he would", "he'll": "he will",
        "he's": "he is", "i'd": "i would", "i'll": "i will", "i'm": "i am",
        "i've": "i have", "isn't": "is not", "it's": "it is",
        "let's": "let us", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "wasn't": "was not",
        "we'd": "we would", "we're": "we are", "we've": "we have",
        "weren't": "were not", "what's": "what is", "where's": "where is",
        "who's": "who is", "won't": "will not", "wouldn't": "would not",
        "you'd": "you would", "you'll": "you will", "you're": "you are",
        "you've": "you have"
    }

    def __init__(self, languages: Optional[Set[str]] = None):
        """
        Initialize the text preprocessor with the specified languages.

        Args:
            languages: Set of language codes to support. If None, all supported languages are used.
        """
        self.languages = languages or self.SUPPORTED_LANGUAGES
        self._initialize_resources()

    def _initialize_resources(self):
        """Initialize language-specific resources such as stop words and stemmers."""
        self.logger = logging.getLogger(__name__)

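        # Load NLTK stop word lists for the requested languages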
        self.stop_words = {}
        nltk_langs = {
            'en': 'english', 'es': 'spanish', 'fr': 'french',
            'it': 'italian', 'pt': 'portuguese', 'ru': 'russian'
        }

        for lang, nltk_name in nltk_langs.items():
            if lang in self.languages:
                try:
                    self.stop_words[lang] = set(stopwords.words(nltk_name))
                except Exception as e:
                    self.logger.warning(f"Could not load stop words for {lang}: {e}")
                    self.stop_words[lang] = set()

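        # Turkish stop words are defined inline rather than loaded from NLTK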
        if 'tr' in self.languages:
            self.stop_words['tr'] = {
                'acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç',
                'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa',
                'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç',
                'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl',
                'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o',
                'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'
            }

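        # Snowball stemmers for the languages it supports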
        self.stemmers = {}
        for lang, name in [
            ('en', 'english'), ('es', 'spanish'), ('fr', 'french'),
            ('it', 'italian'), ('pt', 'portuguese'), ('ru', 'russian')
        ]:
            if lang in self.languages:
                self.stemmers[lang] = SnowballStemmer(name)

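        # Snowball has no Turkish support, so use the dedicated TurkishStemmer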
        if 'tr' in self.languages:
            self.stemmers['tr'] = TurkishStemmer()

    def remove_html(self, text: str) -> str:
        """Remove HTML tags from text."""
        return BeautifulSoup(text, "html.parser").get_text()

    def expand_contractions(self, text: str) -> str:
        """Expand common English contractions in text."""
        for contraction, expansion in self.CONTRACTIONS.items():
            text = re.sub(rf'\b{re.escape(contraction)}\b', expansion, text,
                          flags=re.IGNORECASE)
        return text

    def remove_accents(self, text: str) -> str:
        """Remove accents from text while preserving base characters."""
        return ''.join(c for c in unicodedata.normalize('NFKD', text)
                       if not unicodedata.combining(c))

    def clean_text(self, text: str, lang: str = 'en',
                   remove_stops: bool = True,
                   remove_numbers: bool = True,
                   remove_urls: bool = True,
                   remove_emails: bool = True,
                   remove_mentions: bool = True,
                   remove_hashtags: bool = True,
                   expand_contractions: bool = True,
                   remove_accents: bool = False,
                   min_word_length: int = 2) -> str:
        """
        Clean and normalize text with configurable options.

        Args:
            text: Input text to clean
            lang: Language code of the text
            remove_stops: Whether to remove stop words
            remove_numbers: Whether to remove numbers
            remove_urls: Whether to remove URLs
            remove_emails: Whether to remove email addresses
            remove_mentions: Whether to remove social media mentions
            remove_hashtags: Whether to remove hashtags
            expand_contractions: Whether to expand contractions (English only)
            remove_accents: Whether to remove accents from characters
            min_word_length: Minimum length of words to keep

        Returns:
            Cleaned text string
        """
        try:
            # Normalize: coerce to string, lowercase, and strip surrounding whitespace
            text = str(text).lower().strip()

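            # Strip HTML tags when the text appears to contain markup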
            if '<' in text and '>' in text:
                text = self.remove_html(text)

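            # Remove URLs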
            if remove_urls:
                text = re.sub(r'http\S+|www\S+', '', text)

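            # Remove email addresses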
            if remove_emails:
                text = re.sub(r'\S+@\S+', '', text)

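            # Remove @mentions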
            if remove_mentions:
                text = re.sub(r'@\w+', '', text)

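            # Remove #hashtags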
            if remove_hashtags:
                text = re.sub(r'#\w+', '', text)

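            # Remove digits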
            if remove_numbers:
                text = re.sub(r'\d+', '', text)

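            # Expand contractions (English only)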
            if lang == 'en' and expand_contractions:
                text = self.expand_contractions(text)

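            # Optionally strip accents, keeping base characters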
            if remove_accents:
                text = self.remove_accents(text)

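            # Remove punctuation and other non-letter characters, using
            # language-appropriate character classes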
            if lang == 'tr':
                text = re.sub(r'[^a-zA-ZçğıöşüÇĞİÖŞÜ\s]', '', text)
            elif lang == 'ru':
                text = re.sub(r'[^а-яА-ЯёЁ\s]', '', text)
            else:
                text = re.sub(r'[^\w\s]', '', text)

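            # Tokenize, falling back to whitespace splitting if NLTK fails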
            try:
                words = word_tokenize(text)
            except Exception as e:
                self.logger.debug(f"Word tokenization failed, falling back to simple split: {e}")
                words = text.split()

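            # Drop stop words for the given language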
            if remove_stops and lang in self.stop_words:
                words = [w for w in words if w not in self.stop_words[lang]]

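            # Keep only words of at least min_word_length characters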
            words = [w for w in words if len(w) >= min_word_length]

            return ' '.join(words)

        except Exception as e:
            self.logger.warning(f"Error in text cleaning: {e}")
            return text

    def stem_text(self, text: str, lang: str = 'en') -> str:
        """
        Apply language-specific stemming to text.

        Args:
            text: Input text to stem
            lang: Language code of the text

        Returns:
            Stemmed text string
        """
        try:
            if lang not in self.stemmers:
                return text

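            # Stem each whitespace-separated token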
            words = text.split()
            stemmed_words = [self.stemmers[lang].stem(word) for word in words]
            return ' '.join(stemmed_words)

        except Exception as e:
            self.logger.warning(f"Error in text stemming: {e}")
            return text

    def preprocess_text(self, text: str, lang: str = 'en',
                        clean_options: Optional[Dict] = None,
                        do_stemming: bool = True) -> str:
| | """ |
| | Complete preprocessing pipeline combining cleaning and stemming. |
| | |
| | Args: |
| | text: Input text to preprocess |
| | lang: Language code of the text |
| | clean_options: Dictionary of options to pass to clean_text |
| | do_stemming: Whether to apply stemming |
| | |
| | Returns: |
| | Preprocessed text string |
| | """ |
        clean_options = clean_options or {}

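        # First pass: clean and normalize the raw text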
        cleaned_text = self.clean_text(text, lang, **clean_options)

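        # Second pass: optional language-specific stemming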
        if do_stemming:
            cleaned_text = self.stem_text(cleaned_text, lang)

        return cleaned_text.strip()


if __name__ == "__main__":
    # Quick demonstration across several of the supported languages
    preprocessor = TextPreprocessor()

    examples = {
        'en': "Here's an example! This is a test text with @mentions and #hashtags http://example.com",
        'es': "¡Hola! Este es un ejemplo de texto en español con números 12345",
        'fr': "Voici un exemple de texte en français avec des accents é è à",
        'tr': "Bu bir Türkçe örnek metindir ve bazı özel karakterler içerir."
    }

    for lang, text in examples.items():
        print(f"\nProcessing {lang} text:")
        print("Original:", text)
        processed = preprocessor.preprocess_text(text, lang)
        print("Processed:", processed)