| | import pandas as pd |
| | import numpy as np |
| | from text_preprocessor import TextPreprocessor |
| | from tqdm import tqdm |
| | import logging |
| | from pathlib import Path |
| | import time |
| |
|
def process_dataset(input_path: str, output_path: str = None, batch_size: int = 1000):
    """
    Process a dataset's 'comment_text' column with TextPreprocessor, in batches.

    Reads `input_path` as CSV, runs ``TextPreprocessor.preprocess_text`` over
    every row of the 'comment_text' column (using the per-row 'lang' column
    when present, defaulting to 'en'), writes the result to `output_path`,
    and prints timing and vocabulary-reduction statistics.

    Args:
        input_path: Path to input CSV file.
        output_path: Path to save processed CSV file. If None, the input name
            with a ``_processed`` suffix is used.
        batch_size: Number of texts per batch (controls progress-bar granularity).

    Raises:
        Exception: any error from reading, processing, or saving is printed
            and re-raised.
    """
    if output_path is None:
        input_path = Path(input_path)
        output_path = input_path.parent / f"{input_path.stem}_processed{input_path.suffix}"

    preprocessor = TextPreprocessor()

    print(f"\nProcessing dataset: {input_path}")
    start_time = time.time()

    try:
        print("Reading dataset...")
        df = pd.read_csv(input_path)
        total_rows = len(df)
        print(f"Total rows: {total_rows:,}")

        # BUG FIX: the loop below overwrites 'comment_text' in place, and the
        # original code later read a nonexistent 'processed_text' column for
        # the vocabulary stats (KeyError) while computing the "before"
        # vocabulary from already-processed text. Snapshot the raw sample
        # texts now so before/after vocabularies can actually be compared.
        sample_size = min(1000, total_rows)
        original_sample = df['comment_text'].head(sample_size).astype(str).tolist()

        print("\nProcessing text...")
        num_batches = (total_rows + batch_size - 1) // batch_size

        # Hoisted out of the loop: these are identical for every row.
        clean_options = {
            'remove_stops': True,
            'remove_numbers': True,
            'remove_urls': True,
            'remove_emails': True,
            'remove_mentions': True,
            'remove_hashtags': True,
            'expand_contractions': True,
            'remove_accents': False,
            'min_word_length': 2
        }
        has_lang = 'lang' in df.columns

        for i in tqdm(range(0, total_rows, batch_size), total=num_batches, desc="Processing batches"):
            batch_end = min(i + batch_size, total_rows)

            for idx in range(i, batch_end):
                text = df.loc[idx, 'comment_text']
                lang = df.loc[idx, 'lang'] if has_lang else 'en'

                # Write the processed text back into the same column.
                df.loc[idx, 'comment_text'] = preprocessor.preprocess_text(
                    text,
                    lang=lang,
                    clean_options=clean_options,
                    do_stemming=True
                )

            # Show a few results from the first batch as a sanity check.
            # BUG FIX: bound by batch_end (rows actually processed), not
            # batch_size — avoids indexing past the end of a tiny dataset.
            if i == 0:
                print("\nSample processing results:")
                for j in range(min(3, batch_end)):
                    print(f"\nProcessed text {j+1}: {df.loc[j, 'comment_text'][:100]}...")

        print(f"\nSaving processed dataset to: {output_path}")
        df.to_csv(output_path, index=False)

        processing_time = time.time() - start_time

        print("\nProcessing Complete!")
        print("-" * 50)
        print(f"Total rows processed: {total_rows:,}")
        print(f"Processing time: {processing_time/60:.2f} minutes")
        if total_rows:  # guard: empty CSV would divide by zero
            print(f"Average time per text: {processing_time/total_rows*1000:.2f} ms")
        print(f"Output file size: {Path(output_path).stat().st_size/1024/1024:.1f} MB")

        print("\nVocabulary Statistics:")
        original_words = set(' '.join(original_sample).split())
        processed_words = set(' '.join(df['comment_text'].head(sample_size).astype(str)).split())
        print(f"Sample unique words (first {sample_size:,} rows):")
        print(f"Before processing: {len(original_words):,}")
        print(f"After processing : {len(processed_words):,}")
        if original_words:  # guard: empty sample would divide by zero
            print(f"Reduction: {(1 - len(processed_words)/len(original_words))*100:.1f}%")

    except Exception as e:
        print(f"\nError processing dataset: {str(e)}")
        raise
| |
|
if __name__ == "__main__":
    # Default paths for the train split; change here to process other files.
    source_csv = "dataset/split/train.csv"
    destination_csv = "dataset/split/train_no_stopwords.csv"

    process_dataset(source_csv, destination_csv)