Spaces:

dbadeev
/

cinematch-ai

Sleeping

File size: 4,038 Bytes

da524e0

# setup_punkt_extraction.py

import pickle
import os
import ast
import sys


def extract_punkt_data_to_files():
    """Извлечение данных из english.pickle в отдельные файлы"""

    # Путь к pickle файлу
    pickle_path = "/root/nltk_data/tokenizers/punkt_tab/english/english.pickle"
    output_dir = "/root/nltk_data/tokenizers/punkt_tab/english"

    try:
        print(f"Loading punkt model from {pickle_path}")

        # Загрузка модели
        with open(pickle_path, 'rb') as f:
            punkt_model = pickle.load(f)

        print(f"Punkt model loaded successfully: {type(punkt_model)}")

        # 1. Извлечение sentence starters
        try:
            if hasattr(punkt_model, '_lang_vars') and punkt_model._lang_vars:
                sent_starters = punkt_model._lang_vars.sent_starters
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(sent_starters))
                print(f"✅ Created sent_starters.txt with {len(sent_starters)} entries")
            else:
                print("⚠️ No sentence starters found, creating default ones")
                default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(default_starters))
        except Exception as e:
            print(f"⚠️ Error extracting sentence starters: {e}")
            # Создаем базовые стартеры
            default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
            with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                f.write('\n'.join(default_starters))

        # 2. Извлечение collocations
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params:
                collocations = punkt_model._params.collocations
                with open(f"{output_dir}/collocations.tab", 'w') as f:
                    for (word1, word2), freq in collocations.items():
                        f.write(f"{word1}\t{word2}\t{freq}\n")
                print(f"✅ Created collocations.tab with {len(collocations)} entries")
            else:
                # Создаем пустой файл
                open(f"{output_dir}/collocations.tab", 'w').close()
                print("✅ Created empty collocations.tab")
        except Exception as e:
            print(f"⚠️ Error extracting collocations: {e}")
            open(f"{output_dir}/collocations.tab", 'w').close()

        # 3. Создание остальных файлов
        try:
            # Abbreviations
            if hasattr(punkt_model, '_params') and hasattr(punkt_model._params, 'abbrev_types'):
                with open(f"{output_dir}/abbrev_types.txt", 'w') as f:
                    f.write('\n'.join(punkt_model._params.abbrev_types))
                print("✅ Created abbrev_types.txt from model")
            else:
                # Создаем пустой файл
                open(f"{output_dir}/abbrev_types.txt", 'w').close()
                print("✅ Created empty abbrev_types.txt")

            # Ortho context (обычно пустой)
            open(f"{output_dir}/ortho_context.tab", 'w').close()
            print("✅ Created empty ortho_context.tab")

        except Exception as e:
            print(f"⚠️ Warning creating additional files: {e}")
            # Создаем пустые файлы на всякий случай
            for filename in ["abbrev_types.txt", "ortho_context.tab"]:
                open(f"{output_dir}/{filename}", 'w').close()

        print("✅ All punkt_tab files created successfully")
        return True

    except Exception as e:
        print(f"❌ Error extracting punkt data: {e}")
        return False


if __name__ == "__main__":
    success = extract_punkt_data_to_files()
    sys.exit(0 if success else 1)