# setup_punkt_extraction.py
import pickle
import sys

def extract_punkt_data_to_files():
    """Extract the data from english.pickle into separate punkt_tab files."""
    # Path to the pickle file
    pickle_path = "/root/nltk_data/tokenizers/punkt_tab/english/english.pickle"
    output_dir = "/root/nltk_data/tokenizers/punkt_tab/english"
    try:
        print(f"Loading punkt model from {pickle_path}")
        # Load the pickled model
        with open(pickle_path, 'rb') as f:
            punkt_model = pickle.load(f)
        print(f"Punkt model loaded successfully: {type(punkt_model)}")

        # 1. Extract sentence starters. NLTK stores these on the
        # PunktParameters object (punkt_model._params), not on _lang_vars.
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params.sent_starters:
                sent_starters = sorted(punkt_model._params.sent_starters)
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(sent_starters))
                print(f"✅ Created sent_starters.txt with {len(sent_starters)} entries")
            else:
                print("⚠️ No sentence starters found, creating default ones")
                default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(default_starters))
        except Exception as e:
            print(f"⚠️ Error extracting sentence starters: {e}")
            # Fall back to a basic starter list
            default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
            with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                f.write('\n'.join(default_starters))
        # 2. Extract collocations. PunktParameters.collocations is a set of
        # (word1, word2) tuples, not a dict, so there are no frequencies to write.
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params.collocations:
                collocations = punkt_model._params.collocations
                with open(f"{output_dir}/collocations.tab", 'w') as f:
                    for word1, word2 in sorted(collocations):
                        f.write(f"{word1}\t{word2}\n")
                print(f"✅ Created collocations.tab with {len(collocations)} entries")
            else:
                # Create an empty file
                open(f"{output_dir}/collocations.tab", 'w').close()
                print("✅ Created empty collocations.tab")
        except Exception as e:
            print(f"⚠️ Error extracting collocations: {e}")
            open(f"{output_dir}/collocations.tab", 'w').close()
        # 3. Create the remaining files
        try:
            # Abbreviations
            if hasattr(punkt_model, '_params') and punkt_model._params.abbrev_types:
                with open(f"{output_dir}/abbrev_types.txt", 'w') as f:
                    f.write('\n'.join(sorted(punkt_model._params.abbrev_types)))
                print("✅ Created abbrev_types.txt from model")
            else:
                # Create an empty file
                open(f"{output_dir}/abbrev_types.txt", 'w').close()
                print("✅ Created empty abbrev_types.txt")
            # Ortho context: maps a word type to its orthographic-context flags
            # (an int). A trained model usually has many entries, so dump them
            # rather than always writing an empty file.
            if hasattr(punkt_model, '_params') and punkt_model._params.ortho_context:
                with open(f"{output_dir}/ortho_context.tab", 'w') as f:
                    for word, flags in sorted(punkt_model._params.ortho_context.items()):
                        f.write(f"{word}\t{flags}\n")
                print("✅ Created ortho_context.tab from model")
            else:
                open(f"{output_dir}/ortho_context.tab", 'w').close()
                print("✅ Created empty ortho_context.tab")
        except Exception as e:
            print(f"⚠️ Warning creating additional files: {e}")
            # Create empty files as a fallback, just in case
            for filename in ["abbrev_types.txt", "ortho_context.tab"]:
                open(f"{output_dir}/{filename}", 'w').close()
| print("✅ All punkt_tab files created successfully") | |
| return True | |
| except Exception as e: | |
| print(f"❌ Error extracting punkt data: {e}") | |
| return False | |
| if __name__ == "__main__": | |
| success = extract_punkt_data_to_files() | |
| sys.exit(0 if success else 1) | |
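
To sanity-check the result, the extracted files can be loaded back through NLTK's own punkt_tab loader. A minimal sketch, assuming NLTK 3.9+ (the release that introduced the punkt_tab format and the PunktTokenizer class) and that /root/nltk_data is on the nltk.data search path:

# verify_punkt_tab.py — sanity check for the extracted punkt_tab files.
# Assumes NLTK 3.9+, which ships PunktTokenizer and the punkt_tab loader.
from nltk.tokenize.punkt import PunktTokenizer

# Resolves tokenizers/punkt_tab/english via the nltk.data search path
tokenizer = PunktTokenizer("english")
print(tokenizer.tokenize("Dr. Smith arrived at 5 p.m. It was raining."))

If abbrev_types.txt was extracted correctly, "Dr." should not trigger a sentence break, so this should print two sentences rather than three.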