# cinematch-ai / setup_punkt_extraction.py
# (HuggingFace page residue from extraction: uploaded by dbadeev,
#  "Upload 25 files", commit 16ce932 verified)
# setup_punkt_extraction.py
import pickle
import os
import ast
import sys
def extract_punkt_data_to_files(
    pickle_path="/root/nltk_data/tokenizers/punkt_tab/english/english.pickle",
    output_dir="/root/nltk_data/tokenizers/punkt_tab/english",
):
    """Extract data from a pickled punkt model into separate punkt_tab files.

    Loads the punkt sentence-tokenizer model at *pickle_path* and writes the
    plain-text resource files (``sent_starters.txt``, ``collocations.tab``,
    ``abbrev_types.txt``, ``ortho_context.tab``) into *output_dir*.  When a
    model attribute is missing or extraction fails, a default or empty file
    is written instead, so the output directory always ends up complete.

    Args:
        pickle_path: Path to the pickled punkt model (english.pickle).
        output_dir: Directory where the extracted files are written
            (assumed to already exist).

    Returns:
        True on success, False if the pickle could not be loaded at all.
    """
    # Fallback sentence starters used when the model carries none.
    default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
    try:
        print(f"Loading punkt model from {pickle_path}")
        # NOTE(review): pickle.load executes arbitrary code from the file —
        # only run this on the trusted NLTK-distributed model.
        with open(pickle_path, 'rb') as f:
            punkt_model = pickle.load(f)
        print(f"Punkt model loaded successfully: {type(punkt_model)}")

        # 1. Extract sentence starters
        try:
            if hasattr(punkt_model, '_lang_vars') and punkt_model._lang_vars:
                sent_starters = punkt_model._lang_vars.sent_starters
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(sent_starters))
                print(f"✅ Created sent_starters.txt with {len(sent_starters)} entries")
            else:
                print("⚠️ No sentence starters found, creating default ones")
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(default_starters))
        except Exception as e:
            print(f"⚠️ Error extracting sentence starters: {e}")
            # Fall back to the defaults so the file always exists.
            with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                f.write('\n'.join(default_starters))

        # 2. Extract collocations
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params:
                collocations = punkt_model._params.collocations
                with open(f"{output_dir}/collocations.tab", 'w') as f:
                    for (word1, word2), freq in collocations.items():
                        f.write(f"{word1}\t{word2}\t{freq}\n")
                print(f"✅ Created collocations.tab with {len(collocations)} entries")
            else:
                # Create an empty file
                open(f"{output_dir}/collocations.tab", 'w').close()
                print("✅ Created empty collocations.tab")
        except Exception as e:
            print(f"⚠️ Error extracting collocations: {e}")
            open(f"{output_dir}/collocations.tab", 'w').close()

        # 3. Create the remaining files
        try:
            # Abbreviations
            if hasattr(punkt_model, '_params') and hasattr(punkt_model._params, 'abbrev_types'):
                with open(f"{output_dir}/abbrev_types.txt", 'w') as f:
                    f.write('\n'.join(punkt_model._params.abbrev_types))
                print("✅ Created abbrev_types.txt from model")
            else:
                # Create an empty file
                open(f"{output_dir}/abbrev_types.txt", 'w').close()
                print("✅ Created empty abbrev_types.txt")
            # Ortho context (usually empty)
            open(f"{output_dir}/ortho_context.tab", 'w').close()
            print("✅ Created empty ortho_context.tab")
        except Exception as e:
            print(f"⚠️ Warning creating additional files: {e}")
            # Create empty files just in case.
            # BUG FIX: the original wrote to the literal path "(unknown)"
            # instead of using the loop variable, so neither file was created.
            for filename in ["abbrev_types.txt", "ortho_context.tab"]:
                open(f"{output_dir}/{filename}", 'w').close()

        print("✅ All punkt_tab files created successfully")
        return True
    except Exception as e:
        print(f"❌ Error extracting punkt data: {e}")
        return False
if __name__ == "__main__":
    # Map the boolean result directly onto the process exit status:
    # 0 on success, 1 on failure.
    sys.exit(0 if extract_punkt_data_to_files() else 1)