Spaces:
Sleeping
Sleeping
File size: 4,038 Bytes
da524e0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | # setup_punkt_extraction.py
import pickle
import os
import ast
import sys
def extract_punkt_data_to_files(
    pickle_path="/root/nltk_data/tokenizers/punkt_tab/english/english.pickle",
    output_dir="/root/nltk_data/tokenizers/punkt_tab/english",
):
    """Extract data from english.pickle into separate punkt_tab files.

    Reads a pickled punkt sentence tokenizer model and writes the
    ``sent_starters.txt``, ``collocations.tab``, ``abbrev_types.txt`` and
    ``ortho_context.tab`` files that the punkt_tab loader expects.

    Args:
        pickle_path: Path to the pickled punkt model (NLTK english.pickle).
        output_dir: Directory where the extracted files are written.

    Returns:
        True on success, False if the model could not be loaded.
    """
    try:
        print(f"Loading punkt model from {pickle_path}")
        # Load the pickled model. NOTE(review): pickle.load on an untrusted
        # file executes arbitrary code — only use with trusted NLTK data.
        with open(pickle_path, 'rb') as f:
            punkt_model = pickle.load(f)
        print(f"Punkt model loaded successfully: {type(punkt_model)}")

        # 1. Extract sentence starters
        try:
            if hasattr(punkt_model, '_lang_vars') and punkt_model._lang_vars:
                sent_starters = punkt_model._lang_vars.sent_starters
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(sent_starters))
                print(f"✅ Created sent_starters.txt with {len(sent_starters)} entries")
            else:
                print("⚠️ No sentence starters found, creating default ones")
                default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(default_starters))
        except Exception as e:
            print(f"⚠️ Error extracting sentence starters: {e}")
            # Fall back to a basic list of starters
            default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
            with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                f.write('\n'.join(default_starters))

        # 2. Extract collocations
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params:
                # assumes collocations maps (word1, word2) -> freq; in some
                # NLTK versions this is a set of pairs — TODO confirm
                collocations = punkt_model._params.collocations
                with open(f"{output_dir}/collocations.tab", 'w') as f:
                    for (word1, word2), freq in collocations.items():
                        f.write(f"{word1}\t{word2}\t{freq}\n")
                print(f"✅ Created collocations.tab with {len(collocations)} entries")
            else:
                # Create an empty file so the loader still finds it
                open(f"{output_dir}/collocations.tab", 'w').close()
                print("✅ Created empty collocations.tab")
        except Exception as e:
            print(f"⚠️ Error extracting collocations: {e}")
            open(f"{output_dir}/collocations.tab", 'w').close()

        # 3. Create the remaining files
        try:
            # Abbreviations
            if hasattr(punkt_model, '_params') and hasattr(punkt_model._params, 'abbrev_types'):
                with open(f"{output_dir}/abbrev_types.txt", 'w') as f:
                    f.write('\n'.join(punkt_model._params.abbrev_types))
                print("✅ Created abbrev_types.txt from model")
            else:
                # Create an empty file
                open(f"{output_dir}/abbrev_types.txt", 'w').close()
                print("✅ Created empty abbrev_types.txt")
            # Ortho context (usually empty)
            open(f"{output_dir}/ortho_context.tab", 'w').close()
            print("✅ Created empty ortho_context.tab")
        except Exception as e:
            print(f"⚠️ Warning creating additional files: {e}")
            # Create empty files just in case.
            # BUG FIX: original wrote to a literal "(unknown)" path instead of
            # interpolating the loop variable, so the fallback never created
            # the intended files.
            for filename in ["abbrev_types.txt", "ortho_context.tab"]:
                open(f"{output_dir}/{filename}", 'w').close()

        print("✅ All punkt_tab files created successfully")
        return True
    except Exception as e:
        print(f"❌ Error extracting punkt data: {e}")
        return False
if __name__ == "__main__":
    # Exit code 0 on successful extraction, 1 otherwise.
    sys.exit(0 if extract_punkt_data_to_files() else 1)
|