# cinematch-ai / setup_punkt_extraction.py
# (HuggingFace page residue from extraction: uploaded by dbadeev,
#  "Upload 25 files", commit 16ce932 verified)
# setup_punkt_extraction.py
import pickle
import os
import ast
import sys
def extract_punkt_data_to_files(
    pickle_path="/root/nltk_data/tokenizers/punkt_tab/english/english.pickle",
    output_dir="/root/nltk_data/tokenizers/punkt_tab/english",
):
    """Extract data from a pickled punkt model into separate punkt_tab files.

    Loads the punkt sentence-tokenizer model at *pickle_path* and writes the
    plain-text resource files (``sent_starters.txt``, ``collocations.tab``,
    ``abbrev_types.txt``, ``ortho_context.tab``) into *output_dir*.  When a
    model attribute is missing or extraction fails, a default or empty file
    is written instead, so the output directory always ends up complete.

    Args:
        pickle_path: Path to the pickled punkt model (english.pickle).
        output_dir: Directory where the extracted files are written
            (assumed to already exist).

    Returns:
        True on success, False if the pickle could not be loaded at all.
    """
    # Fallback sentence starters used when the model carries none.
    default_starters = ["i", "you", "he", "she", "it", "we", "they", "the", "a", "an"]
    try:
        print(f"Loading punkt model from {pickle_path}")
        # NOTE(review): pickle.load executes arbitrary code from the file —
        # only run this on the trusted NLTK-distributed model.
        with open(pickle_path, 'rb') as f:
            punkt_model = pickle.load(f)
        print(f"Punkt model loaded successfully: {type(punkt_model)}")

        # 1. Extract sentence starters
        try:
            if hasattr(punkt_model, '_lang_vars') and punkt_model._lang_vars:
                sent_starters = punkt_model._lang_vars.sent_starters
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(sent_starters))
                print(f"✅ Created sent_starters.txt with {len(sent_starters)} entries")
            else:
                print("⚠️ No sentence starters found, creating default ones")
                with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                    f.write('\n'.join(default_starters))
        except Exception as e:
            print(f"⚠️ Error extracting sentence starters: {e}")
            # Fall back to the defaults so the file always exists.
            with open(f"{output_dir}/sent_starters.txt", 'w') as f:
                f.write('\n'.join(default_starters))

        # 2. Extract collocations
        try:
            if hasattr(punkt_model, '_params') and punkt_model._params:
                collocations = punkt_model._params.collocations
                with open(f"{output_dir}/collocations.tab", 'w') as f:
                    for (word1, word2), freq in collocations.items():
                        f.write(f"{word1}\t{word2}\t{freq}\n")
                print(f"✅ Created collocations.tab with {len(collocations)} entries")
            else:
                # Create an empty file
                open(f"{output_dir}/collocations.tab", 'w').close()
                print("✅ Created empty collocations.tab")
        except Exception as e:
            print(f"⚠️ Error extracting collocations: {e}")
            open(f"{output_dir}/collocations.tab", 'w').close()

        # 3. Create the remaining files
        try:
            # Abbreviations
            if hasattr(punkt_model, '_params') and hasattr(punkt_model._params, 'abbrev_types'):
                with open(f"{output_dir}/abbrev_types.txt", 'w') as f:
                    f.write('\n'.join(punkt_model._params.abbrev_types))
                print("✅ Created abbrev_types.txt from model")
            else:
                # Create an empty file
                open(f"{output_dir}/abbrev_types.txt", 'w').close()
                print("✅ Created empty abbrev_types.txt")
            # Ortho context (usually empty)
            open(f"{output_dir}/ortho_context.tab", 'w').close()
            print("✅ Created empty ortho_context.tab")
        except Exception as e:
            print(f"⚠️ Warning creating additional files: {e}")
            # Create empty files just in case.
            # BUG FIX: the original wrote to the literal path "(unknown)"
            # instead of using the loop variable, so neither file was created.
            for filename in ["abbrev_types.txt", "ortho_context.tab"]:
                open(f"{output_dir}/{filename}", 'w').close()

        print("✅ All punkt_tab files created successfully")
        return True
    except Exception as e:
        print(f"❌ Error extracting punkt data: {e}")
        return False
if __name__ == "__main__":
    # Map the boolean result directly onto the process exit status:
    # 0 on success, 1 on failure.
    sys.exit(0 if extract_punkt_data_to_files() else 1)