# evaluation/run_evals.py
import asyncio
import datetime
import json
import os

from agents.retriever import RetrieverAgent
from agents.modal_agents import process_expert_agent
from evaluation.judges import PersuasionJudge


async def run_persuasion_eval():
    """Score every scenario in the golden dataset and write a JSON report.

    Requires the NEBIUS_API_KEY environment variable. Reads
    ``evaluation/golden_dataset.json``, runs retrieval + the expert agent
    for each case, grades the expert output with :class:`PersuasionJudge`,
    and saves a timestamped report under ``evaluation/``.

    Returns:
        None. Exits early (with a console message) if the API key or the
        dataset file is missing, or if the dataset is empty.
    """
    nebius_key = os.environ.get("NEBIUS_API_KEY")
    if not nebius_key:
        print("❌ Error: NEBIUS_API_KEY environment variable is not set.")
        return

    # Initialization
    judge = PersuasionJudge(nebius_key)
    retriever = RetrieverAgent()

    # Load the dataset (explicit UTF-8: report is written with ensure_ascii=False,
    # so the dataset may also contain non-ASCII text).
    try:
        with open("evaluation/golden_dataset.json", "r", encoding="utf-8") as f:
            dataset = json.load(f)
    except FileNotFoundError:
        print("❌ Error: evaluation/golden_dataset.json not found.")
        return

    if not dataset:
        # Guard: an empty dataset would cause ZeroDivisionError in the summary.
        print("❌ Error: golden dataset is empty — nothing to evaluate.")
        return

    print(f"🕵️ Starting Evaluation on {len(dataset)} scenarios...")

    report = {
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated.
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "total_cases": len(dataset),
        "results": [],
        "summary": {}
    }

    total_persuasiveness = 0
    total_context_score = 0

    for case in dataset:
        print(f"\nProcessing CASE ID: {case['id']}...")

        # 1. Retrieval
        retrieval = retriever.retrieve_candidates(case['query'], top_k=10)
        candidates = retrieval.get("candidates", [])

        # 2. Expert
        if candidates:
            # Use .remote for a synchronous call (or .aio for async, if configured).
            # In a local script it is simpler to call the remote function synchronously.
            expert_result = process_expert_agent.remote(case['query'], candidates)
            if isinstance(expert_result, dict):
                expert_text = expert_result.get("explanations", str(expert_result))
            else:
                expert_text = str(expert_result)
        else:
            expert_text = ""

        # 3. Judge
        verdict = judge.evaluate_expert_skill(
            user_story=case['query'],
            expert_card=expert_text,
            bridges=case.get('key_narrative_bridges', [])
        )

        # Collect metrics (missing scores count as 0 rather than crashing).
        p_score = verdict.get('persuasiveness_score', 0)
        c_score = verdict.get('context_score', 0)
        total_persuasiveness += p_score
        total_context_score += c_score

        print(f" Score: {p_score}/5 | Context: {c_score}/5")

        # Append to the report; only add an ellipsis when text was actually truncated.
        snippet = expert_text[:200] + ("..." if len(expert_text) > 200 else "")
        report["results"].append({
            "case_id": case["id"],
            "query": case["query"],
            "expected_movie": case.get("expected_movie"),
            "expert_output_snippet": snippet,
            "scores": verdict,
            "candidates_found": len(candidates)
        })

    # Final statistics
    report["summary"] = {
        "avg_persuasiveness": round(total_persuasiveness / len(dataset), 2),
        "avg_context_aware": round(total_context_score / len(dataset), 2)
    }

    # Save the report to a timestamped file.
    filename = f"evaluation/report_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print("\n✅ Evaluation Complete!")
    print(f"🏆 Average Persuasiveness: {report['summary']['avg_persuasiveness']}/5")
    # Fix: previously printed a literal placeholder instead of the report path.
    print(f"📄 Full report saved to: {filename}")


if __name__ == "__main__":
    asyncio.run(run_persuasion_eval())