# evaluation/judges.py
from agents.nebius_simple import create_nebius_llm
import json
import re
import logging

logger = logging.getLogger(__name__)


class PersuasionJudge:
    def __init__(self, api_key):
        self.llm = create_nebius_llm(api_key, model="meta-llama/Llama-3.3-70B-Instruct", temperature=0.0)
    def evaluate_expert_skill(self, user_story: str, expert_card: str, bridges: list) -> dict:
        """
        Evaluates how skillfully the Expert Agent wrote its recommendation.
        """
        bridges_str = "\n- ".join(bridges)
        prompt = f"""
Act as a Senior Editor for a Movie Magazine.
Evaluate the quality and persuasiveness of the AI Critic's recommendation.

CONTEXT:
The User provided a detailed story (50+ words).
The AI Agent recommended a movie and wrote a "Justification".

--- USER STORY ---
"{user_story}"

--- AI RECOMMENDATION CARD ---
{expert_card}

--- EVALUATION TASK ---
Check if the AI Agent successfully built "Narrative Bridges" - connecting specific details from the User's story to the Movie.

Expected Bridges (The agent SHOULD mention these connections):
- {bridges_str}

Rate the Agent on 3 metrics (1-5 stars):
1. **Context Awareness (1-5)**: Did the agent reference specific details from the user's text (e.g. "You mentioned a botanist...") or did it use a generic template?
2. **Persuasiveness (1-5)**: Is the argument convincing? Does it explain WHY this movie matches the user's specific plot?
3. **Bridge Coverage (0-100%)**: What percentage of the "Expected Bridges" were explicitly addressed?

OUTPUT JSON ONLY:
{{
    "context_score": int,
    "persuasiveness_score": int,
    "bridge_coverage_percent": int,
    "missing_bridges": ["list of missed points"],
    "feedback": "Short critique for the agent"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip any Markdown code fences before parsing the JSON
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            return {"error": str(e), "persuasiveness_score": 0}
    def evaluate_real_world_interaction(self, user_story: str, expert_card: str, movie_metadata: dict) -> dict:
        """
        Reference-free evaluation of a live interaction.
        Checks groundedness and logical coherence without knowing the "correct" answer,
        and separates "factual hallucinations" from "weak thematic connections".
        """
        # Turn the movie metadata into text for the prompt
        facts_str = json.dumps({
            "title": movie_metadata.get("title"),
            "director": movie_metadata.get("director"),
            "cast": movie_metadata.get("cast"),
            "genres": movie_metadata.get("genres"),
            "plot_keywords": movie_metadata.get("narrative_features", "")
        }, ensure_ascii=False)
        prompt = f"""
You are an AI Auditor monitoring a Movie Recommendation System in production.
Your goal is to detect **Factual Hallucinations** and evaluate **Logical Coherence**.

--- INPUT DATA ---
1. USER STORY: "{user_story}"
2. REAL MOVIE FACTS (Ground Truth): {facts_str}
3. AGENT'S RECOMMENDATION TEXT: "{expert_card}"

--- AUDIT TASKS ---
**IMPORTANT DISTINCTION:**
- **Hallucination = Inventing facts that contradict Movie Facts** (e.g., wrong actors, wrong plot events)
- **Weak connection ≠ Hallucination** (e.g., "both are comedies with quirky characters" is NOT a hallucination, just a thematic bridge)

1. **Check Groundedness (Faithfulness)**:
   - Did the Agent mention any actors, directors, or plot details that CONTRADICT the Movie Facts?
   - If the Agent describes plot events NOT in the movie's overview, that is a HALLUCINATION.
   - If the Agent says "both films share a genre/mood/theme", that is NOT a hallucination.
   - Score 0 (False claims) to 1 (Fully supported by facts).

2. **Check Logical Link**:
   - Does the Agent clearly explain *how* the movie connects to the User Story?
   - Thematic connections ("both explore loneliness", "both are comedies") are VALID bridges.
   - Score 1 (Vague) to 5 (Strong logic).

3. **Hallucination Detection**:
   - Set `hallucination_detected: true` ONLY if the Agent invented false factual claims.
   - Examples of hallucinations: wrong actors, fabricated plot events, fake quotes.
   - Examples of NOT hallucinations: "both films share comedic tone", "similar narrative structure", "focuses on same themes".

OUTPUT JSON ONLY:
{{
    "groundedness_score": float,
    "coherence_score": int,
    "hallucination_detected": boolean,
    "hallucination_details": "string (what was invented?) or null",
    "reasoning": "Short audit report"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip any Markdown code fences before parsing the JSON
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Shadow eval failed: {e}")
            return {"error": str(e), "groundedness_score": 0}