# evaluation/judges.py
from agents.nebius_simple import create_nebius_llm
import json
import re
import logging

logger = logging.getLogger(__name__)


class PersuasionJudge:
    def __init__(self, api_key):
        self.llm = create_nebius_llm(api_key, model="meta-llama/Llama-3.3-70B-Instruct", temperature=0.0)

    def evaluate_expert_skill(self, user_story: str, expert_card: str, bridges: list) -> dict:
        """
        Evaluates the Expert Agent's skill at writing persuasive recommendations.
        """
        bridges_str = "\n- ".join(bridges)
        prompt = f"""
Act as a Senior Editor for a Movie Magazine.
Evaluate the quality and persuasiveness of the AI Critic's recommendation.
CONTEXT:
The User provided a detailed story (50+ words).
The AI Agent recommended a movie and wrote a "Justification".
--- USER STORY ---
"{user_story}"
--- AI RECOMMENDATION CARD ---
{expert_card}
--- EVALUATION TASK ---
Check if the AI Agent successfully built "Narrative Bridges" - connecting specific details from the User's story to the Movie.
Expected Bridges (The agent SHOULD mention these connections):
- {bridges_str}
Rate the Agent on 3 metrics (1-5 stars):
1. **Context Awareness (1-5)**: Did the agent reference specific details from the user's text (e.g. "You mentioned a botanist...") or did it use a generic template?
2. **Persuasiveness (1-5)**: Is the argument convincing? Does it explain WHY this movie matches the user's specific plot?
3. **Bridge Coverage (0-100%)**: What percentage of the "Expected Bridges" were explicitly addressed?
OUTPUT JSON ONLY:
{{
"context_score": int,
"persuasiveness_score": int,
"bridge_coverage_percent": int,
"missing_bridges": ["list of missed points"],
"feedback": "Short critique for the agent"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip Markdown code fences before parsing the JSON output
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Expert-skill eval failed: {e}")
            return {"error": str(e), "persuasiveness_score": 0}
    def evaluate_real_world_interaction(self, user_story: str, expert_card: str, movie_metadata: dict) -> dict:
        """
        Reference-free evaluation of a real dialogue.
        Checks groundedness and logical coherence without knowing the "correct" answer.
        Distinguishes "factual hallucinations" from "weak thematic connections".
        """
        # Convert the movie metadata into text for the prompt
        facts_str = json.dumps({
            "title": movie_metadata.get("title"),
            "director": movie_metadata.get("director"),
            "cast": movie_metadata.get("cast"),
            "genres": movie_metadata.get("genres"),
            "plot_keywords": movie_metadata.get("narrative_features", "")
        }, ensure_ascii=False)
        prompt = f"""
You are an AI Auditor monitoring a Movie Recommendation System in production.
Your goal is to detect **Factual Hallucinations** and evaluate **Logical Coherence**.
--- INPUT DATA ---
1. USER STORY: "{user_story}"
2. REAL MOVIE FACTS (Ground Truth): {facts_str}
3. AGENT'S RECOMMENDATION TEXT: "{expert_card}"
--- AUDIT TASKS ---
**IMPORTANT DISTINCTION:**
- **Hallucination = Inventing facts that contradict Movie Facts** (e.g., wrong actors, wrong plot events)
- **Weak connection ≠ Hallucination** (e.g., "both are comedies with quirky characters" is NOT a hallucination,
just a thematic bridge)
1. **Check Groundedness (Faithfulness)**:
- Did the Agent mention any actors, directors, or plot details that CONTRADICT the Movie Facts?
- If the Agent describes plot events NOT in the movie's overview, that is a HALLUCINATION.
- If the Agent says "both films share a genre/mood/theme", that is NOT a hallucination.
- Score 0 (False claims) to 1 (Fully supported by facts).
2. **Check Logical Link**:
- Does the Agent clearly explain *how* the movie connects to the User Story?
- Thematic connections ("both explore loneliness", "both are comedies") are VALID bridges.
- Score 1 (Vague) to 5 (Strong logic).
3. **Hallucination Detection**:
- Set `hallucination_detected: true` ONLY if the Agent invented false factual claims.
- Examples of hallucinations: wrong actors, fabricated plot events, fake quotes.
- Examples of NOT hallucinations: "both films share comedic tone", "similar narrative structure",
"focuses on same themes".
OUTPUT JSON ONLY:
{{
"groundedness_score": float,
"coherence_score": int,
"hallucination_detected": boolean,
"hallucination_details": "string (what was invented?) or null",
"reasoning": "Short audit report"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip Markdown code fences before parsing the JSON output
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Shadow eval failed: {e}")
            return {"error": str(e), "groundedness_score": 0}