# evaluation/judges.py
from agents.nebius_simple import create_nebius_llm
import json
import re
import logging

logger = logging.getLogger(__name__)


class PersuasionJudge:
    def __init__(self, api_key):
        self.llm = create_nebius_llm(api_key, model="meta-llama/Llama-3.3-70B-Instruct", temperature=0.0)

    def evaluate_expert_skill(self, user_story: str, expert_card: str, bridges: list) -> dict:
        """
        Evaluates the Expert Agent's skill at writing persuasive recommendations.
        """
        bridges_str = "\n- ".join(bridges)
        prompt = f"""
Act as a Senior Editor for a Movie Magazine.
Evaluate the quality and persuasiveness of the AI Critic's recommendation.
CONTEXT:
The User provided a detailed story (50+ words).
The AI Agent recommended a movie and wrote a "Justification".
--- USER STORY ---
"{user_story}"
--- AI RECOMMENDATION CARD ---
{expert_card}
--- EVALUATION TASK ---
Check if the AI Agent successfully built "Narrative Bridges" - connecting specific details from the User's story to the Movie.
Expected Bridges (The agent SHOULD mention these connections):
- {bridges_str}
Rate the Agent on 3 metrics (1-5 stars):
1. **Context Awareness (1-5)**: Did the agent reference specific details from the user's text (e.g. "You mentioned a botanist...") or did it use a generic template?
2. **Persuasiveness (1-5)**: Is the argument convincing? Does it explain WHY this movie matches the user's specific plot?
3. **Bridge Coverage (0-100%)**: What percentage of the "Expected Bridges" were explicitly addressed?
OUTPUT JSON ONLY:
{{
"context_score": int,
"persuasiveness_score": int,
"bridge_coverage_percent": int,
"missing_bridges": ["list of missed points"],
"feedback": "Short critique for the agent"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip Markdown code fences before parsing the JSON output
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Expert-skill eval failed: {e}")
            return {"error": str(e), "persuasiveness_score": 0}
    def evaluate_real_world_interaction(self, user_story: str, expert_card: str, movie_metadata: dict) -> dict:
        """
        Reference-free evaluation of a real dialogue.
        Checks groundedness and logical coherence without knowing the "correct" answer.
        Distinguishes "factual hallucinations" from "weak thematic connections".
        """
        # Convert the movie metadata into text for the prompt
        facts_str = json.dumps({
            "title": movie_metadata.get("title"),
            "director": movie_metadata.get("director"),
            "cast": movie_metadata.get("cast"),
            "genres": movie_metadata.get("genres"),
            "plot_keywords": movie_metadata.get("narrative_features", "")
        }, ensure_ascii=False)
        prompt = f"""
You are an AI Auditor monitoring a Movie Recommendation System in production.
Your goal is to detect **Factual Hallucinations** and evaluate **Logical Coherence**.
--- INPUT DATA ---
1. USER STORY: "{user_story}"
2. REAL MOVIE FACTS (Ground Truth): {facts_str}
3. AGENT'S RECOMMENDATION TEXT: "{expert_card}"
--- AUDIT TASKS ---
**IMPORTANT DISTINCTION:**
- **Hallucination = Inventing facts that contradict Movie Facts** (e.g., wrong actors, wrong plot events)
- **Weak connection ≠ Hallucination** (e.g., "both are comedies with quirky characters" is NOT a hallucination,
just a thematic bridge)
1. **Check Groundedness (Faithfulness)**:
- Did the Agent mention any actors, directors, or plot details that CONTRADICT the Movie Facts?
- If the Agent describes plot events NOT in the movie's overview, that is a HALLUCINATION.
- If the Agent says "both films share a genre/mood/theme", that is NOT a hallucination.
- Score 0 (False claims) to 1 (Fully supported by facts).
2. **Check Logical Link**:
- Does the Agent clearly explain *how* the movie connects to the User Story?
- Thematic connections ("both explore loneliness", "both are comedies") are VALID bridges.
- Score 1 (Vague) to 5 (Strong logic).
3. **Hallucination Detection**:
- Set `hallucination_detected: true` ONLY if the Agent invented false factual claims.
- Examples of hallucinations: wrong actors, fabricated plot events, fake quotes.
- Examples of NOT hallucinations: "both films share comedic tone", "similar narrative structure",
"focuses on same themes".
OUTPUT JSON ONLY:
{{
"groundedness_score": float,
"coherence_score": int,
"hallucination_detected": boolean,
"hallucination_details": "string (what was invented?) or null",
"reasoning": "Short audit report"
}}
"""
        try:
            response = self.llm.complete(prompt).text
            # Strip Markdown code fences before parsing the JSON output
            cleaned = re.sub(r"```json|```", "", response).strip()
            return json.loads(cleaned)
        except Exception as e:
            logger.error(f"Shadow eval failed: {e}")
            return {"error": str(e), "groundedness_score": 0}