Antonio Montieri committed on
Commit
44b3f53
·
1 Parent(s): 41e6aba

app redone

Browse files
Files changed (3) hide show
  1. agent.py +55 -50
  2. app.py +180 -231
  3. appOLD.py +265 -0
agent.py CHANGED
@@ -101,57 +101,62 @@ def build_graph(provider: str = "mistral"):
101
 
102
  return builder.compile()
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- # ----------------------------
106
- # Minimal Test Execution
107
- # ----------------------------
108
- if __name__ == "__main__":
109
- gaia_tasks = [
110
- {
111
- "id": "6f37996b",
112
- "question": """Given this table defining * on the set S = {a, b, c, d, e}
113
- |*|a|b|c|d|e|
114
- |---|---|---|---|---|---|
115
- |a|a|b|c|b|d|
116
- |b|b|c|a|e|c|
117
- |c|c|a|b|b|a|
118
- |d|b|e|b|e|d|
119
- |e|d|b|a|d|c|
120
- provide the subset of S involved in any possible counter-examples that prove * is not commutative.
121
- Provide your answer as a comma separated list of the elements in the set in alphabetical order."""
122
- },
123
- {
124
- "id": "305ac316",
125
- "question": "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name."
126
- },
127
- {
128
- "id": "840bfca7",
129
- "question": "On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?"
130
- }
131
- ]
132
 
133
- try:
134
- graph = build_graph(provider="mistral")
135
-
136
- for i, task in enumerate(gaia_tasks, 1):
137
- print(f"\n{'#'*60}")
138
- print(f"STARTING TASK {i} [ID: {task['id']}]")
139
- print(f"{'#'*60}")
140
-
141
- try:
142
- messages = [HumanMessage(content=task['question'])]
143
- result = graph.invoke(
144
- {"messages": messages},
145
- config={"recursion_limit": 100}
146
- )
147
-
148
- print("\nFULL EXECUTION LOG:")
149
- for m in result["messages"]:
150
- m.pretty_print()
151
- print("-" * 40)
152
-
153
- except Exception as e:
154
- print(f"ERROR executing task {task['id']}: {e}")
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  except Exception as e:
157
- print(f"CRITICAL SETUP ERROR: {e}")
 
101
 
102
  return builder.compile()
103
 
104
# =============================================================================
# 3. GAIAgent WRAPPER
# =============================================================================
class GAIAgent:
    """Thin callable wrapper around the compiled LangGraph agent.

    Used by app.py: question string in, leaderboard-ready answer string out.
    """

    def __init__(self):
        # Build the graph once and reuse it for every question.
        self.graph = build_graph(provider="mistral")

    def _clean_answer(self, text: str) -> str:
        """Normalise the raw LLM output for the leaderboard's exact match.

        Priority: a **bold** span wins outright; otherwise keep what follows
        a 'FINAL ANSWER:' marker, strip residual markdown, keep only the
        first non-empty line, and drop a trailing period.
        """
        if not text:
            return ""

        # A bold span is taken verbatim as the answer.
        emphasised = re.search(r"\*\*(.*?)\*\*", text)
        if emphasised is not None:
            return emphasised.group(1).strip()

        # Otherwise keep only what follows the 'FINAL ANSWER:' marker.
        marker = re.search(r"FINAL\s*ANSWER\s*:\s*(.*)", text, re.IGNORECASE | re.DOTALL)
        if marker is not None:
            text = marker.group(1).strip()

        # Drop residual markdown tokens.
        text = text.replace("**", "").replace("`", "").strip()

        # Explanations usually follow on later lines; keep the first one.
        non_empty = [chunk.strip() for chunk in text.split('\n') if chunk.strip()]
        if non_empty:
            text = non_empty[0]

        # Strip a terminal period, but not on very short codes like "4.5".
        if len(text) > 4 and text.endswith("."):
            text = text[:-1]

        return text

    def __call__(self, question: str) -> str:
        """Public entry point called by app.py.

        Accepts a question string, runs the graph, returns the cleaned answer.
        """
        conversation = [HumanMessage(content=question)]
        outcome = self.graph.invoke(
            {"messages": conversation},
            config={"recursion_limit": 60},
        )
        return self._clean_answer(outcome["messages"][-1].content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
# =============================================================================
# 4. MAIN TEST
# =============================================================================
def _smoke_test() -> None:
    """Run the agent once on a sample GAIA question and print the result."""
    sample_question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."

    print("--- 🚀 Starting GAIAgent ---")

    try:
        runner = GAIAgent()
        print(f"❓ Question: {sample_question}")
        reply = runner(sample_question)
        print(f"✅ FINAL ANSWER: {reply}")
    except Exception as e:
        # Any setup or execution failure is reported, not raised.
        print(f" Error: {e}")


if __name__ == "__main__":
    _smoke_test()
app.py CHANGED
@@ -1,265 +1,214 @@
1
  import os
2
- import re
3
- import gradio as gr
4
  import requests
5
- import inspect
6
  import pandas as pd
 
 
7
 
8
- from langchain_core.messages import HumanMessage
9
- from agent import build_graph
 
 
 
10
 
11
- # (Keep Constants as is)
12
  # --- Constants ---
13
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
14
 
15
- # --- GAIAgent Definition ---
16
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
17
- class GAIAgent:
18
- def __init__(self):
19
- self.graph = build_graph(provider = "groq")
20
 
21
-
22
- def _clean_answer(self, text: str) -> str:
23
- """
24
- Pulisce la risposta per garantire l'Exact Match.
25
- Strategia:
26
- 1. Se c'è qualcosa tra **doppi asterischi**, è probabilmente la risposta.
27
- 2. Se c'è 'FINAL ANSWER:', prendi quello che segue.
28
- 3. Pulisci markdown e punteggiatura.
29
- """
30
- if not text:
31
- return ""
32
 
33
- # --- STRATEGIA 1: CATTURA GRASSETTO (Priorità Alta) ---
34
- # Se l'LLM scrive "The answer is **128**", prendiamo solo "128".
35
- bold_match = re.search(r"\*\*(.*?)\*\*", text)
36
- if bold_match:
37
- return bold_match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- # --- STRATEGIA 2: CATTURA PREFIX (Standard) ---
40
- match = re.search(r"FINAL\s*ANSWER\s*:\s*(.*)", text, re.IGNORECASE | re.DOTALL)
41
- if match:
42
- text = match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- # --- PULIZIA GENERALE ---
45
- # Rimuove markdown residuo
46
- text = text.replace("**", "").replace("`", "").strip()
47
-
48
- # Prende solo la prima riga (spesso le spiegazioni sono sotto)
49
- lines = [L.strip() for L in text.split('\n') if L.strip()]
50
- if lines: text = lines[0]
51
 
52
- # --- STRATEGIA 3: HEURISTICA "THE ANSWER IS..." ---
53
- # Se dopo tutto ciò abbiamo ancora una frase lunga che finisce con il codice
54
- # Esempio: "The number is 80GSFC21M0002."
55
- # Proviamo a splittare su " is " o " are "
56
- if len(text.split()) > 5: # Se è una frase lunga
57
- split_match = re.search(r"\b(is|are|was|were)\b\s*(.*)$", text, re.IGNORECASE)
58
- if split_match:
59
- candidate = split_match.group(2).strip()
60
- # Se quello che rimane è breve (la risposta), prendiamo quello
61
- if len(candidate.split()) <= 3:
62
- text = candidate
63
 
64
- # Rimuove punto finale
65
- if text.endswith(".") and len(text) > 4:
66
- text = text[:-1]
67
-
68
- return text
69
 
 
70
 
71
- def __call__(self, question: str) -> str:
72
- from langchain_core.messages import HumanMessage
73
- messages = [HumanMessage(content=question)]
74
 
75
- # Eseguiamo il grafo con un limite di ricorsione alto
76
- result = self.graph.invoke(
77
- {"messages": messages},
78
- config={"recursion_limit": 100}
79
- )
 
 
 
 
 
 
 
 
80
 
81
- raw_content = result["messages"][-1].content
 
 
82
 
83
- # --- FASE DI PULIZIA ---
84
- clean_content = self._clean_answer(raw_content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
- # Debug print per vedere cosa stiamo tagliando (utile per te)
87
- if raw_content != clean_content:
88
- print(f"CLEANED OUTPUT:\nRaw: '{raw_content}'\nClean: '{clean_content}'")
89
-
 
 
 
90
 
91
- def run_and_submit_all( profile: gr.OAuthProfile | None):
92
- """
93
- Fetches all questions, runs the GAIAgent on them, submits all answers,
94
- and displays the results.
95
- """
96
- # --- Determine HF Space Runtime URL and Repo URL ---
97
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
98
 
99
- if profile:
100
- username= f"{profile.username}"
101
- print(f"User logged in: {username}")
102
- else:
103
- print("User not logged in.")
104
- return "Please Login to Hugging Face with the button.", None
105
 
106
- api_url = DEFAULT_API_URL
107
- questions_url = f"{api_url}/questions"
108
- submit_url = f"{api_url}/submit"
 
 
 
 
 
109
 
110
- # 1. Instantiate Agent ( modify this part to create your agent)
111
- try:
112
- agent = GAIAgent()
113
- except Exception as e:
114
- print(f"Error instantiating agent: {e}")
115
- return f"Error initializing agent: {e}", None
116
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
117
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
118
- print(agent_code)
119
-
120
- # 2. Fetch Questions
121
- print(f"Fetching questions from: {questions_url}")
122
- try:
123
- response = requests.get(questions_url, timeout=15)
124
- response.raise_for_status()
125
- questions_data = response.json()
126
- if not questions_data:
127
- print("Fetched questions list is empty.")
128
- return "Fetched questions list is empty or invalid format.", None
129
- print(f"Fetched {len(questions_data)} questions.")
130
- except requests.exceptions.RequestException as e:
131
- print(f"Error fetching questions: {e}")
132
- return f"Error fetching questions: {e}", None
133
- except requests.exceptions.JSONDecodeError as e:
134
- print(f"Error decoding JSON response from questions endpoint: {e}")
135
- print(f"Response text: {response.text[:500]}")
136
- return f"Error decoding server response for questions: {e}", None
137
- except Exception as e:
138
- print(f"An unexpected error occurred fetching questions: {e}")
139
- return f"An unexpected error occurred fetching questions: {e}", None
140
-
141
- # 3. Run your Agent
142
- results_log = []
143
- answers_payload = []
144
- print(f"Running agent on {len(questions_data)} questions...")
145
- for item in questions_data:
146
- task_id = item.get("task_id")
147
- question_text = item.get("question")
148
- if not task_id or question_text is None:
149
- print(f"Skipping item with missing task_id or question: {item}")
150
- continue
151
- try:
152
- submitted_answer = agent(question_text)
153
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
154
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
155
- except Exception as e:
156
- print(f"Error running agent on task {task_id}: {e}")
157
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
158
-
159
- if not answers_payload:
160
- print("Agent did not produce any answers to submit.")
161
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
162
-
163
- # 4. Prepare Submission
164
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
165
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
166
- print(status_update)
167
 
168
- # 5. Submit
169
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
170
  try:
171
- response = requests.post(submit_url, json=submission_data, timeout=60)
172
  response.raise_for_status()
173
- result_data = response.json()
174
- final_status = (
175
- f"Submission Successful!\n"
176
- f"User: {result_data.get('username')}\n"
177
- f"Overall Score: {result_data.get('score', 'N/A')}% "
178
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
179
- f"Message: {result_data.get('message', 'No message received.')}"
180
- )
181
- print("Submission successful.")
182
- results_df = pd.DataFrame(results_log)
183
- return final_status, results_df
184
- except requests.exceptions.HTTPError as e:
185
- error_detail = f"Server responded with status {e.response.status_code}."
186
- try:
187
- error_json = e.response.json()
188
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
189
- except requests.exceptions.JSONDecodeError:
190
- error_detail += f" Response: {e.response.text[:500]}"
191
- status_message = f"Submission Failed: {error_detail}"
192
- print(status_message)
193
- results_df = pd.DataFrame(results_log)
194
- return status_message, results_df
195
- except requests.exceptions.Timeout:
196
- status_message = "Submission Failed: The request timed out."
197
- print(status_message)
198
- results_df = pd.DataFrame(results_log)
199
- return status_message, results_df
200
- except requests.exceptions.RequestException as e:
201
- status_message = f"Submission Failed: Network error - {e}"
202
- print(status_message)
203
- results_df = pd.DataFrame(results_log)
204
- return status_message, results_df
205
  except Exception as e:
206
- status_message = f"An unexpected error occurred during submission: {e}"
207
- print(status_message)
208
- results_df = pd.DataFrame(results_log)
209
- return status_message, results_df
210
-
211
-
212
- # --- Build Gradio Interface using Blocks ---
213
- with gr.Blocks() as demo:
214
- gr.Markdown("# GAIAgent Evaluation Runner")
215
- gr.Markdown(
216
- """
217
- **Instructions:**
218
-
219
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
220
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
221
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
222
 
223
- ---
224
- **Disclaimers:**
225
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
226
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
227
- """
228
- )
229
 
230
- gr.LoginButton()
231
-
232
- run_button = gr.Button("Run Evaluation & Submit All Answers")
233
-
234
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
235
- # Removed max_rows=10 from DataFrame constructor
236
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
 
 
 
 
237
 
238
- run_button.click(
239
- fn=run_and_submit_all,
240
- outputs=[status_output, results_table]
241
- )
 
242
 
243
  if __name__ == "__main__":
244
- print("\n" + "-"*30 + " App Starting " + "-"*30)
245
- # Check for SPACE_HOST and SPACE_ID at startup for information
246
- space_host_startup = os.getenv("SPACE_HOST")
247
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
248
-
249
- if space_host_startup:
250
- print(f"✅ SPACE_HOST found: {space_host_startup}")
251
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
252
- else:
253
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
254
-
255
- if space_id_startup: # Print repo URLs if SPACE_ID is found
256
- print(f"✅ SPACE_ID found: {space_id_startup}")
257
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
258
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
259
- else:
260
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
261
-
262
- print("-"*(60 + len(" App Starting ")) + "\n")
263
-
264
- print("Launching Gradio Interface for GAIAgent Evaluation...")
265
- demo.launch(debug=True, share=False)
 
1
  import os
2
+ import json
 
3
  import requests
 
4
  import pandas as pd
5
+ import gradio as gr
6
+ import time
7
 
8
+ # Import the modular agent
9
+ try:
10
+ from agent import GAIAgent
11
+ except ImportError:
12
+ print("WARNING: agent.py not found.")
13
 
 
14
  # --- Constants ---
15
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
16
+ CACHE_FILE = "answers_cache.json"
17
 
18
+ # --- RATE LIMIT CONFIGURATION ---
19
+ MAX_RETRIES = 3 # Max attempts per question
20
+ REQUEST_DELAY = 5 # Seconds to wait between questions (Grace Time)
21
+ ERROR_DELAY = 10 # Seconds to wait after an error
 
22
 
23
def fetch_questions():
    """Download the evaluation questions from the scoring API.

    Returns the parsed JSON payload, or an empty list on any failure
    (network error, bad HTTP status, malformed JSON).
    """
    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
        resp.raise_for_status()
        return resp.json()
    except Exception as exc:
        print(f"Error fetching questions: {exc}")
        return []
 
 
32
 
33
def load_cache_as_dict():
    """Load the answer cache as a ``{task_id: submitted_answer}`` dict.

    Returns an empty dict when the cache file is missing, unreadable, or
    malformed, so callers can always do plain lookups without guarding.
    """
    if not os.path.exists(CACHE_FILE):
        return {}
    try:
        with open(CACHE_FILE, "r") as f:
            data = json.load(f)
        return {item["task_id"]: item["submitted_answer"] for item in data}
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit. OSError: unreadable file;
    # ValueError: bad JSON (JSONDecodeError subclass);
    # KeyError/TypeError: valid JSON with an unexpected shape.
    except (OSError, ValueError, KeyError, TypeError) as exc:
        print(f"Warning: ignoring unreadable cache {CACHE_FILE}: {exc}")
    return {}
43
+
44
def save_cache(data):
    """Persist the answers-so-far to CACHE_FILE as pretty-printed JSON.

    Failures are reported but never raised, so a broken disk cannot
    abort a generation run.
    """
    try:
        with open(CACHE_FILE, "w") as handle:
            json.dump(data, handle, indent=2)
    except Exception as exc:
        print(f"Error saving cache: {exc}")
51
+
52
# =============================================================================
# 🔎 VALIDATION LOGIC
# =============================================================================
def is_valid_answer(answer: str) -> bool:
    """Decide whether an agent answer is worth caching and submitting.

    Rejects empty strings, error markers, over-long replies, and chatty
    phrasings — except the deliberate "not known" give-up sentinel.
    """
    if not answer:
        return False
    if "AGENT ERROR" in answer or "Error:" in answer:
        return False
    if len(answer) > 150:
        # Leaderboard answers are short; anything longer is an explanation.
        return False

    lowered = answer.lower()
    for phrase in ("I cannot", "I am sorry", "The answer is", "Based on"):
        if phrase.lower() in lowered:
            # The explicit give-up sentinel still counts as a valid submission.
            return "not known" in lowered
    return True
67
+
68
# =============================================================================
# 1. GENERATOR FUNCTION (With Grace Time & Retry)
# =============================================================================
def generate_answers(profile: gr.OAuthProfile | None):
    """
    Runs the agent with strategic pauses to avoid Rate Limits.

    Gradio generator: yields (status_message, results_dataframe) pairs so the
    UI updates live after every task. Valid cached answers are reused without
    re-running the agent; everything else is generated with up to MAX_RETRIES
    attempts, sleeping REQUEST_DELAY after a success and ERROR_DELAY after an
    invalid answer or exception.
    """
    # Login is required because the cached payload is later submitted
    # under the user's Hugging Face username.
    if not profile:
        yield "⚠️ Please login with Hugging Face first!", pd.DataFrame()
        return

    existing_answers = load_cache_as_dict()
    questions = fetch_questions()

    if not questions:
        yield "❌ Failed to fetch questions.", pd.DataFrame()
        return

    try:
        agent = GAIAgent()
    except Exception as e:
        yield f" Error initializing agent: {e}", pd.DataFrame()
        return

    results_log = []
    payload_cache = []
    total = len(questions)

    yield f"🚀 Starting with {REQUEST_DELAY}s grace time...", pd.DataFrame()

    for i, item in enumerate(questions):
        # NOTE(review): task_id/question_text are not validated here; a None
        # question would break question_text[:50] below — confirm the API
        # always supplies both fields.
        task_id = item.get("task_id")
        question_text = item.get("question")

        # --- CACHE CHECK (Fast) ---
        # An invalid cached answer deliberately falls through to regeneration.
        if task_id in existing_answers:
            answer = existing_answers[task_id]
            if is_valid_answer(answer):
                results_log.append({
                    "Task ID": task_id,
                    "Status": "⚡ Cached",
                    "Question": question_text[:50] + "...",
                    "Answer": answer
                })
                payload_cache.append({"task_id": task_id, "submitted_answer": answer})
                yield f"⚡ Task {i+1} Cached.", pd.DataFrame(results_log)
                continue

        # --- REAL GENERATION (Slow) ---
        # Sentinel recorded if every retry fails; is_valid_answer treats
        # "not known" as an acceptable submission.
        final_answer = "NOT KNOWN"
        success = False

        for attempt in range(MAX_RETRIES):
            try:
                status_msg = f"⏳ Task {i+1}/{total} (Try {attempt+1})..."
                yield status_msg, pd.DataFrame(results_log)

                answer = agent(question_text)

                if is_valid_answer(answer):
                    final_answer = answer
                    success = True
                    # Cooling down
                    wait_time = REQUEST_DELAY
                    yield f"✅ Success. Cooling down for {wait_time}s...", pd.DataFrame(results_log)
                    time.sleep(wait_time)
                    break
                else:
                    print(f"⚠️ Invalid answer: {answer}")
                    yield f"⚠️ Invalid format. Retrying in {ERROR_DELAY}s...", pd.DataFrame(results_log)
                    time.sleep(ERROR_DELAY)

            except Exception as e:
                # Typically a provider rate-limit; back off before retrying.
                print(f"❌ Error on attempt {attempt+1}: {e}")
                yield f"❌ Error. Pausing {ERROR_DELAY}s...", pd.DataFrame(results_log)
                time.sleep(ERROR_DELAY)

        # Save Result
        status_label = "✅ Generated" if success else "⚠️ Failed"
        results_log.append({
            "Task ID": task_id,
            "Status": status_label,
            "Question": question_text[:50] + "...",
            "Answer": final_answer
        })

        payload_cache.append({
            "task_id": task_id,
            "submitted_answer": final_answer
        })

        # Persist after EVERY task so an interrupted run can resume.
        # NOTE(review): save_cache overwrites the file with only the tasks
        # processed so far this run — cache entries for not-yet-reached tasks
        # are dropped until re-encountered; confirm this is intended.
        save_cache(payload_cache)
        yield f"🏁 Logic Complete Task {i+1}", pd.DataFrame(results_log)

    yield "🎉 Generation Complete! Ready to Submit.", pd.DataFrame(results_log)
 
 
 
 
 
 
163
 
 
 
 
 
 
 
164
 
165
# =============================================================================
# 2. SUBMIT FUNCTION
# =============================================================================
def submit_to_leaderboard(profile: gr.OAuthProfile | None):
    """Send every cached answer to the scoring endpoint.

    Requires a logged-in Hugging Face profile and a non-empty cache;
    returns a human-readable status string in every case.
    """
    if not profile:
        return "⚠️ Please login first!"

    cached = load_cache_as_dict()
    if not cached:
        return "⚠️ No answers found."

    body = {
        "username": profile.username,
        # Points reviewers at the Space's code tree; falls back for local runs.
        "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID', 'local-test')}/tree/main",
        "answers": [
            {"task_id": tid, "submitted_answer": ans} for tid, ans in cached.items()
        ],
    }

    try:
        reply = requests.post(f"{DEFAULT_API_URL}/submit", json=body, timeout=60)
        reply.raise_for_status()
        scored = reply.json()
        return f"✅ SUCCESS! Score: {scored.get('score')}% ({scored.get('correct_count')} correct)"
    except Exception as exc:
        return f" Submission Failed: {str(exc)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
 
 
 
 
 
 
193
 
194
# =============================================================================
# 3. INTERFACE
# =============================================================================
# Two-step flow: (1) generate answers with rate-limit-safe pacing and cache
# them to disk, (2) submit the cached answers to the leaderboard separately.
with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 GAIA Agent Evaluation (Robust)")
    gr.Markdown(f"**Settings:** Grace Time: {REQUEST_DELAY}s | Max Retries: {MAX_RETRIES}")

    with gr.Row(): login_btn = gr.LoginButton()

    with gr.Row():
        btn_gen = gr.Button("1. Generate Answers (Safe Mode)", variant="primary")
        btn_sub = gr.Button("2. Submit Results", variant="secondary")

    status = gr.Textbox(label="Status", interactive=False)
    table = gr.DataFrame(label="Results", wrap=True)

    # NOTE(review): passing the LoginButton as `inputs` appears to rely on
    # Gradio resolving it to the user's gr.OAuthProfile — confirm against the
    # Gradio OAuth documentation for the pinned version.
    btn_gen.click(fn=generate_answers, inputs=[login_btn], outputs=[status, table])
    btn_sub.click(fn=submit_to_leaderboard, inputs=[login_btn], outputs=[status])

if __name__ == "__main__":
    demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
appOLD.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import gradio as gr
4
+ import requests
5
+ import inspect
6
+ import pandas as pd
7
+
8
+ from langchain_core.messages import HumanMessage
9
+ from agent import build_graph
10
+
11
+ # (Keep Constants as is)
12
+ # --- Constants ---
13
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
14
+
15
+ # --- GAIAgent Definition ---
16
+ # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
17
+ class GAIAgent:
18
+ def __init__(self):
19
+ self.graph = build_graph(provider = "groq")
20
+
21
+
22
+ def _clean_answer(self, text: str) -> str:
23
+ """
24
+ Pulisce la risposta per garantire l'Exact Match.
25
+ Strategia:
26
+ 1. Se c'è qualcosa tra **doppi asterischi**, è probabilmente la risposta.
27
+ 2. Se c'è 'FINAL ANSWER:', prendi quello che segue.
28
+ 3. Pulisci markdown e punteggiatura.
29
+ """
30
+ if not text:
31
+ return ""
32
+
33
+ # --- STRATEGIA 1: CATTURA GRASSETTO (Priorità Alta) ---
34
+ # Se l'LLM scrive "The answer is **128**", prendiamo solo "128".
35
+ bold_match = re.search(r"\*\*(.*?)\*\*", text)
36
+ if bold_match:
37
+ return bold_match.group(1).strip()
38
+
39
+ # --- STRATEGIA 2: CATTURA PREFIX (Standard) ---
40
+ match = re.search(r"FINAL\s*ANSWER\s*:\s*(.*)", text, re.IGNORECASE | re.DOTALL)
41
+ if match:
42
+ text = match.group(1).strip()
43
+
44
+ # --- PULIZIA GENERALE ---
45
+ # Rimuove markdown residuo
46
+ text = text.replace("**", "").replace("`", "").strip()
47
+
48
+ # Prende solo la prima riga (spesso le spiegazioni sono sotto)
49
+ lines = [L.strip() for L in text.split('\n') if L.strip()]
50
+ if lines: text = lines[0]
51
+
52
+ # --- STRATEGIA 3: HEURISTICA "THE ANSWER IS..." ---
53
+ # Se dopo tutto ciò abbiamo ancora una frase lunga che finisce con il codice
54
+ # Esempio: "The number is 80GSFC21M0002."
55
+ # Proviamo a splittare su " is " o " are "
56
+ if len(text.split()) > 5: # Se è una frase lunga
57
+ split_match = re.search(r"\b(is|are|was|were)\b\s*(.*)$", text, re.IGNORECASE)
58
+ if split_match:
59
+ candidate = split_match.group(2).strip()
60
+ # Se quello che rimane è breve (la risposta), prendiamo quello
61
+ if len(candidate.split()) <= 3:
62
+ text = candidate
63
+
64
+ # Rimuove punto finale
65
+ if text.endswith(".") and len(text) > 4:
66
+ text = text[:-1]
67
+
68
+ return text
69
+
70
+
71
+ def __call__(self, question: str) -> str:
72
+ from langchain_core.messages import HumanMessage
73
+ messages = [HumanMessage(content=question)]
74
+
75
+ # Eseguiamo il grafo con un limite di ricorsione alto
76
+ result = self.graph.invoke(
77
+ {"messages": messages},
78
+ config={"recursion_limit": 100}
79
+ )
80
+
81
+ raw_content = result["messages"][-1].content
82
+
83
+ # --- FASE DI PULIZIA ---
84
+ clean_content = self._clean_answer(raw_content)
85
+
86
+ # Debug print per vedere cosa stiamo tagliando (utile per te)
87
+ if raw_content != clean_content:
88
+ print(f"CLEANED OUTPUT:\nRaw: '{raw_content}'\nClean: '{clean_content}'")
89
+
90
+
91
+ def run_and_submit_all( profile: gr.OAuthProfile | None):
92
+ """
93
+ Fetches all questions, runs the GAIAgent on them, submits all answers,
94
+ and displays the results.
95
+ """
96
+ # --- Determine HF Space Runtime URL and Repo URL ---
97
+ space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
98
+
99
+ if profile:
100
+ username= f"{profile.username}"
101
+ print(f"User logged in: {username}")
102
+ else:
103
+ print("User not logged in.")
104
+ return "Please Login to Hugging Face with the button.", None
105
+
106
+ api_url = DEFAULT_API_URL
107
+ questions_url = f"{api_url}/questions"
108
+ submit_url = f"{api_url}/submit"
109
+
110
+ # 1. Instantiate Agent ( modify this part to create your agent)
111
+ try:
112
+ agent = GAIAgent()
113
+ except Exception as e:
114
+ print(f"Error instantiating agent: {e}")
115
+ return f"Error initializing agent: {e}", None
116
+ # In the case of an app running as a hugging Face space, this link points toward your codebase ( useful for others so please keep it public)
117
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
118
+ print(agent_code)
119
+
120
+ # 2. Fetch Questions
121
+ print(f"Fetching questions from: {questions_url}")
122
+ try:
123
+ response = requests.get(questions_url, timeout=15)
124
+ response.raise_for_status()
125
+ questions_data = response.json()
126
+ if not questions_data:
127
+ print("Fetched questions list is empty.")
128
+ return "Fetched questions list is empty or invalid format.", None
129
+ print(f"Fetched {len(questions_data)} questions.")
130
+ except requests.exceptions.RequestException as e:
131
+ print(f"Error fetching questions: {e}")
132
+ return f"Error fetching questions: {e}", None
133
+ except requests.exceptions.JSONDecodeError as e:
134
+ print(f"Error decoding JSON response from questions endpoint: {e}")
135
+ print(f"Response text: {response.text[:500]}")
136
+ return f"Error decoding server response for questions: {e}", None
137
+ except Exception as e:
138
+ print(f"An unexpected error occurred fetching questions: {e}")
139
+ return f"An unexpected error occurred fetching questions: {e}", None
140
+
141
+ # 3. Run your Agent
142
+ results_log = []
143
+ answers_payload = []
144
+ print(f"Running agent on {len(questions_data)} questions...")
145
+ for item in questions_data:
146
+ task_id = item.get("task_id")
147
+ question_text = item.get("question")
148
+ if not task_id or question_text is None:
149
+ print(f"Skipping item with missing task_id or question: {item}")
150
+ continue
151
+ try:
152
+ submitted_answer = agent(question_text)
153
+ answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
154
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
155
+ except Exception as e:
156
+ print(f"Error running agent on task {task_id}: {e}")
157
+ results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
158
+
159
+ if not answers_payload:
160
+ print("Agent did not produce any answers to submit.")
161
+ return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
162
+
163
+ # 4. Prepare Submission
164
+ submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
165
+ status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
166
+ print(status_update)
167
+
168
+ # 5. Submit
169
+ print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
170
+ try:
171
+ response = requests.post(submit_url, json=submission_data, timeout=60)
172
+ response.raise_for_status()
173
+ result_data = response.json()
174
+ final_status = (
175
+ f"Submission Successful!\n"
176
+ f"User: {result_data.get('username')}\n"
177
+ f"Overall Score: {result_data.get('score', 'N/A')}% "
178
+ f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
179
+ f"Message: {result_data.get('message', 'No message received.')}"
180
+ )
181
+ print("Submission successful.")
182
+ results_df = pd.DataFrame(results_log)
183
+ return final_status, results_df
184
+ except requests.exceptions.HTTPError as e:
185
+ error_detail = f"Server responded with status {e.response.status_code}."
186
+ try:
187
+ error_json = e.response.json()
188
+ error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
189
+ except requests.exceptions.JSONDecodeError:
190
+ error_detail += f" Response: {e.response.text[:500]}"
191
+ status_message = f"Submission Failed: {error_detail}"
192
+ print(status_message)
193
+ results_df = pd.DataFrame(results_log)
194
+ return status_message, results_df
195
+ except requests.exceptions.Timeout:
196
+ status_message = "Submission Failed: The request timed out."
197
+ print(status_message)
198
+ results_df = pd.DataFrame(results_log)
199
+ return status_message, results_df
200
+ except requests.exceptions.RequestException as e:
201
+ status_message = f"Submission Failed: Network error - {e}"
202
+ print(status_message)
203
+ results_df = pd.DataFrame(results_log)
204
+ return status_message, results_df
205
+ except Exception as e:
206
+ status_message = f"An unexpected error occurred during submission: {e}"
207
+ print(status_message)
208
+ results_df = pd.DataFrame(results_log)
209
+ return status_message, results_df
210
+
211
+
212
+ # --- Build Gradio Interface using Blocks ---
213
+ with gr.Blocks() as demo:
214
+ gr.Markdown("# GAIAgent Evaluation Runner")
215
+ gr.Markdown(
216
+ """
217
+ **Instructions:**
218
+
219
+ 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
220
+ 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
221
+ 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
222
+
223
+ ---
224
+ **Disclaimers:**
225
+ Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
226
+ This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
227
+ """
228
+ )
229
+
230
+ gr.LoginButton()
231
+
232
+ run_button = gr.Button("Run Evaluation & Submit All Answers")
233
+
234
+ status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
235
+ # Removed max_rows=10 from DataFrame constructor
236
+ results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
237
+
238
+ run_button.click(
239
+ fn=run_and_submit_all,
240
+ outputs=[status_output, results_table]
241
+ )
242
+
243
+ if __name__ == "__main__":
244
+ print("\n" + "-"*30 + " App Starting " + "-"*30)
245
+ # Check for SPACE_HOST and SPACE_ID at startup for information
246
+ space_host_startup = os.getenv("SPACE_HOST")
247
+ space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
248
+
249
+ if space_host_startup:
250
+ print(f"✅ SPACE_HOST found: {space_host_startup}")
251
+ print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
252
+ else:
253
+ print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
254
+
255
+ if space_id_startup: # Print repo URLs if SPACE_ID is found
256
+ print(f"✅ SPACE_ID found: {space_id_startup}")
257
+ print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
258
+ print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
259
+ else:
260
+ print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
261
+
262
+ print("-"*(60 + len(" App Starting ")) + "\n")
263
+
264
+ print("Launching Gradio Interface for GAIAgent Evaluation...")
265
+ demo.launch(debug=True, share=False)