bebechien committed (verified)
Commit 3890a8c · Parent: 50779fe

Upload folder using huggingface_hub

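The commit message indicates the files were pushed with `huggingface_hub`'s folder upload. A minimal sketch of how such a commit is typically produced (the repo id and repo type here are assumptions, not shown on this page):

```python
from huggingface_hub import HfApi

api = HfApi()  # uses the token from a prior `login()` or the HF_TOKEN env var
api.upload_folder(
    folder_path=".",                # local folder to push
    repo_id="user/space-name",      # hypothetical repo id
    repo_type="space",              # assumed: app.py below is a Gradio Space
    commit_message="Upload folder using huggingface_hub",
)
```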
.gitattributes CHANGED
@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/background.jpg filter=lfs diff=lfs merge=lfs -text
-transformers-4.57.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,6 +3,7 @@ import requests
 import os
 import pickle
 import spaces
+import torch
 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 from huggingface_hub import login
@@ -22,13 +23,43 @@ LLM_MODEL_ID = "google/gemma-3-12B-it"
 # Data Source Configuration
 BASE_URL = "https://hollowknight.wiki"
 
-# Hollow Knight Boss Data
-ENTRY_POINT_HOLLOW_KNIGHT = "/w/Category:Bosses_(Hollow_Knight)"
-CACHE_FILE_HOLLOW_KNIGHT = "hollow_knight_boss.pkl"
-
-# Silksong Boss Data
-ENTRY_POINT_SILKSONG = "/w/Category:Bosses_(Silksong)"
-CACHE_FILE_SILKSONG = "silksong_boss.pkl"
+GAME_KNOWLEDGE_DATA = [
+    {
+        "title": "Hollow Knight",
+        "category_list": [
+            {
+                "entry": "/w/Category:Bosses_(Hollow_Knight)",
+                "cache": "hollow_knight_boss.pkl",
+                "label": "Bosses",
+            },
+        ],
+    },
+    {
+        "title": "Silksong",
+        "category_list": [
+            {
+                "entry": "/w/Category:Areas_(Silksong)",
+                "cache": "silksong_areas.pkl",
+                "label": "Areas",
+            },
+            {
+                "entry": "/w/Category:Bosses_(Silksong)",
+                "cache": "silksong_bosses.pkl",
+                "label": "Bosses",
+            },
+            {
+                "entry": "/w/Category:Tools_and_Skills_(Silksong)",
+                "cache": "silksong_tools_and_skills.pkl",
+                "label": "Tools and Skills",
+            },
+            {
+                "entry": "/w/Category:NPCs_(Silksong)",
+                "cache": "silksong_npcs.pkl",
+                "label": "NPCs",
+            }
+        ],
+    },
+]
 
 # Gradio App Configuration
 DEFAULT_SIMILARITY_THRESHOLD = 0.5
@@ -38,7 +69,7 @@ DEFAULT_MESSAGE_NO_MATCH = "I'm sorry, I can't find a relevant document to answe
 # --- 2. HELPER FUNCTIONS ---
 # Reusable functions for web scraping and data processing.
 
-def get_html(url: str) -> str:
+def _get_html(url: str) -> str:
     """Fetches HTML content from a URL."""
     try:
         response = requests.get(url)
@@ -48,7 +79,7 @@ def get_html(url: str) -> str:
         print(f"Error fetching {url}: {e}")
         return ""
 
-def find_wiki_links(html_content: str) -> list[str]:
+def _find_wiki_links(html_content: str) -> list[str]:
     """Parses HTML to find all boss links within the 'mw-pages' div."""
     soup = BeautifulSoup(html_content, 'html.parser')
     mw_pages_div = soup.find('div', id='mw-pages')
@@ -56,19 +87,42 @@ def find_wiki_links(html_content: str) -> list[str]:
         return []
     return [a['href'] for a in mw_pages_div.find_all('a', href=True)]
 
-def get_markdown_from_url(url: str) -> str:
-    """Fetches and converts a webpage's content to Markdown."""
-    html = get_html(url)
+def _get_markdown_from_html(html: str) -> str:
     if not html:
         return ""
+
     soup = BeautifulSoup(html, 'html.parser')
-    # Assuming convert_to_markdown correctly processes the soup object
     return convert_to_markdown(soup)
 
+def _get_markdown_from_url(url: str) -> str:
+    return _get_markdown_from_html(_get_html(url))
+
 
 # --- 3. DATA PROCESSING & CACHING ---
 # Scrapes data and generates embeddings, using a cache to avoid re-running.
 
+def _clean_text(text: str) -> str:
+    """Removes the references section from the raw text."""
+    return text.split("References\n----------\n", 1)[0].strip()
+
+def _create_data_entry(text: str, doc_path: str, label: str, embedding_model) -> dict | None:
+    """Creates a single structured data entry with text, metadata, and embedding."""
+    cleaned_text = _clean_text(text)
+    if not cleaned_text:
+        return None
+
+    title = doc_path.split('/')[-1]
+    embedding = embedding_model.encode(cleaned_text, prompt=f"title: {title} | text: ")
+    return {
+        "text": cleaned_text,
+        "embedding": embedding,
+        "metadata": {
+            "category": label,
+            "source": BASE_URL + doc_path,
+            "title": title
+        }
+    }
+
 def load_or_process_source(entry_point: str, cache_file: str, label: str, embedding_model):
     """
     Loads processed data from a cache file if it exists. Otherwise, scrapes,
@@ -80,30 +134,28 @@ def load_or_process_source(entry_point: str, cache_file: str, label: str, embedd
             return pickle.load(f)
 
     print(f"ℹ️ No cache for {label}. Starting data scraping and processing...")
-    main_page_html = get_html(BASE_URL + entry_point)
-    extracted_links = find_wiki_links(main_page_html)
+    processed_data = []
 
-    contents = {"titles": [], "texts": [], "embeddings": []}
+    main_page_html = _get_html(BASE_URL + entry_point)
+    data_entry = _create_data_entry(_get_markdown_from_html(main_page_html), entry_point, label, embedding_model)
+    if data_entry:
+        processed_data.append(data_entry)
+
+    extracted_links = _find_wiki_links(main_page_html)
 
     for doc_path in tqdm(extracted_links, desc=f"Processing {label} Pages"):
         full_url = BASE_URL + doc_path
-        original_text = get_markdown_from_url(full_url)
-
-        # Trim text from the "References" section onwards for cleaner context
-        text = original_text.split("References\n----------\n", 1)[0].strip()
+        text = _get_markdown_from_url(full_url)
 
-        if text:
-            contents["titles"].append(doc_path.split('/')[-1])
-            contents["texts"].append(text)
-            # Generate and add embedding
-            embedding = embedding_model.encode(text, prompt=f"title: {doc_path.split('/')[-1]} | text: ")
-            contents["embeddings"].append(embedding)
+        data_entry = _create_data_entry(text, doc_path, label, embedding_model)
+        if data_entry:
+            processed_data.append(data_entry)
 
-    print(f"✅ {label} processing complete. Saving data to '{cache_file}'...")
+    print(f"✅ {label} processing complete. Saving {len(processed_data)} entries to '{cache_file}'...")
     with open(cache_file, 'wb') as f:
-        pickle.dump(contents, f)
+        pickle.dump(processed_data, f)
 
-    return contents
+    return processed_data
 
 
 # --- 4. CORE AI LOGIC ---
@@ -111,27 +163,34 @@ def load_or_process_source(entry_point: str, cache_file: str, label: str, embedd
 
 def find_best_context(model, query: str, contents: dict, similarity_threshold: float):
     """Finds the most relevant document text based on semantic similarity."""
-    if not query or not contents["embeddings"]:
+    if not query:
         return None
 
     query_embedding = model.encode(query, prompt_name="query")
-    similarities = model.similarity(query_embedding, contents["embeddings"])
+    contents_embeddings = torch.stack([torch.tensor(item["embedding"]) for item in contents])
+    similarities = model.similarity(query_embedding, contents_embeddings)
 
     best_index = similarities.argmax().item()
     best_score = similarities[0, best_index].item()
 
     print(best_score)
     if best_score >= similarity_threshold:
-        return contents["texts"][best_index]
+        print(f"Using \"{contents[best_index]['metadata']['source']}\"...")
+        return contents[best_index]["text"]
     return None
 
 context = None
 
 @spaces.GPU
-def respond(message: str, history: list, similarity_threshold: float):
+def respond(message: str, history: list, game: str, similarity_threshold: float):
     """Generates a streaming response from the LLM based on the best context found."""
     global context
-    if (context := find_best_context(embedding_model, message, combined_contents, similarity_threshold) or context):
+    contents = _select_content(game)
+    if not contents:
+        yield DEFAULT_MESSAGE_NO_MATCH
+        return
+
+    if (context := find_best_context(embedding_model, message, contents, similarity_threshold) or context):
        # SUCCESS: A valid context was found and has been saved.
        pass
     else:
@@ -146,7 +205,7 @@ def respond(message: str, history: list, similarity_threshold: float):
     messages.extend(history)
     messages.append({"role": "user", "content": user_prompt})
 
-    for item in messages:
+    for item in messages[1:]:
         print(item['role'])
         print(item['content'])
 
@@ -188,20 +247,15 @@ llm_pipeline = pipeline(
 )
 
 print("\n--- Processing Game Data ---")
-hk_contents = load_or_process_source(
-    ENTRY_POINT_HOLLOW_KNIGHT, CACHE_FILE_HOLLOW_KNIGHT, "Hollow Knight", embedding_model
-)
-silksong_contents = load_or_process_source(
-    ENTRY_POINT_SILKSONG, CACHE_FILE_SILKSONG, "Silksong", embedding_model
-)
+knowledge_base = {}
 
-print("\nCombining data sources...")
-combined_contents = {
-    "titles": hk_contents["titles"] + silksong_contents["titles"],
-    "texts": hk_contents["texts"] + silksong_contents["texts"],
-    "embeddings": hk_contents["embeddings"] + silksong_contents["embeddings"],
-}
-print(f"✅ Total documents processed: {len(combined_contents['texts'])}")
+for item in GAME_KNOWLEDGE_DATA:
+    knowledge_base[item['title']] = []
+    for category in item['category_list']:
+        knowledge_base[item['title']] += load_or_process_source(category['entry'], category['cache'], category['label'], embedding_model)
+
+def _select_content(game: str):
+    return knowledge_base[game]
 
 
 # --- 6. GRADIO UI ---
@@ -239,7 +293,7 @@ with gr.Blocks(theme=silksong_theme, css=silksong_css) as demo:
         <div class="header-text">
             <h1>A Weaver's Counsel</h1>
             <p>Speak, little traveler. What secrets of Pharloom do you seek?</p>
-            <p style="font-style: italic;">(Note: This bot currently only has knowledge about bosses)</p>
+            <p style="font-style: italic;">(Note: This bot has limited knowledge.)</p>
         </div>
         """)
 
@@ -249,13 +303,14 @@ with gr.Blocks(theme=silksong_theme, css=silksong_css) as demo:
         chatbot=gr.Chatbot(type="messages", label=LLM_MODEL_ID),
         textbox=gr.Textbox(placeholder="Ask about the haunted kingdom...", container=False, submit_btn=True, scale=7),
         additional_inputs=[
+            gr.Dropdown(["Hollow Knight", "Silksong"], label="Game"),
            gr.Slider(minimum=0.1, maximum=1.0, value=DEFAULT_SIMILARITY_THRESHOLD, step=0.1, label="Similarity Threshold"),
         ],
         examples=[
-            ["Where can I find the Moorwing?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["Who is the voice of Lace?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["How can I beat the False Knight?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["Any achievement for Hornet Protector?", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Where can I find the Moorwing?", "Silksong", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Who is the voice of Lace?", "Silksong", DEFAULT_SIMILARITY_THRESHOLD],
+            ["How can I beat the False Knight?", "Hollow Knight", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Any achievement for Hornet Protector?", "Hollow Knight", DEFAULT_SIMILARITY_THRESHOLD],
         ],
     )
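Per the new `_create_data_entry` helper in the diff above, each cache file below now holds a list of entry dicts rather than the old column-oriented dict. A minimal sketch for inspecting one cache locally (assuming the file has already been fetched from LFS):

```python
import pickle

with open("silksong_bosses.pkl", "rb") as f:
    entries = pickle.load(f)

# Each entry follows the shape built by _create_data_entry:
# {"text": ..., "embedding": ..., "metadata": {"category", "source", "title"}}
for entry in entries[:3]:
    meta = entry["metadata"]
    print(meta["category"], meta["title"], meta["source"])
```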
hollow_knight_boss.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:608417424fb5f9670689cb318868bc19ddba6a524fa6df8f3d43c47393e65a13
-size 976095
+oid sha256:68d0acf0286e68a4183da2b78e638e6792f1eba19f84cda331512f6301c50039
+size 995501

silksong_areas.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:147a37b653b5c5016ab6067a7aaccb68e0e2e5ca0ef1c0bd4b1591f59f56b3ec
+size 84007

silksong_bosses.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd7e5a122cc5075fa8d45f73a5fe2dc3f893bf1934da82a5bf30044fe69b72b2
+size 73446

silksong_npcs.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dde39276b5c2f84d074d7c0c60b3ac076dd31ee6ed81fb9e023dca91a3d8576
+size 115874

silksong_tools_and_skills.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b953f6e1be940f561c4b6234ae22d22702e6b054fe394f8e984869b3eecb023a
+size 17485
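The `.pkl` entries above are Git LFS pointers (version, oid, size), so a plain clone without LFS support yields only the pointer text. A minimal sketch of fetching one resolved file through `huggingface_hub` (the repo id is a hypothetical placeholder, not shown on this page):

```python
from huggingface_hub import hf_hub_download

# hf_hub_download resolves the LFS pointer and returns a local file path.
local_path = hf_hub_download(
    repo_id="user/space-name",   # hypothetical repo id
    repo_type="space",
    filename="silksong_areas.pkl",
)
print(local_path)
```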