bebechien committed (verified)
Commit 3890a8c · Parent: 50779fe

Upload folder using huggingface_hub

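The commit message indicates the files were pushed with `huggingface_hub`'s folder upload. A minimal sketch of how such a commit is typically produced (the repo id and repo type here are assumptions, not shown on this page):

```python
from huggingface_hub import HfApi

api = HfApi()  # uses the token from a prior `login()` or the HF_TOKEN env var
api.upload_folder(
    folder_path=".",                # local folder to push
    repo_id="user/space-name",      # hypothetical repo id
    repo_type="space",              # assumed: app.py below is a Gradio Space
    commit_message="Upload folder using huggingface_hub",
)
```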
.gitattributes CHANGED
@@ -34,4 +34,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 assets/background.jpg filter=lfs diff=lfs merge=lfs -text
-transformers-4.57.0.dev0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -3,6 +3,7 @@ import requests
 import os
 import pickle
 import spaces
+import torch
 from bs4 import BeautifulSoup
 from html_to_markdown import convert_to_markdown
 from huggingface_hub import login
@@ -22,13 +23,43 @@ LLM_MODEL_ID = "google/gemma-3-12B-it"
 # Data Source Configuration
 BASE_URL = "https://hollowknight.wiki"
 
-# Hollow Knight Boss Data
-ENTRY_POINT_HOLLOW_KNIGHT = "/w/Category:Bosses_(Hollow_Knight)"
-CACHE_FILE_HOLLOW_KNIGHT = "hollow_knight_boss.pkl"
-
-# Silksong Boss Data
-ENTRY_POINT_SILKSONG = "/w/Category:Bosses_(Silksong)"
-CACHE_FILE_SILKSONG = "silksong_boss.pkl"
+GAME_KNOWLEDGE_DATA = [
+    {
+        "title": "Hollow Knight",
+        "category_list": [
+            {
+                "entry": "/w/Category:Bosses_(Hollow_Knight)",
+                "cache": "hollow_knight_boss.pkl",
+                "label": "Bosses",
+            },
+        ],
+    },
+    {
+        "title": "Silksong",
+        "category_list": [
+            {
+                "entry": "/w/Category:Areas_(Silksong)",
+                "cache": "silksong_areas.pkl",
+                "label": "Areas",
+            },
+            {
+                "entry": "/w/Category:Bosses_(Silksong)",
+                "cache": "silksong_bosses.pkl",
+                "label": "Bosses",
+            },
+            {
+                "entry": "/w/Category:Tools_and_Skills_(Silksong)",
+                "cache": "silksong_tools_and_skills.pkl",
+                "label": "Tools and Skills",
+            },
+            {
+                "entry": "/w/Category:NPCs_(Silksong)",
+                "cache": "silksong_npcs.pkl",
+                "label": "NPCs",
+            }
+        ],
+    },
+]
 
 # Gradio App Configuration
 DEFAULT_SIMILARITY_THRESHOLD = 0.5
@@ -38,7 +69,7 @@ DEFAULT_MESSAGE_NO_MATCH = "I'm sorry, I can't find a relevant document to answe
 # --- 2. HELPER FUNCTIONS ---
 # Reusable functions for web scraping and data processing.
 
-def get_html(url: str) -> str:
+def _get_html(url: str) -> str:
     """Fetches HTML content from a URL."""
     try:
         response = requests.get(url)
@@ -48,7 +79,7 @@ def get_html(url: str) -> str:
         print(f"Error fetching {url}: {e}")
         return ""
 
-def find_wiki_links(html_content: str) -> list[str]:
+def _find_wiki_links(html_content: str) -> list[str]:
     """Parses HTML to find all boss links within the 'mw-pages' div."""
     soup = BeautifulSoup(html_content, 'html.parser')
     mw_pages_div = soup.find('div', id='mw-pages')
@@ -56,19 +87,42 @@ def find_wiki_links(html_content: str) -> list[str]:
         return []
     return [a['href'] for a in mw_pages_div.find_all('a', href=True)]
 
-def get_markdown_from_url(url: str) -> str:
-    """Fetches and converts a webpage's content to Markdown."""
-    html = get_html(url)
+def _get_markdown_from_html(html: str) -> str:
     if not html:
         return ""
+
     soup = BeautifulSoup(html, 'html.parser')
-    # Assuming convert_to_markdown correctly processes the soup object
     return convert_to_markdown(soup)
 
+def _get_markdown_from_url(url: str) -> str:
+    return _get_markdown_from_html(_get_html(url))
+
 
 # --- 3. DATA PROCESSING & CACHING ---
 # Scrapes data and generates embeddings, using a cache to avoid re-running.
 
+def _clean_text(text: str) -> str:
+    """Removes the references section from the raw text."""
+    return text.split("References\n----------\n", 1)[0].strip()
+
+def _create_data_entry(text: str, doc_path: str, label: str, embedding_model) -> dict | None:
+    """Creates a single structured data entry with text, metadata, and embedding."""
+    cleaned_text = _clean_text(text)
+    if not cleaned_text:
+        return None
+
+    title = doc_path.split('/')[-1]
+    embedding = embedding_model.encode(cleaned_text, prompt=f"title: {title} | text: ")
+    return {
+        "text": cleaned_text,
+        "embedding": embedding,
+        "metadata": {
+            "category": label,
+            "source": BASE_URL + doc_path,
+            "title": title
+        }
+    }
+
 def load_or_process_source(entry_point: str, cache_file: str, label: str, embedding_model):
     """
     Loads processed data from a cache file if it exists. Otherwise, scrapes,
@@ -80,30 +134,28 @@ def load_or_process_source(entry_point: str, cache_file: str, label: str, embedd
             return pickle.load(f)
 
     print(f"ℹ️ No cache for {label}. Starting data scraping and processing...")
-    main_page_html = get_html(BASE_URL + entry_point)
-    extracted_links = find_wiki_links(main_page_html)
+    processed_data = []
 
-    contents = {"titles": [], "texts": [], "embeddings": []}
+    main_page_html = _get_html(BASE_URL + entry_point)
+    data_entry = _create_data_entry(_get_markdown_from_html(main_page_html), entry_point, label, embedding_model)
+    if data_entry:
+        processed_data.append(data_entry)
+
+    extracted_links = _find_wiki_links(main_page_html)
 
     for doc_path in tqdm(extracted_links, desc=f"Processing {label} Pages"):
         full_url = BASE_URL + doc_path
-        original_text = get_markdown_from_url(full_url)
-
-        # Trim text from the "References" section onwards for cleaner context
-        text = original_text.split("References\n----------\n", 1)[0].strip()
+        text = _get_markdown_from_url(full_url)
 
-        if text:
-            contents["titles"].append(doc_path.split('/')[-1])
-            contents["texts"].append(text)
-            # Generate and add embedding
-            embedding = embedding_model.encode(text, prompt=f"title: {doc_path.split('/')[-1]} | text: ")
-            contents["embeddings"].append(embedding)
+        data_entry = _create_data_entry(text, doc_path, label, embedding_model)
+        if data_entry:
+            processed_data.append(data_entry)
 
-    print(f"✅ {label} processing complete. Saving data to '{cache_file}'...")
+    print(f"✅ {label} processing complete. Saving {len(processed_data)} entries to '{cache_file}'...")
     with open(cache_file, 'wb') as f:
-        pickle.dump(contents, f)
+        pickle.dump(processed_data, f)
 
-    return contents
+    return processed_data
 
 
 # --- 4. CORE AI LOGIC ---
@@ -111,27 +163,34 @@ def load_or_process_source(entry_point: str, cache_file: str, label: str, embedd
 
 def find_best_context(model, query: str, contents: dict, similarity_threshold: float):
     """Finds the most relevant document text based on semantic similarity."""
-    if not query or not contents["embeddings"]:
+    if not query:
         return None
 
     query_embedding = model.encode(query, prompt_name="query")
-    similarities = model.similarity(query_embedding, contents["embeddings"])
+    contents_embeddings = torch.stack([torch.tensor(item["embedding"]) for item in contents])
+    similarities = model.similarity(query_embedding, contents_embeddings)
 
     best_index = similarities.argmax().item()
     best_score = similarities[0, best_index].item()
 
     print(best_score)
     if best_score >= similarity_threshold:
-        return contents["texts"][best_index]
+        print(f"Using \"{contents[best_index]['metadata']['source']}\"...")
+        return contents[best_index]["text"]
     return None
 
 context = None
 
 @spaces.GPU
-def respond(message: str, history: list, similarity_threshold: float):
+def respond(message: str, history: list, game: str, similarity_threshold: float):
     """Generates a streaming response from the LLM based on the best context found."""
     global context
-    if (context := find_best_context(embedding_model, message, combined_contents, similarity_threshold) or context):
+    contents = _select_content(game)
+    if not contents:
+        yield DEFAULT_MESSAGE_NO_MATCH
+        return
+
+    if (context := find_best_context(embedding_model, message, contents, similarity_threshold) or context):
        # SUCCESS: A valid context was found and has been saved.
        pass
     else:
@@ -146,7 +205,7 @@ def respond(message: str, history: list, similarity_threshold: float):
     messages.extend(history)
     messages.append({"role": "user", "content": user_prompt})
 
-    for item in messages:
+    for item in messages[1:]:
         print(item['role'])
         print(item['content'])
 
@@ -188,20 +247,15 @@ llm_pipeline = pipeline(
 )
 
 print("\n--- Processing Game Data ---")
-hk_contents = load_or_process_source(
-    ENTRY_POINT_HOLLOW_KNIGHT, CACHE_FILE_HOLLOW_KNIGHT, "Hollow Knight", embedding_model
-)
-silksong_contents = load_or_process_source(
-    ENTRY_POINT_SILKSONG, CACHE_FILE_SILKSONG, "Silksong", embedding_model
-)
+knowledge_base = {}
 
-print("\nCombining data sources...")
-combined_contents = {
-    "titles": hk_contents["titles"] + silksong_contents["titles"],
-    "texts": hk_contents["texts"] + silksong_contents["texts"],
-    "embeddings": hk_contents["embeddings"] + silksong_contents["embeddings"],
-}
-print(f"✅ Total documents processed: {len(combined_contents['texts'])}")
+for item in GAME_KNOWLEDGE_DATA:
+    knowledge_base[item['title']] = []
+    for category in item['category_list']:
+        knowledge_base[item['title']] += load_or_process_source(category['entry'], category['cache'], category['label'], embedding_model)
+
+def _select_content(game: str):
+    return knowledge_base[game]
 
 
 # --- 6. GRADIO UI ---
@@ -239,7 +293,7 @@ with gr.Blocks(theme=silksong_theme, css=silksong_css) as demo:
         <div class="header-text">
             <h1>A Weaver's Counsel</h1>
             <p>Speak, little traveler. What secrets of Pharloom do you seek?</p>
-            <p style="font-style: italic;">(Note: This bot currently only has knowledge about bosses)</p>
+            <p style="font-style: italic;">(Note: This bot has limited knowledge.)</p>
         </div>
         """)
 
@@ -249,13 +303,14 @@ with gr.Blocks(theme=silksong_theme, css=silksong_css) as demo:
         chatbot=gr.Chatbot(type="messages", label=LLM_MODEL_ID),
         textbox=gr.Textbox(placeholder="Ask about the haunted kingdom...", container=False, submit_btn=True, scale=7),
         additional_inputs=[
+            gr.Dropdown(["Hollow Knight", "Silksong"], label="Game"),
            gr.Slider(minimum=0.1, maximum=1.0, value=DEFAULT_SIMILARITY_THRESHOLD, step=0.1, label="Similarity Threshold"),
         ],
         examples=[
-            ["Where can I find the Moorwing?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["Who is the voice of Lace?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["How can I beat the False Knight?", DEFAULT_SIMILARITY_THRESHOLD],
-            ["Any achievement for Hornet Protector?", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Where can I find the Moorwing?", "Silksong", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Who is the voice of Lace?", "Silksong", DEFAULT_SIMILARITY_THRESHOLD],
+            ["How can I beat the False Knight?", "Hollow Knight", DEFAULT_SIMILARITY_THRESHOLD],
+            ["Any achievement for Hornet Protector?", "Hollow Knight", DEFAULT_SIMILARITY_THRESHOLD],
         ],
     )
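Per the new `_create_data_entry` helper in the diff above, each cache file below now holds a list of entry dicts rather than the old column-oriented dict. A minimal sketch for inspecting one cache locally (assuming the file has already been fetched from LFS):

```python
import pickle

with open("silksong_bosses.pkl", "rb") as f:
    entries = pickle.load(f)

# Each entry follows the shape built by _create_data_entry:
# {"text": ..., "embedding": ..., "metadata": {"category", "source", "title"}}
for entry in entries[:3]:
    meta = entry["metadata"]
    print(meta["category"], meta["title"], meta["source"])
```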
hollow_knight_boss.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:608417424fb5f9670689cb318868bc19ddba6a524fa6df8f3d43c47393e65a13
-size 976095
+oid sha256:68d0acf0286e68a4183da2b78e638e6792f1eba19f84cda331512f6301c50039
+size 995501

silksong_areas.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:147a37b653b5c5016ab6067a7aaccb68e0e2e5ca0ef1c0bd4b1591f59f56b3ec
+size 84007

silksong_bosses.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd7e5a122cc5075fa8d45f73a5fe2dc3f893bf1934da82a5bf30044fe69b72b2
+size 73446

silksong_npcs.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1dde39276b5c2f84d074d7c0c60b3ac076dd31ee6ed81fb9e023dca91a3d8576
+size 115874

silksong_tools_and_skills.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b953f6e1be940f561c4b6234ae22d22702e6b054fe394f8e984869b3eecb023a
+size 17485
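The `.pkl` entries above are Git LFS pointers (version, oid, size), so a plain clone without LFS support yields only the pointer text. A minimal sketch of fetching one resolved file through `huggingface_hub` (the repo id is a hypothetical placeholder, not shown on this page):

```python
from huggingface_hub import hf_hub_download

# hf_hub_download resolves the LFS pointer and returns a local file path.
local_path = hf_hub_download(
    repo_id="user/space-name",   # hypothetical repo id
    repo_type="space",
    filename="silksong_areas.pkl",
)
print(local_path)
```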