Spaces:

atuanlausu
/

kaxabu-grammar-padudu

Sleeping

App Files Files Community

atuanlausu committed on Sep 23

Commit

ce5dd1b

verified ·

1 Parent(s): bba3c57

Create data_setup.py

Browse files

Files changed (1) hide show

data_setup.py +85 -0

data_setup.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import fitz
+import pandas as pd
+import faiss
+import pickle
+from sentence_transformers import SentenceTransformer
+import torch
+def setup_knowledge_base(pdf_file_part_1, pdf_file_part_2, model_name='paraphrase-multilingual-mpnet-base-v2'):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    def _extract_text_with_page_number(pdf_path):
+        doc = fitz.open(pdf_path)
+        content = []
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            text = page.get_text("text")
+            content.append({
+                'page_number': page_num + 1,
+                'text': text
+            })
+        return content
+    def _create_chunks(df):
+        chunks = []
+        current_chunk = ""
+        current_page_numbers = []
+        for index, row in df.iterrows():
+            page_number = row['page_number']
+            text = row['text']
+            paragraphs = text.split('\n\n')
+            for para in paragraphs:
+                if para.strip():
+                    is_new_section = any(header in para for header in [
+                        '第一章', '第二章', '第三章', '第四章', '第五章',
+                        '第六章', '第七章', '第八章', '第九章', '第十章',
+                        '第十一章', '第十二章', '第十三章', '第十四章', '第十五章', '第十六章'
+                    ])
+                    if is_new_section and current_chunk:
+                        chunks.append({
+                            'content': current_chunk.strip(),
+                            'page_numbers': sorted(list(set(current_page_numbers)))
+                        })
+                        current_chunk = para
+                        current_page_numbers = [page_number]
+                    else:
+                        current_chunk += "\n" + para
+                        if page_number not in current_page_numbers:
+                            current_page_numbers.append(page_number)
+        if current_chunk:
+            chunks.append({
+                'content': current_chunk.strip(),
+                'page_numbers': sorted(list(set(current_page_numbers)))
+            })
+        return chunks
+    print("--- 步驟一：提取PDF內容 ---")
+    book_content_part_1 = _extract_text_with_page_number(pdf_file_part_1)
+    book_content_part_2 = _extract_text_with_page_number(pdf_file_part_2)
+    all_book_content = book_content_part_1 + book_content_part_2
+    df = pd.DataFrame(all_book_content)
+    print("--- 步驟二：切分語塊 ---")
+    chunks = _create_chunks(df)
+    print(f"總共產生了 {len(chunks)} 個語塊。")
+    print("--- 步驟三：將語塊向量化並建立FAISS索引 ---")
+    retriever_model = SentenceTransformer(model_name, device=device)
+    texts = [chunk['content'] for chunk in chunks]
+    embeddings = retriever_model.encode(texts, convert_to_tensor=True).cpu().numpy()
+    d = embeddings.shape[1]
+    index = faiss.IndexFlatL2(d)
+    index.add(embeddings)
+    # 儲存FAISS索引和語塊資訊
+    faiss.write_index(index, 'faiss_index.bin')
+    with open('chunks.pkl', 'wb') as f:
+        pickle.dump(chunks, f)
+    print("知識庫建立成功！")
+    return 'faiss_index.bin', 'chunks.pkl'
+if __name__ == '__main__':
+    pdf_file_part_1 = '噶哈巫語參考語法（上）.pdf'
+    pdf_file_part_2 = '噶哈巫語參考語法（下）.pdf'
+    setup_knowledge_base(pdf_file_part_1, pdf_file_part_2)