atuanlausu committed
Commit ce5dd1b · verified · 1 Parent(s): bba3c57

Create data_setup.py

Files changed (1):
  data_setup.py +85 -0
data_setup.py ADDED
@@ -0,0 +1,85 @@
import fitz  # PyMuPDF
import pandas as pd
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import torch


def setup_knowledge_base(pdf_file_part_1, pdf_file_part_2, model_name='paraphrase-multilingual-mpnet-base-v2'):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def _extract_text_with_page_number(pdf_path):
        # Read every page of the PDF and keep its text together with a 1-based page number.
        doc = fitz.open(pdf_path)
        content = []
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text = page.get_text("text")
            content.append({
                'page_number': page_num + 1,
                'text': text
            })
        doc.close()
        return content

    def _create_chunks(df):
        # Merge paragraphs into chunks, starting a new chunk whenever a chapter heading appears.
        chunks = []
        current_chunk = ""
        current_page_numbers = []
        for index, row in df.iterrows():
            page_number = row['page_number']
            text = row['text']
            paragraphs = text.split('\n\n')
            for para in paragraphs:
                if para.strip():
                    is_new_section = any(header in para for header in [
                        '第一章', '第二章', '第三章', '第四章', '第五章',
                        '第六章', '第七章', '第八章', '第九章', '第十章',
                        '第十一章', '第十二章', '第十三章', '第十四章', '第十五章', '第十六章'
                    ])
                    if is_new_section and current_chunk:
                        chunks.append({
                            'content': current_chunk.strip(),
                            'page_numbers': sorted(list(set(current_page_numbers)))
                        })
                        current_chunk = para
                        current_page_numbers = [page_number]
                    else:
                        current_chunk += "\n" + para
                        if page_number not in current_page_numbers:
                            current_page_numbers.append(page_number)
        # Flush the final chunk after all pages have been processed.
        if current_chunk:
            chunks.append({
                'content': current_chunk.strip(),
                'page_numbers': sorted(list(set(current_page_numbers)))
            })
        return chunks

    print("--- Step 1: extract PDF content ---")
    book_content_part_1 = _extract_text_with_page_number(pdf_file_part_1)
    book_content_part_2 = _extract_text_with_page_number(pdf_file_part_2)
    all_book_content = book_content_part_1 + book_content_part_2
    df = pd.DataFrame(all_book_content)

    print("--- Step 2: split the text into chunks ---")
    chunks = _create_chunks(df)
    print(f"Produced {len(chunks)} chunks in total.")

    print("--- Step 3: embed the chunks and build the FAISS index ---")
    retriever_model = SentenceTransformer(model_name, device=device)
    texts = [chunk['content'] for chunk in chunks]
    embeddings = retriever_model.encode(texts, convert_to_tensor=True).cpu().numpy()
    d = embeddings.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(embeddings)

    # Save the FAISS index and the chunk metadata.
    faiss.write_index(index, 'faiss_index.bin')
    with open('chunks.pkl', 'wb') as f:
        pickle.dump(chunks, f)

    print("Knowledge base built successfully!")
    return 'faiss_index.bin', 'chunks.pkl'


if __name__ == '__main__':
    pdf_file_part_1 = '噶哈巫語參考語法(上).pdf'
    pdf_file_part_2 = '噶哈巫語參考語法(下).pdf'
    setup_knowledge_base(pdf_file_part_1, pdf_file_part_2)
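
The script only builds and saves the knowledge base; it does not show how the saved artifacts are later queried. Below is a minimal retrieval sketch, assuming the same embedding model and the file names written above ('faiss_index.bin', 'chunks.pkl'); the query string and the top-k value of 3 are illustrative placeholders, not part of the original file.

import faiss
import pickle
from sentence_transformers import SentenceTransformer

# Load the artifacts produced by setup_knowledge_base().
index = faiss.read_index('faiss_index.bin')
with open('chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

# The query must be embedded with the same model used to build the index.
retriever_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

query = '噶哈巫語的代名詞'  # hypothetical example query
query_embedding = retriever_model.encode([query], convert_to_tensor=True).cpu().numpy()

# Search the L2 index for the 3 nearest chunks and print them with their page numbers.
distances, indices = index.search(query_embedding, 3)
for rank, idx in enumerate(indices[0], start=1):
    chunk = chunks[idx]
    print(f"#{rank} (pages {chunk['page_numbers']}): {chunk['content'][:100]}...")

Because the index is an IndexFlatL2, the search is an exact nearest-neighbour scan over all chunk embeddings, which is adequate for a corpus of this size.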