Upload prediction_multilabel.py

#2
Files changed (1) hide show
  1. prediction_multilabel.py +66 -0
prediction_multilabel.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pickle
3
+ import numpy as np
4
+ import pandas as pd
5
+ import torch
6
+ import torch.nn as nn
7
+ from sentence_transformers import util
8
+
9
# Seed torch's RNG so repeated runs are reproducible
torch.manual_seed(1)

# Source tables: df_inmemory is later joined onto the search hits,
# df_paragraph is later joined with the predicted labels.
df_inmemory = pd.read_csv('df_360k_41lables_05012023.csv')
df_paragraph = pd.read_csv('german_plc_all_paragraphs_unnested_only.csv')

# Pre-computed embeddings, stored as pickled dicts.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('embeddings_paragraphs_07012023.pkl', "rb") as paragraph_pickle:
    stored_data = pickle.load(paragraph_pickle)
pred_embeddings = stored_data['parg_embeddings']  # used as the search queries

with open('embeddings_sentences_360k_09012023.pkl', "rb") as sentence_pickle:
    stored_data = pickle.load(sentence_pickle)
embeddings = stored_data['sent_embeddings']  # used as the search corpus
24
+
25
# Similarity lookup of one query against a corpus of embeddings
def get_top_n_similar_patents(new_claim, claim_embeddings, top_n=20):
    """Return the best ``top_n`` corpus matches for ``new_claim``.

    Args:
        new_claim: Query embedding(s) handed unchanged to
            ``util.semantic_search``. Only the hits of the first query
            are used.
        claim_embeddings: Corpus embeddings to search in.
        top_n: Number of hits requested (forwarded as ``top_k``).

    Returns:
        A DataFrame with one row per hit and two columns:
        ``top_claim_ids`` (the hit's ``corpus_id``) and
        ``cosine_similarity`` (the hit's ``score``).
    """
    first_query_hits = util.semantic_search(
        new_claim, claim_embeddings, top_k=top_n
    )[0]

    return pd.DataFrame({
        'top_claim_ids': [hit['corpus_id'] for hit in first_query_hits],
        'cosine_similarity': [hit['score'] for hit in first_query_hits],
    })
32
+
33
# 1. Perform cosine similarity search
# Only the first 50,000 stored paragraph embeddings are used as queries.
test_embeddings = pred_embeddings[:50000]

# The timer starts here, so it does not include loading the data above.
start = time.time()

# One DataFrame per query: its search hits left-joined with the matching
# df_inmemory rows (hit corpus id == df_inmemory['index']).
all_predictions = [
    get_top_n_similar_patents(query_embedding.reshape(1, -1), embeddings).merge(
        df_inmemory, how='left', left_on='top_claim_ids', right_on='index'
    )
    for query_embedding in test_embeddings
]

# Stack the per-query frames; the outer index level is the query position.
df_all_predictions = pd.concat(
    all_predictions, keys=range(len(all_predictions)), axis=0
)
44
+
45
# 2. Apply K-Nearest Neighbor (KNN) algorithm
# For every query, keep the top_n most similar hits and add up their label
# columns, giving one row of per-label totals per query.
top_n = 12

# Label columns start at position 6 of df_inmemory. In each merged hit frame
# they start at position 8, because the two search columns
# (top_claim_ids, cosine_similarity) come first.
label_columns = df_inmemory.columns[6:]

label_sums = []
for neighbours in all_predictions:
    k_similar_patents = neighbours.nlargest(top_n, ['cosine_similarity'])
    # Sum over however many rows are present, so a query with fewer than
    # top_n hits no longer raises IndexError. Converting to float without
    # skipping NaN keeps missing join matches visible as NaN totals.
    label_sums.append(
        k_similar_patents.iloc[:, 8:].to_numpy(dtype=float).sum(axis=0)
    )

# Build the frame once; concatenating inside the loop re-copied every
# previous row on each iteration (quadratic in the number of queries).
predict = pd.DataFrame(label_sums, columns=label_columns)
54
+
55
# 3. Apply Sigmoid activation function
# sigmoid(x) > 0.90 is equivalent to x > ln(9) ~= 2.197, so a label is set
# when its neighbour total is above that value.
# NOTE(review): presumably the label columns are 0/1, which would make this
# "at least 3 of the 12 neighbours carry the label" - confirm.
vote_matrix = predict.to_numpy().astype(float)
data_tensor = torch.tensor(vote_matrix, dtype=torch.float32)

sigmoid = nn.Sigmoid()
output = sigmoid(data_tensor).gt(0.90).float()
60
+
61
# Save results
output_df = pd.DataFrame(output.numpy(), columns=predict.columns)

# Take exactly as many paragraph rows as there are prediction rows. The
# previous hard-coded 50000 produced NaN-padded rows in the column-wise
# concat whenever fewer predictions than that were available.
# reset_index() keeps the original paragraph index as an 'index' column and
# gives both frames a matching 0..n-1 index for the concat.
paragraph_rows = df_paragraph.iloc[:len(output_df), :].reset_index()
df_results = pd.concat([paragraph_rows, output_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)

# `start` was taken just before the similarity search began.
print(f"Processing completed in {time.time() - start:.2f} seconds.")