Upload prediction_multilabel.py #2
by HamidBekam - opened

prediction_multilabel.py +66 -0

prediction_multilabel.py ADDED
@@ -0,0 +1,66 @@
import time
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import util

# Set random seed for reproducibility
torch.manual_seed(1)

# Load datasets
df_inmemory = pd.read_csv('df_360k_41lables_05012023.csv')
df_paragraph = pd.read_csv('german_plc_all_paragraphs_unnested_only.csv')

# Load stored embeddings
with open('embeddings_paragraphs_07012023.pkl', "rb") as f:
    stored_data = pickle.load(f)
pred_embeddings = stored_data['parg_embeddings']

with open('embeddings_sentences_360k_09012023.pkl', "rb") as f:
    stored_data = pickle.load(f)
embeddings = stored_data['sent_embeddings']

# Define function for cosine similarity search
def get_top_n_similar_patents(new_claim, claim_embeddings, top_n=20):
    search_hits = util.semantic_search(new_claim, claim_embeddings, top_k=top_n)
    top_claim_ids = [hit['corpus_id'] for hit in search_hits[0]]
    similarity_scores = [hit['score'] for hit in search_hits[0]]

    return pd.DataFrame({'top_claim_ids': top_claim_ids, 'cosine_similarity': similarity_scores})

# 1. Perform cosine similarity search
test_embeddings = pred_embeddings[:50000]
all_predictions = []

start = time.time()
for i, test_embedding in enumerate(test_embeddings):
    result_df = get_top_n_similar_patents(test_embedding.reshape(1, -1), embeddings)
    # 'index' is expected to be a column of df_inmemory holding the corpus row id
    result = pd.merge(result_df, df_inmemory, left_on='top_claim_ids', right_on='index', how='left')
    all_predictions.append(result)

df_all_predictions = pd.concat(all_predictions, keys=range(len(all_predictions)), axis=0)

# 2. Apply K-Nearest Neighbor (KNN) algorithm
top_n = 12
predict = pd.DataFrame(columns=df_inmemory.columns[6:])
for item in range(len(all_predictions)):
    k_similar_patents = df_all_predictions.xs(item).nlargest(top_n, ['cosine_similarity'])
    result_knn = pd.DataFrame(0, index=[0], columns=k_similar_patents.columns[8:])
    # Sum the label columns of the top_n neighbours (one vote count per label)
    for i in range(top_n):
        result_knn += k_similar_patents.iloc[i, 8:].values
    predict = pd.concat([predict, result_knn], ignore_index=True)

# 3. Apply Sigmoid activation function
sigmoid = nn.Sigmoid()
data_tensor = torch.tensor(predict.to_numpy().astype(float), dtype=torch.float32)
output = sigmoid(data_tensor)
# sigmoid(x) > 0.90 requires x > ln(9) ~ 2.2, so with binary labels a label
# must be present in at least 3 of the 12 nearest neighbours to be predicted
output = (output > 0.90).float()

# Save results
output_df = pd.DataFrame(output.numpy(), columns=predict.columns)
df_results = pd.concat([df_paragraph.iloc[:50000, :].reset_index(), output_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)

print(f"Processing completed in {time.time() - start:.2f} seconds.")
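Two performance notes on the script above. util.semantic_search accepts a batch of query embeddings, so step 1 could be issued as one call over all 50,000 test embeddings instead of 50,000 single-query calls. And the row-by-row pd.concat in step 2 copies the growing predict frame on every iteration. Below is a minimal vectorized sketch of steps 2-3 under the same layout assumptions as the script (label columns start at position 8 of df_all_predictions and are binary); the names label_cols, vote_counts, and predicted_labels are illustrative and not part of the original file:

# Illustrative rewrite of steps 2-3; assumes df_all_predictions is the
# MultiIndex frame built in step 1 and that its label columns are binary.
label_cols = list(df_all_predictions.columns[8:])

vote_counts = (
    df_all_predictions
    .sort_values('cosine_similarity', ascending=False)
    .groupby(level=0)   # one group per query paragraph
    .head(12)           # keep the 12 nearest neighbours, as in the loop
    .groupby(level=0)[label_cols]
    .sum()              # per-label vote counts, one row per query
)

# Equivalent to thresholding sigmoid(votes) at 0.90: sigmoid(x) > 0.9
# iff x > ln(9) ~ 2.2, and vote counts are integers, so the rule reduces
# to "at least 3 of the 12 neighbours carry the label".
predicted_labels = (vote_counts >= 3).astype(float)

This replaces the quadratic concat loop with two group-wise passes and makes the effective decision rule explicit, without changing the predictions.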