Gaykar committed on
Commit
e01340e
·
1 Parent(s): 1acb857

Added pipe

Browse files
Files changed (1) hide show
  1. pipeline.py +57 -0
pipeline.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ from utils import *
4
+ from config import URL_FEATURES
5
+
6
class EmailFeatureExtractor:
    """Turn one email (subject + body) into a single-row feature DataFrame.

    The per-feature helper functions used by ``transform`` (for example
    ``extract_urls_from_body`` or ``count_urls``) are not defined in this
    class; presumably they are supplied by the module's ``from utils import *``
    -- confirm against ``utils``.
    """

    def __init__(self):
        # Names (and order) of the feature columns returned by ``transform``.
        self.required_features = URL_FEATURES

    @staticmethod
    def _expand_columns(results, columns: list) -> pd.DataFrame:
        """Return ``results`` as a DataFrame whose columns are ``columns``.

        ``Series.apply`` yields a DataFrame when the applied function returns
        a ``pd.Series``, but a Series of tuples/lists when it returns a plain
        sequence. Both shapes are normalised here so that multi-column
        assignment behaves the same either way. Columns are matched by
        position, not by label.

        Raises:
            ValueError: if the number of produced values differs from
                ``len(columns)``.
        """
        if isinstance(results, pd.DataFrame):
            expanded = results.copy()
        else:
            expanded = pd.DataFrame(results.tolist(), index=results.index)
        expanded.columns = list(columns)
        return expanded

    def transform(self, subject: str, body: str) -> pd.DataFrame:
        """Build the feature row for one email.

        Args:
            subject: Subject line of the email.
            body: Body text of the email.

        Returns:
            A one-row DataFrame containing the columns listed in
            ``self.required_features`` (in that order) followed by
            ``'text_combined'``.
        """
        # Create the initial one-row DataFrame from user input.
        df = pd.DataFrame([{'subject': subject, 'body': body}])

        # 1. URL extraction
        df['URL'] = df.apply(extract_urls_from_body, axis=1)
        df['URL_COUNT'] = df['URL'].apply(count_urls)

        # 2. Combined text (kept alongside the features in the output)
        df['text_combined'] = df.apply(create_combined_text, axis=1)

        # 3. IP address detection -- falls back to 0 when no URL was extracted.
        df['USE_OF_IP'] = df['URL'].apply(
            lambda urls: having_ip_address(urls) if urls else 0
        )

        # 4. Basic URL stats
        # Series.apply has no ``result_type`` option, so the helper's output
        # is expanded explicitly instead of relying on it returning a Series.
        stat_cols = [
            'url_length_max', 'url_length_avg',
            'url_subdom_max', 'url_subdom_avg',
        ]
        df[stat_cols] = self._expand_columns(
            df['URL'].apply(extract_basic_url_stats), stat_cols
        )

        # 5. Shorteners & suspicious keywords
        df['short_url_count'] = df['URL'].apply(count_shortened_urls)
        df['sus_url_count'] = df['URL'].apply(suspicious_words_count)
        df['sus_url_flag'] = (df['sus_url_count'] > 0).astype(int)

        # 6. Dot features
        dot_cols = ['dot_count_max', 'dot_count_avg']
        df[dot_cols] = self._expand_columns(
            df['URL'].apply(extract_dot_features), dot_cols
        )

        # 7. Generic character counts
        char_map = {'perc': '%', 'ques': '?', 'hyphen': '-', 'equal': '='}
        for name, char in char_map.items():
            pair_cols = [f'{name}_max', f'{name}_avg']
            # Bind the loop variables as defaults so the callable does not
            # depend on late-binding closure semantics.
            per_char = df['URL'].apply(
                lambda urls, char=char, name=name: extract_char_features(
                    urls, char, name
                )
            )
            df[pair_cols] = self._expand_columns(per_char, pair_cols)

        return self._verify_and_order(df)

    def _verify_and_order(self, df: pd.DataFrame) -> pd.DataFrame:
        """Zero-fill missing required features and return the ordered columns.

        Any name in ``self.required_features`` that is absent from ``df`` is
        added to ``df`` (in place) with the value 0. The result holds the
        required features in their configured order, then ``'text_combined'``.

        Raises:
            KeyError: if ``df`` has no ``'text_combined'`` column.
        """
        # Copy into a list so a tuple (or other sequence) of feature names is
        # accepted and the configured collection is never mutated.
        required = list(self.required_features)
        for col in required:
            if col not in df.columns:
                df[col] = 0

        cols_to_return = required + ['text_combined']
        return df[cols_to_return]