Upload 13 files
- src/__pycache__/create_comprehensive_image.cpython-311.pyc +0 -0
- src/__pycache__/detailed_model_comparison.cpython-311.pyc +0 -0
- src/__pycache__/model_comparison.cpython-311.pyc +0 -0
- src/__pycache__/prediction_process.cpython-311.pyc +0 -0
- src/__pycache__/system_summary.cpython-311.pyc +0 -0
- src/__pycache__/train_model.cpython-311.pyc +0 -0
- src/models/__pycache__/loan_recovery_model.cpython-311.pyc +0 -0
- src/models/loan_recovery_model.py +274 -0
- src/preprocessing/__pycache__/data_processor.cpython-311.pyc +0 -0
- src/preprocessing/data_processor.py +144 -0
- src/train_model.py +87 -0
- src/utils/__pycache__/data_generator.cpython-311.pyc +0 -0
- src/utils/data_generator.py +202 -0
src/__pycache__/create_comprehensive_image.cpython-311.pyc
ADDED (binary file, 3.38 kB)

src/__pycache__/detailed_model_comparison.cpython-311.pyc
ADDED (binary file, 12.7 kB)

src/__pycache__/model_comparison.cpython-311.pyc
ADDED (binary file, 9.22 kB)

src/__pycache__/prediction_process.cpython-311.pyc
ADDED (binary file, 7.38 kB)

src/__pycache__/system_summary.cpython-311.pyc
ADDED (binary file, 5.87 kB)

src/__pycache__/train_model.cpython-311.pyc
ADDED (binary file, 4.81 kB)

src/models/__pycache__/loan_recovery_model.cpython-311.pyc
ADDED (binary file, 11.4 kB)
src/models/loan_recovery_model.py
ADDED
@@ -0,0 +1,274 @@

import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from src.preprocessing.data_processor import LoanDataProcessor


class LoanRecoveryModel:
    """
    Machine learning model for predicting loan recovery.
    """

    def __init__(self, model_type='random_forest'):
        """
        Initialize the loan recovery model.

        Parameters:
        -----------
        model_type : str, optional
            Type of model to use, by default 'random_forest'
            Only 'random_forest' is supported
        """
        self.model_type = 'random_forest'  # Always use Random Forest
        self.processor = LoanDataProcessor()

        # Initialize the Random Forest model
        self.model = RandomForestClassifier(random_state=42)

    def train(self, data, target_column='recovery_status', test_size=0.2, tune_hyperparameters=False):
        """
        Train the model on the provided data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The training data
        target_column : str, optional
            The name of the target column, by default 'recovery_status'
        test_size : float, optional
            Proportion of data to use for testing, by default 0.2
        tune_hyperparameters : bool, optional
            Whether to perform hyperparameter tuning, by default False

        Returns:
        --------
        dict
            Dictionary containing model performance metrics
        """
        # Prepare data
        X, y = self.processor.prepare_data(data, target_column)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Preprocess the data
        X_train_processed = self.processor.fit_transform(X_train)
        X_test_processed = self.processor.transform(X_test)

        # Tune hyperparameters if requested
        if tune_hyperparameters:
            self._tune_hyperparameters(X_train_processed, y_train)

        # Train the model
        self.model.fit(X_train_processed, y_train)

        # Evaluate the model
        y_pred = self.model.predict(X_test_processed)
        y_prob = self.model.predict_proba(X_test_processed)[:, 1]

        # Calculate metrics
        metrics = {
            'accuracy': self.model.score(X_test_processed, y_test),
            'roc_auc': roc_auc_score(y_test, y_prob),
            'classification_report': classification_report(y_test, y_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
        }

        # Feature importance
        if hasattr(self.model, 'feature_importances_'):
            feature_names = self.processor.get_feature_names()
            metrics['feature_importance'] = dict(zip(feature_names, self.model.feature_importances_))

        return metrics

    def predict(self, data):
        """
        Make predictions on new data.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to make predictions on

        Returns:
        --------
        numpy.ndarray
            Array of predicted probabilities of recovery
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Prepare data (prepare_data returns (X, y) when the target column is present)
        if 'recovery_status' in data.columns:
            X, _ = self.processor.prepare_data(data)
        else:
            X = self.processor.prepare_data(data)

        # Preprocess the data
        X_processed = self.processor.transform(X)

        # Make predictions
        return self.model.predict_proba(X_processed)[:, 1]

    def save_model(self, model_path, processor_path=None):
        """
        Save the trained model and preprocessor to disk.

        Parameters:
        -----------
        model_path : str
            Path to save the model
        processor_path : str, optional
            Path to save the preprocessor, by default None
            If None, derives it from model_path by inserting '_processor'
            before the '.pkl' extension
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        # Save the model
        joblib.dump(self.model, model_path)

        # Save the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')

        joblib.dump(self.processor, processor_path)

    @classmethod
    def load_model(cls, model_path, processor_path=None):
        """
        Load a trained model and preprocessor from disk.

        Parameters:
        -----------
        model_path : str
            Path to the saved model
        processor_path : str, optional
            Path to the saved preprocessor, by default None
            If None, derives it from model_path by inserting '_processor'
            before the '.pkl' extension

        Returns:
        --------
        LoanRecoveryModel
            The loaded model
        """
        # Create a new instance
        instance = cls()

        # Load the model
        instance.model = joblib.load(model_path)

        # Load the preprocessor
        if processor_path is None:
            processor_path = model_path.replace('.pkl', '_processor.pkl')

        instance.processor = joblib.load(processor_path)

        return instance

    def _tune_hyperparameters(self, X_train, y_train):
        """
        Perform hyperparameter tuning for the Random Forest model.

        Parameters:
        -----------
        X_train : numpy.ndarray
            The processed training features
        y_train : numpy.ndarray
            The training target values
        """
        # Random Forest hyperparameters
        param_grid = {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

        # Create grid search
        grid_search = GridSearchCV(
            self.model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1
        )

        # Fit grid search
        grid_search.fit(X_train, y_train)

        # Update model with best parameters
        self.model = grid_search.best_estimator_

    def plot_feature_importance(self, top_n=10):
        """
        Plot feature importance for the trained model.

        Parameters:
        -----------
        top_n : int, optional
            Number of top features to display, by default 10

        Returns:
        --------
        matplotlib.figure.Figure
            The feature importance plot
        """
        if self.model is None:
            raise ValueError("Model has not been trained. Call train() first.")

        if not hasattr(self.model, 'feature_importances_'):
            raise ValueError("Model does not have feature importances.")

        # Get feature names and importances
        feature_names = self.processor.get_feature_names()
        importances = self.model.feature_importances_

        # Sort by importance and take the top N features
        indices = np.argsort(importances)[::-1][:top_n]

        # Create plot
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(range(len(indices)), importances[indices], align='center')
        ax.set_yticks(range(len(indices)))
        ax.set_yticklabels([feature_names[i] for i in indices])
        ax.set_xlabel('Feature Importance')
        ax.set_title('Top {} Feature Importances'.format(top_n))
        plt.tight_layout()

        return fig

    def plot_confusion_matrix(self, y_true, y_pred):
        """
        Plot confusion matrix for model predictions.

        Parameters:
        -----------
        y_true : array-like
            True labels
        y_pred : array-like
            Predicted labels

        Returns:
        --------
        matplotlib.figure.Figure
            The confusion matrix plot
        """
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred)

        # Create plot
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        ax.set_xticklabels(['Not Recovered', 'Recovered'])
        ax.set_yticklabels(['Not Recovered', 'Recovered'])
        plt.tight_layout()

        return fig
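Usage note: a minimal sketch of how this class is meant to be driven end to end. It assumes the code runs from the repository root (so the src package imports resolve); the file names are illustrative, not part of the commit.

# Hypothetical end-to-end sketch: generate data, train, persist, reload, score.
import os
from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel

data = generate_loan_data(n_samples=500)

model = LoanRecoveryModel()
metrics = model.train(data, test_size=0.2)
print(f"ROC AUC: {metrics['roc_auc']:.3f}")

os.makedirs('models', exist_ok=True)
model.save_model('models/demo.pkl')  # also writes models/demo_processor.pkl
reloaded = LoanRecoveryModel.load_model('models/demo.pkl')

# predict() returns recovery probabilities in [0, 1]
print(reloaded.predict(data.head(10)))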
src/preprocessing/__pycache__/data_processor.cpython-311.pyc
ADDED (binary file, 5.53 kB)
src/preprocessing/data_processor.py
ADDED
@@ -0,0 +1,144 @@

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


class LoanDataProcessor:
    """
    Class for preprocessing loan data for machine learning models.
    """

    def __init__(self):
        """Initialize the data processor."""
        self.preprocessor = None
        self.categorical_features = ['gender', 'employment_status', 'payment_history']
        self.numerical_features = ['age', 'annual_income', 'credit_score', 'loan_amount',
                                   'interest_rate', 'loan_term', 'days_past_due',
                                   'previous_defaults', 'monthly_payment', 'debt_to_income']

    def fit(self, X):
        """
        Fit the preprocessor on the training data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The training data

        Returns:
        --------
        self : LoanDataProcessor
            The fitted processor
        """
        # Define preprocessing for numerical features
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        # Define preprocessing for categorical features
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Combine preprocessing steps
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

        # Fit the preprocessor
        self.preprocessor.fit(X)

        return self

    def transform(self, X):
        """
        Transform the data using the fitted preprocessor.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        return self.preprocessor.transform(X)

    def fit_transform(self, X):
        """
        Fit the preprocessor and transform the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            The data to fit and transform

        Returns:
        --------
        numpy.ndarray
            The transformed data
        """
        return self.fit(X).transform(X)

    def get_feature_names(self):
        """
        Get the names of the transformed features.

        Returns:
        --------
        list
            List of feature names after transformation
        """
        if self.preprocessor is None:
            raise ValueError("Preprocessor has not been fitted. Call fit() first.")

        # Numerical feature names pass through unchanged
        feature_names = list(self.numerical_features)

        # Categorical feature names are expanded by one-hot encoding
        categorical_features = self.preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(
            self.categorical_features)
        feature_names.extend(categorical_features)

        return feature_names

    def prepare_data(self, data, target_column='recovery_status'):
        """
        Prepare data for model training or prediction.

        Parameters:
        -----------
        data : pandas.DataFrame
            The data to prepare
        target_column : str, optional
            The name of the target column, by default 'recovery_status'

        Returns:
        --------
        tuple or pandas.DataFrame
            (X, y) if target_column is in data, otherwise just X
        """
        # Drop customer_id as it's not a feature
        if 'customer_id' in data.columns:
            data = data.drop('customer_id', axis=1)

        if target_column in data.columns:
            X = data.drop(target_column, axis=1)
            y = data[target_column]
            return X, y
        else:
            return data
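Usage note: the processor also works standalone; a minimal sketch, assuming the synthetic generator from src/utils/data_generator.py supplies the input frame.

# Hypothetical standalone use of the preprocessing pipeline.
from src.utils.data_generator import generate_loan_data
from src.preprocessing.data_processor import LoanDataProcessor

data = generate_loan_data(n_samples=100)

processor = LoanDataProcessor()
X, y = processor.prepare_data(data)        # drops customer_id, splits off the target
X_processed = processor.fit_transform(X)   # impute + scale + one-hot encode

print(X_processed.shape)                   # (100, number of transformed features)
print(processor.get_feature_names()[:5])   # first few transformed feature names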
src/train_model.py
ADDED
@@ -0,0 +1,87 @@

import os
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
from src.utils.data_generator import generate_loan_data
from src.models.loan_recovery_model import LoanRecoveryModel


def train_and_save_model(data_path=None, model_type='random_forest', tune_hyperparameters=False):
    """
    Train a loan recovery model and save it to disk.

    Parameters:
    -----------
    data_path : str, optional
        Path to the loan data CSV file, by default None
        If None, generates synthetic data
    model_type : str, optional
        Type of model to train, by default 'random_forest'
    tune_hyperparameters : bool, optional
        Whether to tune hyperparameters, by default False

    Returns:
    --------
    dict
        Dictionary containing model performance metrics
    """
    # Create directories if they don't exist
    os.makedirs('data', exist_ok=True)
    os.makedirs('models', exist_ok=True)

    # Load or generate data
    if data_path and os.path.exists(data_path):
        print(f"Loading data from {data_path}")
        data = pd.read_csv(data_path)
    else:
        print("Generating synthetic loan data")
        data = generate_loan_data(n_samples=1000)

        # Save generated data
        data_path = 'data/loan_data.csv'
        data.to_csv(data_path, index=False)
        print(f"Saved generated data to {data_path}")

    # Print data summary
    print(f"\nData shape: {data.shape}")
    print(f"Recovery rate: {data['recovery_status'].mean() * 100:.2f}%")

    # Train model
    print(f"\nTraining {model_type} model...")
    model = LoanRecoveryModel(model_type=model_type)
    metrics = model.train(data, tune_hyperparameters=tune_hyperparameters)

    # Print performance metrics
    print("\nModel Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print("\nClassification Report:")
    for label, values in metrics['classification_report'].items():
        if label in ['0', '1']:
            label_name = 'Not Recovered' if label == '0' else 'Recovered'
            print(f"{label_name}:")
            print(f"  Precision: {values['precision']:.4f}")
            print(f"  Recall: {values['recall']:.4f}")
            print(f"  F1-score: {values['f1-score']:.4f}")

    # Save model
    model_path = f"models/loan_recovery_{model_type}.pkl"
    model.save_model(model_path)
    print(f"\nSaved model to {model_path}")

    # Plot feature importance if available
    if 'feature_importance' in metrics:
        fig = model.plot_feature_importance(top_n=10)
        fig_path = f"models/feature_importance_{model_type}.png"
        fig.savefig(fig_path)
        plt.close(fig)
        print(f"Saved feature importance plot to {fig_path}")

    return metrics


if __name__ == "__main__":
    # Train only the Random Forest model
    print(f"\n{'='*50}")
    print("Training Random Forest Model")
    print(f"{'='*50}")
    train_and_save_model(model_type='random_forest', tune_hyperparameters=True)
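Usage note: since the script imports through the src package, it is presumably meant to be launched as a module from the repository root. Bear in mind the grid search it enables is not cheap: 3 x 4 x 3 x 3 = 108 candidates at 5 folds each is 540 model fits. A hedged sketch of calling it from Python with tuning disabled:

# Hypothetical: train on an existing CSV and skip the grid search.
from src.train_model import train_and_save_model

metrics = train_and_save_model(
    data_path='data/loan_data.csv',  # falls back to synthetic data if the file is missing
    tune_hyperparameters=False,      # avoids the 540-fit grid search
)
print(metrics['accuracy'], metrics['roc_auc'])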
src/utils/__pycache__/data_generator.cpython-311.pyc
ADDED (binary file, 10.8 kB)
src/utils/data_generator.py
ADDED
@@ -0,0 +1,202 @@

import pandas as pd
import numpy as np
import random


def generate_loan_data(n_samples=1000, seed=42):
    """
    Generate synthetic loan data for the loan recovery system.

    Parameters:
    -----------
    n_samples : int
        Number of loan records to generate
    seed : int
        Random seed for reproducibility

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic loan data
    """
    np.random.seed(seed)
    random.seed(seed)

    # Customer information
    customer_ids = [f'CUST{i:06d}' for i in range(1, n_samples + 1)]
    ages = np.random.randint(22, 65, n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)

    # Employment information
    employment_statuses = np.random.choice(
        ['Employed', 'Self-employed', 'Unemployed', 'Retired'],
        n_samples,
        p=[0.65, 0.20, 0.10, 0.05]
    )
    annual_incomes = []
    for status in employment_statuses:
        if status == 'Employed':
            annual_incomes.append(np.random.normal(60000, 20000))
        elif status == 'Self-employed':
            annual_incomes.append(np.random.normal(75000, 30000))
        elif status == 'Unemployed':
            annual_incomes.append(np.random.normal(15000, 10000))
        else:  # Retired
            annual_incomes.append(np.random.normal(40000, 15000))

    # Floor incomes at a small positive value: np.random.normal can draw
    # negative incomes, which would break the loan-amount and
    # debt-to-income calculations below
    annual_incomes = [max(income, 5000) for income in annual_incomes]

    # Credit information
    credit_scores = []
    for income in annual_incomes:
        base_score = 300 + (income / 100000) * 400  # Higher income tends to mean a higher credit score
        credit_scores.append(min(850, max(300, int(np.random.normal(base_score, 50)))))

    # Loan information
    loan_amounts = []
    for income, credit in zip(annual_incomes, credit_scores):
        # Higher income and credit score can get larger loans; keep the
        # upper bound at or above the 5000 minimum so np.random.uniform
        # always gets a valid range
        max_loan = max(5000, income * (0.5 + (credit - 300) / 850))
        loan_amounts.append(np.random.uniform(5000, max_loan))

    interest_rates = []
    for credit in credit_scores:
        # Lower credit scores get higher interest rates (range from ~5% to ~15%)
        base_rate = 15 - (credit - 300) * (10 / 550)
        interest_rates.append(max(5, min(15, base_rate + np.random.normal(0, 1))))

    loan_terms = np.random.choice([12, 24, 36, 48, 60], n_samples)

    # Loan performance
    payment_histories = []
    for credit in credit_scores:
        # Better credit scores tend to have better payment histories
        if credit > 750:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair'], p=[0.8, 0.15, 0.05]))
        elif credit > 650:
            payment_histories.append(np.random.choice(['Excellent', 'Good', 'Fair', 'Poor'], p=[0.4, 0.4, 0.15, 0.05]))
        elif credit > 550:
            payment_histories.append(np.random.choice(['Good', 'Fair', 'Poor'], p=[0.3, 0.5, 0.2]))
        else:
            payment_histories.append(np.random.choice(['Fair', 'Poor', 'Very Poor'], p=[0.3, 0.5, 0.2]))

    days_past_due = []
    for history in payment_histories:
        if history == 'Excellent':
            days_past_due.append(np.random.choice([0, 0, 0, 0, np.random.randint(1, 10)], p=[0.9, 0.025, 0.025, 0.025, 0.025]))
        elif history == 'Good':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 15), np.random.randint(15, 30)], p=[0.7, 0.2, 0.1]))
        elif history == 'Fair':
            days_past_due.append(np.random.choice([0, np.random.randint(1, 30), np.random.randint(30, 60)], p=[0.5, 0.3, 0.2]))
        elif history == 'Poor':
            days_past_due.append(np.random.choice([np.random.randint(1, 30), np.random.randint(30, 90), np.random.randint(90, 120)], p=[0.3, 0.4, 0.3]))
        else:  # Very Poor
            days_past_due.append(np.random.choice([np.random.randint(30, 90), np.random.randint(90, 180), np.random.randint(180, 365)], p=[0.2, 0.4, 0.4]))

    # Previous defaults
    previous_defaults = []
    for credit, history in zip(credit_scores, payment_histories):
        if credit < 500 or history in ['Poor', 'Very Poor']:
            previous_defaults.append(np.random.choice([0, 1, 2, 3], p=[0.2, 0.4, 0.3, 0.1]))
        elif credit < 650:
            previous_defaults.append(np.random.choice([0, 1], p=[0.8, 0.2]))
        else:
            previous_defaults.append(np.random.choice([0, 1], p=[0.95, 0.05]))

    # Recovery status (target variable), driven by credit score,
    # payment history, days past due, and previous defaults
    recovery_status = []
    for credit, history, dpd, defaults in zip(credit_scores, payment_histories, days_past_due, previous_defaults):
        recovery_prob = 0.9  # Base probability

        # Adjust based on credit score
        if credit < 500:
            recovery_prob -= 0.3
        elif credit < 650:
            recovery_prob -= 0.1

        # Adjust based on payment history
        if history == 'Very Poor':
            recovery_prob -= 0.4
        elif history == 'Poor':
            recovery_prob -= 0.2
        elif history == 'Fair':
            recovery_prob -= 0.1

        # Adjust based on days past due
        if dpd > 180:
            recovery_prob -= 0.4
        elif dpd > 90:
            recovery_prob -= 0.3
        elif dpd > 30:
            recovery_prob -= 0.15
        elif dpd > 0:
            recovery_prob -= 0.05

        # Adjust based on previous defaults
        recovery_prob -= 0.1 * defaults

        # Clamp the probability to a sensible range
        recovery_prob = max(0.05, min(0.95, recovery_prob))

        recovery_status.append(np.random.choice([1, 0], p=[recovery_prob, 1 - recovery_prob]))

    # Create DataFrame
    data = {
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'employment_status': employment_statuses,
        'annual_income': annual_incomes,
        'credit_score': credit_scores,
        'loan_amount': loan_amounts,
        'interest_rate': interest_rates,
        'loan_term': loan_terms,
        'payment_history': payment_histories,
        'days_past_due': days_past_due,
        'previous_defaults': previous_defaults,
        'recovery_status': recovery_status  # 1 = recovered, 0 = not recovered
    }

    df = pd.DataFrame(data)

    # Add some additional calculated features: standard amortized
    # monthly payment, then annual payment burden relative to income
    monthly_rate = df['interest_rate'] / 100 / 12
    df['monthly_payment'] = (df['loan_amount'] * monthly_rate *
                             (1 + monthly_rate) ** df['loan_term']) / \
                            ((1 + monthly_rate) ** df['loan_term'] - 1)

    df['debt_to_income'] = (df['monthly_payment'] * 12) / df['annual_income']

    # Round numeric columns for readability
    df['annual_income'] = df['annual_income'].round(2)
    df['loan_amount'] = df['loan_amount'].round(2)
    df['interest_rate'] = df['interest_rate'].round(2)
    df['monthly_payment'] = df['monthly_payment'].round(2)
    df['debt_to_income'] = df['debt_to_income'].round(4)

    return df


if __name__ == "__main__":
    import os

    # Generate sample data
    loan_data = generate_loan_data(n_samples=1000)

    # Save to CSV
    os.makedirs('data', exist_ok=True)
    loan_data.to_csv('data/loan_data.csv', index=False)
    print(f"Generated {len(loan_data)} loan records and saved to data/loan_data.csv")

    # Display sample
    print("\nSample data:")
    print(loan_data.head())

    # Display summary statistics
    print("\nSummary statistics:")
    print(loan_data.describe())

    # Display recovery rate
    recovery_rate = loan_data['recovery_status'].mean() * 100
    print(f"\nOverall recovery rate: {recovery_rate:.2f}%")
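Usage note: monthly_payment applies the standard amortization formula M = P * r * (1+r)^n / ((1+r)^n - 1), where P is the principal, r the monthly rate, and n the term in months. A small worked check with illustrative numbers (not drawn from the generated data):

# Worked check of the amortization formula used for monthly_payment.
P = 20000      # loan_amount
annual = 10.0  # interest_rate, percent
n = 36         # loan_term, months

r = annual / 100 / 12                          # monthly rate, ~0.00833
M = P * r * (1 + r) ** n / ((1 + r) ** n - 1)  # amortized monthly payment
print(round(M, 2))                             # ~645.34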