|
|
|
|
|
from pathlib import Path
from typing import Optional, Union

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
|
|
|
|
# Locations of the training CSV and of the serialized model artifact.
# These must be raw string literals: in a normal literal the "\U" in
# "C:\Users" starts a \UXXXXXXXX unicode escape and the module fails to
# compile with a SyntaxError.
DATA_PATH = Path(
    r"C:\Users\wissa\Downloads\data\stroke-flask-docker\data\healthcare-dataset-stroke-data.csv"
)
OUT_PATH = Path(
    r"C:\Users\wissa\Downloads\data\stroke-flask-docker\model\stroke_pipeline.joblib"
)

# Create the output directory at import time so the later joblib.dump() in
# main() does not fail on a missing folder.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Feature groups; build_pipeline() routes each list to its own
# preprocessing branch of the ColumnTransformer.
CATEGORICAL = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]
NUMERIC = ["age", "avg_glucose_level", "bmi"]
BINARY_INT = ["hypertension", "heart_disease"]
|
|
|
|
|
|
|
|
def load_real_or_synthetic(
    data_path: Optional[Union[str, Path]] = None,
    *,
    n_synthetic: int = 2000,
    seed: int = 42,
) -> pd.DataFrame:
    """Return the stroke dataset, or a synthetic stand-in if the CSV is absent.

    Args:
        data_path: CSV file to read. ``None`` (the default) uses the
            module-level ``DATA_PATH``, which is the original behavior.
        n_synthetic: Number of rows to generate when the CSV does not exist.
        seed: Seed for the ``numpy.random.RandomState`` that drives the
            synthetic generator; a fixed seed gives a reproducible frame.

    Returns:
        A DataFrame holding the ten feature columns plus the ``stroke``
        target. When read from disk, any other columns in the file are
        dropped and the file's column order is kept.

    Raises:
        ValueError: If the CSV exists but lacks one of the required columns.
    """
    csv_path = DATA_PATH if data_path is None else Path(data_path)
    if csv_path.exists():
        return _read_stroke_csv(csv_path)
    return _make_synthetic_strokes(n_synthetic, seed)


# Columns the training code relies on: ten features followed by the target.
_REQUIRED_COLUMNS = (
    "gender", "age", "hypertension", "heart_disease", "ever_married",
    "work_type", "Residence_type", "avg_glucose_level", "bmi",
    "smoking_status", "stroke",
)


def _read_stroke_csv(csv_path):
    """Read the CSV, validate its schema and keep only the required columns."""
    frame = pd.read_csv(csv_path)

    missing = set(_REQUIRED_COLUMNS) - set(frame.columns)
    if missing:
        raise ValueError(f"Dataset is missing columns: {missing}")

    # Filter by iterating the file's own columns so their order is preserved
    # and extras (anything not in the required set) are discarded.
    wanted = frozenset(_REQUIRED_COLUMNS)
    return frame[[col for col in frame.columns if col in wanted]]


def _make_synthetic_strokes(n_rows, seed):
    """Generate a reproducible synthetic dataset with the real schema."""
    rng = np.random.RandomState(seed)

    # NOTE: the draws below share one RNG stream, so their order determines
    # the generated values. Do not reorder the dictionary entries.
    frame = pd.DataFrame({
        "gender": rng.choice(
            ["Male", "Female", "Other"], size=n_rows, p=[0.49, 0.50, 0.01]
        ),
        "age": rng.randint(1, 90, size=n_rows),  # upper bound is exclusive
        "hypertension": rng.binomial(1, 0.15, size=n_rows),
        "heart_disease": rng.binomial(1, 0.08, size=n_rows),
        "ever_married": rng.choice(["Yes", "No"], size=n_rows, p=[0.7, 0.3]),
        "work_type": rng.choice(
            ["Private", "Self-employed", "Govt_job", "children", "Never_worked"],
            size=n_rows,
            p=[0.6, 0.2, 0.18, 0.01, 0.01],
        ),
        "Residence_type": rng.choice(
            ["Urban", "Rural"], size=n_rows, p=[0.55, 0.45]
        ),
        "avg_glucose_level": rng.normal(100, 30, size=n_rows).clip(50, 300),
        "bmi": rng.normal(28, 6, size=n_rows).clip(10, 60),
        "smoking_status": rng.choice(
            ["formerly smoked", "never smoked", "smokes", "Unknown"],
            size=n_rows,
            p=[0.2, 0.6, 0.15, 0.05],
        ),
    })

    # Linear risk score over age, centered glucose/BMI, the two binary
    # conditions and marital status.
    logit = (
        0.03 * frame["age"]
        + 0.02 * (frame["avg_glucose_level"] - 100)
        + 0.05 * (frame["bmi"] - 28)
        + 0.8 * frame["hypertension"]
        + 0.9 * frame["heart_disease"]
        + 0.3 * (frame["ever_married"] == "Yes").astype(int)
    )

    # Sigmoid of the score shifted down by 4.0 (presumably so that positive
    # cases stay the minority class -- confirm the intended prevalence).
    prob = 1 / (1 + np.exp(-(logit - 4.0)))

    # Bernoulli draw per row: label is 1 when a uniform sample falls below
    # that row's probability.
    frame["stroke"] = (rng.rand(len(frame)) < prob).astype(int)
    return frame
|
|
|
|
|
|
|
|
def build_pipeline():
    """Assemble the unfitted preprocessing + logistic-regression pipeline.

    Three column groups are preprocessed independently and concatenated by a
    ColumnTransformer:

    * ``CATEGORICAL``: most-frequent imputation, then one-hot encoding that
      ignores categories not seen during fitting.
    * ``NUMERIC``: median imputation, then standardization.
    * ``BINARY_INT``: most-frequent imputation, then scaling without
      mean-centering.

    Returns:
        A ``Pipeline`` with steps ``"pre"`` (the ColumnTransformer) and
        ``"clf"`` (``LogisticRegression`` with ``max_iter=1000``).
    """
    categorical_steps = [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
    numeric_steps = [
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
    binary_steps = [
        ("impute", SimpleImputer(strategy="most_frequent")),
        # with_mean=False: divide by the standard deviation only, no centering.
        ("scale", StandardScaler(with_mean=False)),
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", Pipeline(steps=categorical_steps), CATEGORICAL),
            ("num", Pipeline(steps=numeric_steps), NUMERIC),
            ("bin", Pipeline(steps=binary_steps), BINARY_INT),
        ]
    )
    classifier = LogisticRegression(max_iter=1000, n_jobs=None)

    return Pipeline([("pre", preprocessor), ("clf", classifier)])
|
|
|
|
|
|
|
|
def main():
    """Train the stroke classifier, print hold-out metrics and save the model.

    Steps: load the data, make a stratified 80/20 train/test split, fit the
    pipeline from ``build_pipeline()``, print ROC AUC and a classification
    report for the test split, then dump the fitted pipeline to ``OUT_PATH``
    with joblib.
    """
    dataset = load_real_or_synthetic()
    target = dataset["stroke"].astype(int)
    features = dataset.drop(columns=["stroke"])

    # Stratify on the target so both splits keep the same class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=0.2,
        random_state=42,
        stratify=target,
    )

    model = build_pipeline()
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]
    # Hard labels use a 0.3 cut-off rather than the default 0.5.
    hard_labels = (positive_scores >= 0.3).astype(int)

    print("AUC:", roc_auc_score(y_test, positive_scores))
    print("Report:\n", classification_report(y_test, hard_labels))

    joblib.dump(model, OUT_PATH)
    print(f"Saved pipeline to {OUT_PATH.resolve()}")
|
|
|
|
|
|
|
|
# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|