File size: 4,304 Bytes
42a6bd6
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
 
 
 
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
42a6bd6
 
5651d3e
 
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
5651d3e
 
42a6bd6
 
5651d3e
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
 
42a6bd6
5651d3e
 
42a6bd6
5651d3e
42a6bd6
5651d3e
42a6bd6
87088be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# narrative_safetynet.py
from __future__ import annotations
from typing import Dict, Any, List, Optional, Tuple
import re
import math
import numpy as np
import pandas as pd

# -------------------- helpers: dtype / formatting --------------------

# Minimum group size below which _small_sample_note emits a caution.
_DEF_MIN_SAMPLE = 5  # generic caution threshold for group sizes

# Name fragments scored +3 each by _score_metric_name when choosing a metric column.
_HINT_METRICS_DEFAULT = [
    "surgery_median", "consult_median",
    "surgery_90th", "consult_90th",
    "surgery", "consult",
    "wait", "median", "p90", "90th"
]

# Name fragments tried, in order, by _find_group_col when locating a grouping column.
_HINT_GROUPS_DEFAULT = [
    "facility", "specialty", "zone",
    "hospital", "city", "region"
]

# Fragments that disqualify a column as a metric (id-like counters) in _score_metric_name.
_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]

def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    dff = df.copy()
    for c in dff.columns:
        if dff[c].dtype == "object":
            dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
    return dff

def _is_numeric_series(s: pd.Series) -> bool:
    try:
        return pd.api.types.is_numeric_dtype(s)
    except Exception:
        return False

def _to_numeric(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def _fmt_num(x: Any, decimals: int = 1) -> str:
    try:
        if x is None or (isinstance(x, float) and math.isnan(x)):
            return "n/a"
        if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
            return f"{int(round(float(x))):,}"
        return f"{float(x):,.{decimals}f}"
    except Exception:
        return str(x)

# -------------------- metric & dataset selection (dynamic) --------------------

def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score a column name's metric-likeness: +3 per hint substring match.

    Id-like names (see _BAD_METRIC_NAMES) are disqualified outright with a
    huge negative score so they can never win selection.
    """
    name = (col or "").lower()
    for bad in _BAD_METRIC_NAMES:
        if bad in name:
            return -10**6  # never pick counters/ids
    return sum(3 for hint in hints if hint in name)

def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """
    Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
      +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.

    Returns (dataset_key, cleaned_df, metric_column), or None when no dataset
    yields a usable numeric column.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
    for key, v in datasets.items():
        if not isinstance(v, pd.DataFrame) or v.empty:
            continue
        df = _nanlike_to_nan(v)
        for col in df.columns:
            col_num = _to_numeric(df[col])
            if not _is_numeric_series(col_num):
                continue
            # Bug fix: errors="coerce" turns a pure-text column into all-NaN
            # while keeping a numeric dtype, so the guard above never rejected
            # it and a valueless column could win on name score alone.
            if col_num.notna().sum() == 0:
                continue
            s = _score_metric_name(col, metric_hints)
            if col_num.nunique(dropna=True) > 1:
                s += 1  # reward variability: constant columns make poor metrics
            if best is None or s > best[0]:
                best = (s, key, df, col)
    if best is None:
        return None
    _, key, df, metric = best
    return key, df, metric

# -------------------- grouping detection (dynamic) --------------------

def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
    avoid = [a.lower() for a in (avoid or [])]
    cols = list(df.columns)
    # prefer name matches
    for cand in candidates:
        for c in cols:
            cname = c.lower()
            if cand.lower() in cname and all(a not in cname for a in avoid):
                return c
    # fallback: a categorical with reasonable cardinality
    obj_cols = [c for c in cols if df[c].dtype == "object"]
    for c in obj_cols:
        nuniq = df[c].nunique(dropna=True)
        if 1 < nuniq < max(50, len(df)//10):
            return c
    return None

# -------------------- labels & cautions --------------------

def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
    if pd.isna(x) or pd.isna(mu) or mu == 0:
        return "unknown"
    rel = (x - mu) / mu
    if rel > band:
        return "higher than average"
    if rel < -band:
        return "lower than average"
    return "about average"

def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when the group has fewer than *min_n* records, else None."""
    if n >= min_n:
        return None
    return f"Interpret averages cautiously (only {n} records)."

def _pluralize(word: str, n: int) -> str:
    return f"{word}{'' if n == 1 else 's'}"

# -------------------- geo join (Top-5 only) --------------------

def _canon(s: str) -> s_