File size: 4,304 Bytes
42a6bd6
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
 
 
 
 
5651d3e
 
42a6bd6
 
 
 
5651d3e
 
 
 
 
 
 
42a6bd6
5651d3e
 
 
42a6bd6
5651d3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a6bd6
5651d3e
42a6bd6
 
5651d3e
 
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
 
 
5651d3e
42a6bd6
5651d3e
 
42a6bd6
 
5651d3e
42a6bd6
5651d3e
42a6bd6
 
 
5651d3e
 
42a6bd6
5651d3e
 
42a6bd6
5651d3e
42a6bd6
5651d3e
42a6bd6
87088be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# narrative_safetynet.py
from __future__ import annotations
from typing import Dict, Any, List, Optional, Tuple
import re
import math
import numpy as np
import pandas as pd

# -------------------- helpers: dtype / formatting --------------------

# Minimum group size below which _small_sample_note emits a caution.
_DEF_MIN_SAMPLE = 5  # generic caution threshold for group sizes

# Name fragments scored +3 each by _score_metric_name when choosing a metric column.
_HINT_METRICS_DEFAULT = [
    "surgery_median", "consult_median",
    "surgery_90th", "consult_90th",
    "surgery", "consult",
    "wait", "median", "p90", "90th"
]

# Name fragments tried, in order, by _find_group_col when locating a grouping column.
_HINT_GROUPS_DEFAULT = [
    "facility", "specialty", "zone",
    "hospital", "city", "region"
]

# Fragments that disqualify a column as a metric (id-like counters) in _score_metric_name.
_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]

def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    dff = df.copy()
    for c in dff.columns:
        if dff[c].dtype == "object":
            dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
    return dff

def _is_numeric_series(s: pd.Series) -> bool:
    try:
        return pd.api.types.is_numeric_dtype(s)
    except Exception:
        return False

def _to_numeric(s: pd.Series) -> pd.Series:
    return pd.to_numeric(s, errors="coerce")

def _fmt_num(x: Any, decimals: int = 1) -> str:
    try:
        if x is None or (isinstance(x, float) and math.isnan(x)):
            return "n/a"
        if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
            return f"{int(round(float(x))):,}"
        return f"{float(x):,.{decimals}f}"
    except Exception:
        return str(x)

# -------------------- metric & dataset selection (dynamic) --------------------

def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score a column name's metric-likeness: +3 per hint substring match.

    Id-like names (see _BAD_METRIC_NAMES) are disqualified outright with a
    huge negative score so they can never win selection.
    """
    name = (col or "").lower()
    for bad in _BAD_METRIC_NAMES:
        if bad in name:
            return -10**6  # never pick counters/ids
    return sum(3 for hint in hints if hint in name)

def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """
    Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
      +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.

    Returns (dataset_key, cleaned_df, metric_column), or None when no dataset
    yields a usable numeric column.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
    for key, v in datasets.items():
        if not isinstance(v, pd.DataFrame) or v.empty:
            continue
        df = _nanlike_to_nan(v)
        for col in df.columns:
            col_num = _to_numeric(df[col])
            if not _is_numeric_series(col_num):
                continue
            # Bug fix: errors="coerce" turns a pure-text column into all-NaN
            # while keeping a numeric dtype, so the guard above never rejected
            # it and a valueless column could win on name score alone.
            if col_num.notna().sum() == 0:
                continue
            s = _score_metric_name(col, metric_hints)
            if col_num.nunique(dropna=True) > 1:
                s += 1  # reward variability: constant columns make poor metrics
            if best is None or s > best[0]:
                best = (s, key, df, col)
    if best is None:
        return None
    _, key, df, metric = best
    return key, df, metric

# -------------------- grouping detection (dynamic) --------------------

def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
    avoid = [a.lower() for a in (avoid or [])]
    cols = list(df.columns)
    # prefer name matches
    for cand in candidates:
        for c in cols:
            cname = c.lower()
            if cand.lower() in cname and all(a not in cname for a in avoid):
                return c
    # fallback: a categorical with reasonable cardinality
    obj_cols = [c for c in cols if df[c].dtype == "object"]
    for c in obj_cols:
        nuniq = df[c].nunique(dropna=True)
        if 1 < nuniq < max(50, len(df)//10):
            return c
    return None

# -------------------- labels & cautions --------------------

def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
    if pd.isna(x) or pd.isna(mu) or mu == 0:
        return "unknown"
    rel = (x - mu) / mu
    if rel > band:
        return "higher than average"
    if rel < -band:
        return "lower than average"
    return "about average"

def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when the group has fewer than *min_n* records, else None."""
    if n >= min_n:
        return None
    return f"Interpret averages cautiously (only {n} records)."

def _pluralize(word: str, n: int) -> str:
    return f"{word}{'' if n == 1 else 's'}"

# -------------------- geo join (Top-5 only) --------------------

def _canon(s: str) -> s_