# narrative_safetynet.py
from __future__ import annotations

import math
import re
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# -------------------- helpers: dtype / formatting --------------------
_DEF_MIN_SAMPLE = 5  # generic caution threshold for group sizes
_HINT_METRICS_DEFAULT = [
    "surgery_median", "consult_median", "surgery_90th", "consult_90th",
    "surgery", "consult", "wait", "median", "p90", "90th",
]
_HINT_GROUPS_DEFAULT = [
    "facility", "specialty", "zone", "hospital", "city", "region",
]
_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]


def _is_text_series(s: pd.Series) -> bool:
    """Return True when *s* has an object or string dtype.

    Checking only ``dtype == "object"`` misses columns stored with a
    dedicated pandas string dtype, so both are accepted here.
    """
    return bool(
        pd.api.types.is_object_dtype(s) or pd.api.types.is_string_dtype(s)
    )


def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with placeholder strings replaced by NaN.

    In text columns, cells that are empty / whitespace-only, or that consist
    of a single hyphen, en dash or em dash, become NaN. Other columns and the
    input frame itself are left untouched.
    """
    cleaned = df.copy()
    for col in cleaned.columns:
        if _is_text_series(cleaned[col]):
            cleaned[col] = cleaned[col].replace(
                {r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True
            )
    return cleaned


def _is_numeric_series(s: pd.Series) -> bool:
    """Return True when pandas reports a numeric dtype for *s*.

    Any error raised by the dtype check is treated as "not numeric".
    """
    try:
        return pd.api.types.is_numeric_dtype(s)
    except Exception:  # best-effort probe: never let a dtype check raise
        return False


def _to_numeric(s: pd.Series) -> pd.Series:
    """Convert *s* with ``pd.to_numeric``; unparseable values become NaN."""
    return pd.to_numeric(s, errors="coerce")


def _fmt_num(x: Any, decimals: int = 1) -> str:
    """Format a scalar for display.

    * ``None``, ``pd.NA`` and NaN (Python or NumPy float) -> ``"n/a"``
    * integers and integral floats -> thousands-separated integer
    * anything else convertible to float -> thousands-separated with
      *decimals* decimal places
    * anything that cannot be formatted -> ``str(x)``
    """
    try:
        if x is None or x is pd.NA:
            return "n/a"
        # np.floating covers float32/float16, which are not `float` subclasses.
        if isinstance(x, (float, np.floating)):
            as_float = float(x)
            if math.isnan(as_float):
                return "n/a"
            if as_float.is_integer():
                return f"{int(as_float):,}"
            return f"{as_float:,.{decimals}f}"
        if isinstance(x, (int, np.integer)):
            # Format the integer directly: a round-trip through float would
            # silently lose precision above 2**53.
            return f"{int(x):,}"
        return f"{float(x):,.{decimals}f}"
    except Exception:  # display helper: fall back to str() rather than raise
        return str(x)


# -------------------- metric & dataset selection (dynamic) --------------------
def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score a column name as a candidate metric.

    Returns ``-10**6`` when the lower-cased name contains any entry of
    ``_BAD_METRIC_NAMES``; otherwise 3 points per hint found in the name.
    Matching is case-insensitive on both sides. Non-string column labels are
    converted with ``str`` and empty hints are ignored.
    """
    # Column labels are not guaranteed to be strings (e.g. integer labels).
    name = "" if col is None else str(col).lower()
    # NOTE(review): this is a plain substring test, so e.g. "provider" is
    # disqualified because it contains "id" -- confirm that is intended.
    if any(bad in name for bad in _BAD_METRIC_NAMES):
        return -10**6  # disqualify obvious counters/ids
    return 3 * sum(1 for h in (hints or []) if h and str(h).lower() in name)


def _choose_df_and_metric(
    datasets: Dict[str, Any], metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """
    Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
    +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.

    Non-DataFrame values and empty frames in *datasets* are skipped. Each
    frame is cleaned with ``_nanlike_to_nan`` first, and a column only counts
    as a candidate when at least one of its values converts to a number.
    Id-like names receive a very large negative score, so they can only win
    when no other candidate exists. Ties keep the first candidate seen.

    Returns ``(dataset_key, cleaned_dataframe, metric_column)``, or ``None``
    when there is no candidate at all.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
    for key, value in (datasets or {}).items():
        if not isinstance(value, pd.DataFrame) or value.empty:
            continue
        df = _nanlike_to_nan(value)
        for col in df.columns:
            numeric = _to_numeric(df[col])
            if not _is_numeric_series(numeric):
                continue
            # Coercion turns a pure-text column into an all-NaN float column,
            # which would otherwise pass the dtype check above.
            if not numeric.notna().any():
                continue
            score = _score_metric_name(col, metric_hints)
            if numeric.nunique(dropna=True) > 1:
                score += 1
            if best is None or score > best[0]:
                best = (score, key, df, col)
    if best is None:
        return None
    _, key, df, metric = best
    return key, df, metric


# -------------------- grouping detection (dynamic) --------------------
def _find_group_col(
    df: pd.DataFrame,
    candidates: List[str],
    avoid: Optional[List[str]] = None,
) -> Optional[str]:
    """Pick a column of *df* to group by.

    First preference is a column whose name contains one of *candidates*
    (case-insensitive, candidates tried in order). Failing that, the first
    text column with more than one distinct value and fewer than
    ``max(50, len(df) // 10)`` distinct values is used. Columns whose name
    contains any entry of *avoid* are never returned, on either path.

    Returns the column label, or ``None`` when nothing qualifies.
    """
    avoided = [str(a).lower() for a in (avoid or [])]
    # Labels may be non-strings; compare on their lower-cased str() form.
    named = [(c, str(c).lower()) for c in df.columns]
    allowed = [
        (c, cname)
        for c, cname in named
        if all(a not in cname for a in avoided)
    ]

    # prefer name matches
    for cand in candidates or []:
        wanted = str(cand).lower()
        for c, cname in allowed:
            if wanted in cname:
                return c

    # fallback: a categorical with reasonable cardinality
    upper = max(50, len(df) // 10)
    for c, _ in allowed:
        if not _is_text_series(df[c]):
            continue
        if 1 < df[c].nunique(dropna=True) < upper:
            return c
    return None


# -------------------- labels & cautions --------------------
def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
    """Describe *x* relative to the baseline *mu*.

    Returns ``"unknown"`` when either value is missing or *mu* is zero.
    Otherwise the relative difference is compared with *band*:
    above ``+band`` -> ``"higher than average"``, below ``-band`` ->
    ``"lower than average"``, else ``"about average"``.
    """
    if pd.isna(x) or pd.isna(mu) or mu == 0:
        return "unknown"
    # Divide by |mu| so the sign reflects the direction of the difference
    # even when the baseline is negative.
    rel = (x - mu) / abs(mu)
    if rel > band:
        return "higher than average"
    if rel < -band:
        return "lower than average"
    return "about average"


def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Return a caution sentence when *n* is below *min_n*, else ``None``."""
    if n >= min_n:
        return None
    noun = _pluralize("record", n)
    return f"Interpret averages cautiously (only {n} {noun})."


def _pluralize(word: str, n: int) -> str:
    """Return *word* unchanged when ``n == 1``, otherwise *word* + ``"s"``."""
    return f"{word}{'' if n == 1 else 's'}"


# -------------------- geo join (Top-5 only) --------------------
# NOTE(review): the visible source is cut off at this point, part-way through
# the signature of `_canon`. The fragment is kept verbatim on the next line
# and was not reconstructed; rejoin it with the remainder of the file.
# def _canon(s: str) -> s_