Spaces:
Sleeping
Sleeping
File size: 4,304 Bytes
42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 5651d3e 42a6bd6 87088be |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
# narrative_safetynet.py
from __future__ import annotations
from typing import Dict, Any, List, Optional, Tuple
import re
import math
import numpy as np
import pandas as pd
# -------------------- helpers: dtype / formatting --------------------
# Minimum group size below which averages get a caution note.
_DEF_MIN_SAMPLE = 5 # generic caution threshold for group sizes
# Substring hints marking a column as a likely wait-time metric; each hint
# found in a column name raises that column's selection score.
_HINT_METRICS_DEFAULT = [
    "surgery_median", "consult_median",
    "surgery_90th", "consult_90th",
    "surgery", "consult",
    "wait", "median", "p90", "90th"
]
# Substring hints for columns that identify a grouping dimension
# (facility/region-style categorical keys).
_HINT_GROUPS_DEFAULT = [
    "facility", "specialty", "zone",
    "hospital", "city", "region"
]
# Name fragments that disqualify a column from metric selection
# (row counters, identifiers, pandas "Unnamed: N" artifacts).
_BAD_METRIC_NAMES = ["index", "id", "row", "unnamed"]
def _nanlike_to_nan(df: pd.DataFrame) -> pd.DataFrame:
dff = df.copy()
for c in dff.columns:
if dff[c].dtype == "object":
dff[c] = dff[c].replace({r"^\s*$": np.nan, r"^[-–—]$": np.nan}, regex=True)
return dff
def _is_numeric_series(s: pd.Series) -> bool:
try:
return pd.api.types.is_numeric_dtype(s)
except Exception:
return False
def _to_numeric(s: pd.Series) -> pd.Series:
return pd.to_numeric(s, errors="coerce")
def _fmt_num(x: Any, decimals: int = 1) -> str:
try:
if x is None or (isinstance(x, float) and math.isnan(x)):
return "n/a"
if isinstance(x, (int, np.integer)) or (isinstance(x, float) and float(x).is_integer()):
return f"{int(round(float(x))):,}"
return f"{float(x):,.{decimals}f}"
except Exception:
return str(x)
# -------------------- metric & dataset selection (dynamic) --------------------
def _score_metric_name(col: str, hints: List[str]) -> int:
    """Score how metric-like a column name looks: +3 per hint substring.

    Names containing id/counter fragments (per _BAD_METRIC_NAMES) are
    hard-disqualified with a large negative score.
    """
    name = (col or "").lower()
    for bad in _BAD_METRIC_NAMES:
        if bad in name:
            return -10**6  # disqualify obvious counters/ids
    return 3 * sum(1 for h in hints if h in name)
def _choose_df_and_metric(
    datasets: Dict[str, Any],
    metric_hints: List[str]
) -> Optional[Tuple[str, pd.DataFrame, str]]:
    """
    Sweep all dataframes & numeric columns. Pick the (df, metric) with best score:
    +3 per hint match; +1 if non-constant numeric. Disqualify id-like names.

    Returns (dataset_key, cleaned_df, metric_column), or None when no
    dataset offers a usable numeric column.

    Fix: pd.to_numeric(errors="coerce") always yields a numeric dtype, so
    the _is_numeric_series check alone never filtered anything — an
    all-text column (coerced to all-NaN) with a hint-matching name could
    be selected as the metric. All-NaN coerced columns are now skipped.
    """
    best: Optional[Tuple[int, str, pd.DataFrame, str]] = None
    for key, v in datasets.items():
        if not isinstance(v, pd.DataFrame) or v.empty:
            continue
        df = _nanlike_to_nan(v)
        for col in df.columns:
            col_num = _to_numeric(df[col])
            if not _is_numeric_series(col_num):
                continue
            # Skip columns with no parseable numeric values at all.
            if not col_num.notna().any():
                continue
            s = _score_metric_name(col, metric_hints)
            if col_num.nunique(dropna=True) > 1:
                s += 1  # reward columns that actually vary
            if best is None or s > best[0]:
                best = (s, key, df, col)
    if best is None:
        return None
    _, key, df, metric = best
    return key, df, metric
# -------------------- grouping detection (dynamic) --------------------
def _find_group_col(df: pd.DataFrame, candidates: List[str], avoid: Optional[List[str]] = None) -> Optional[str]:
avoid = [a.lower() for a in (avoid or [])]
cols = list(df.columns)
# prefer name matches
for cand in candidates:
for c in cols:
cname = c.lower()
if cand.lower() in cname and all(a not in cname for a in avoid):
return c
# fallback: a categorical with reasonable cardinality
obj_cols = [c for c in cols if df[c].dtype == "object"]
for c in obj_cols:
nuniq = df[c].nunique(dropna=True)
if 1 < nuniq < max(50, len(df)//10):
return c
return None
# -------------------- labels & cautions --------------------
def _label_vs_baseline(x: float, mu: float, band: float = 0.05) -> str:
if pd.isna(x) or pd.isna(mu) or mu == 0:
return "unknown"
rel = (x - mu) / mu
if rel > band:
return "higher than average"
if rel < -band:
return "lower than average"
return "about average"
def _small_sample_note(n: int, min_n: int = _DEF_MIN_SAMPLE) -> Optional[str]:
    """Caution sentence for undersized groups, or None when *n* is adequate."""
    if n >= min_n:
        return None
    return f"Interpret averages cautiously (only {n} records)."
def _pluralize(word: str, n: int) -> str:
return f"{word}{'' if n == 1 else 's'}"
# -------------------- geo join (Top-5 only) --------------------
def _canon(s: str) -> s_
|