# Question: Does a higher bill amount lead to a lower tip percentage?

import numpy as np
import seaborn as sns
import streamlit as st
import altair as alt

# Browser-tab title/icon and full-width layout for the whole app.
st.set_page_config(
    page_title="Tips Explorer: Bill vs Tip %",
    page_icon="💸",
    layout="wide",
)

# 1) Data loading 
@st.cache_data
def load_data():
    """Load the seaborn "tips" dataset and add a tip-percentage column.

    Returns:
        A DataFrame restricted to the columns total_bill, tip, tip_pct, sex,
        smoker, day, time and size, with rows containing missing values
        dropped. ``tip_pct`` is ``tip / total_bill * 100``.
    """
    columns = ["total_bill", "tip", "tip_pct", "sex", "smoker", "day", "time", "size"]
    raw = sns.load_dataset("tips")
    # Tip expressed as a percentage of the bill it was left on.
    with_pct = raw.assign(tip_pct=raw["tip"] / raw["total_bill"] * 100)
    return with_pct[columns].dropna()

# Dataset used by every section below (served through the cached loader).
tips = load_data()


# 2) Title & problem statement
st.title("💸 Do Bigger Bills Mean Smaller Tip % ?")
st.caption("Explore whether higher bills are associated with lower tipping percentages.")

# Rendered as a Markdown block quote so the guiding question stands out.
user_question = "> **User question:** Does a higher bill amount lead to a lower tip percentage?"
st.markdown(user_question)


# 3) Sidebar controls (≥ 2)
st.sidebar.header("Filters")

# (a) Bill range
# The slider bounds are snapped outward to the slider step (floor for the
# minimum, ceil for the maximum). Rounding to the nearest tenth instead can
# move a bound inside the data range (a minimum of 3.07 would become 3.1),
# and the default selection would then silently drop the most extreme bills.
BILL_STEP = 0.5
bill_min = float(tips["total_bill"].min())
bill_max = float(tips["total_bill"].max())
slider_min = float(np.floor(bill_min / BILL_STEP) * BILL_STEP)
slider_max = float(np.ceil(bill_max / BILL_STEP) * BILL_STEP)
bill_range = st.sidebar.slider(
    "Total bill range ($)",
    min_value=slider_min,
    max_value=slider_max,
    value=(slider_min, slider_max),  # default: the full range, nothing filtered out
    step=BILL_STEP,
)

# (b) Days of the week; every day is selected by default.
days = ["Thur", "Fri", "Sat", "Sun"]
day_choice = st.sidebar.multiselect("Day(s) of week", days, default=days)

# (c) Meal time; "All" (the default) disables the meal filter.
time_choice = st.sidebar.radio("Meal", options=["All", "Lunch", "Dinner"], index=0)

# (d) Outlier removal; enabled by default.
clip_outliers = st.sidebar.checkbox("Remove extreme tip % (top/bottom 1%)", value=True)


# 4) Apply filters
# Keep rows whose bill lies inside the selected range (inclusive at both
# ends) and whose day is among the selected days.
bill_low, bill_high = bill_range
in_bill_range = tips["total_bill"].between(bill_low, bill_high)
on_selected_day = tips["day"].isin(day_choice)
df = tips[in_bill_range & on_selected_day].copy()

# "All" keeps both meals; any other choice keeps only that meal.
if time_choice != "All":
    df = df[df["time"] == time_choice]

# Outlier removal (for more stable KPIs and visualizations): keep tip % values
# between the 1st and 99th percentile. Skipped for selections of 10 rows or fewer.
if clip_outliers and len(df) > 10:
    low, high = np.percentile(df["tip_pct"], [1, 99])
    df = df[df["tip_pct"].between(low, high)]


# 5) KPIs (≥ 1)
# Three side-by-side KPI tiles: mean tip %, median tip %, Pearson correlation.
col1, col2, col3 = st.columns(3)
if len(df) > 0:
    avg_tip_pct = df["tip_pct"].mean()
    med_tip_pct = df["tip_pct"].median()
    corr = df["total_bill"].corr(df["tip_pct"])  # Pearson correlation

    col1.metric("Average Tip %", f"{avg_tip_pct:.1f}%")
    col2.metric("Median Tip %", f"{med_tip_pct:.1f}%")
    # The correlation is NaN when it is undefined (a single row, or a column
    # with no variation); show the same dash as the empty state, not "+nan".
    col3.metric("Corr( Bill , Tip % )", "–" if np.isnan(corr) else f"{corr:+.2f}")
else:
    # Nothing matches the filters: show placeholders so the layout stays stable.
    col1.metric("Average Tip %", "–")
    col2.metric("Median Tip %", "–")
    col3.metric("Corr( Bill , Tip % )", "–")

st.divider()


# 6) Visualization (≥ 1)
st.subheader("Tip Percentage vs. Bill Amount")

if len(df) == 0:
    st.info("No data under current filters. Try expanding the bill range or selecting more days.")
else:
    base = alt.Chart(df).properties(width=800, height=420)

    # One point per bill; the tooltip exposes the details of each row.
    scatter = (
        base.mark_circle(size=70, opacity=0.65, color="#4C78A8")
        .encode(
            x=alt.X("total_bill:Q", title="Total Bill ($)"),
            y=alt.Y("tip_pct:Q", title="Tip Percentage (%)"),
            tooltip=[
                alt.Tooltip("total_bill:Q", title="Bill ($)", format=".2f"),
                alt.Tooltip("tip_pct:Q", title="Tip %", format=".1f"),
                alt.Tooltip("day:N", title="Day"),
                alt.Tooltip("time:N", title="Meal"),
                alt.Tooltip("size:Q", title="Party Size"),
            ],
        )
    )

    # Trend line fitted by Altair's built-in regression transform
    # (tip_pct regressed on total_bill).
    reg = (
        base.transform_regression("total_bill", "tip_pct")
        .mark_line(color="#E45756", size=3)
        .encode(x="total_bill:Q", y="tip_pct:Q")
    )

    # Layer the two marks on shared axes. The y scale must NOT be resolved as
    # "independent": that gives the regression line its own y-axis, so the
    # line no longer lines up with the points and its slope looks exaggerated.
    chart = scatter + reg

    st.altair_chart(chart, use_container_width=True)


# 7) Dynamic insight text
def insight_text(n, r, avg):
    """Build the Markdown insight sentence for the current selection.

    Args:
        n: Number of rows in the current selection.
        r: Pearson correlation between total bill and tip percentage. May be
            NaN when the correlation is undefined for the selection.
        avg: Mean tip percentage of the selection.

    Returns:
        A Markdown string. For ``n == 0`` a fixed "no data" message; for a NaN
        ``r`` a message stating that the correlation cannot be computed;
        otherwise a sentence classifying the correlation as negative
        (``r <= -0.20``), positive (``r >= 0.20``) or negligible.
    """
    from math import isnan  # local import: keeps the function self-contained

    if n == 0:
        return "No data available under the current filters."

    average_sentence = f"The average tip percentage in this selection is **{avg:.1f}%**."

    # NaN compares False against every threshold, so without this guard it
    # would be reported as "+nan" together with a "little to no association"
    # verdict, which is misleading.
    if isnan(r):
        return (
            f"**Insight:** Based on the current selection (n = {n}), the correlation between "
            f"total bill and tip percentage cannot be computed (too few data points or no "
            f"variation in the selection). {average_sentence}"
        )

    # Turn the correlation (r) into a plain-English explanation:
    #   r <= -0.20        -> bigger bills usually mean lower tip percentages
    #   r >= +0.20        -> bigger bills usually mean higher tip percentages
    #   anything between  -> little to no linear relationship
    if r <= -0.20:
        trend = "a **negative** association — larger bills tend to have **lower** tip percentages."
    elif r >= 0.20:
        trend = "a **positive** association — larger bills tend to have **higher** tip percentages."
    else:
        trend = "**little to no clear** linear association between bill size and tip percentage."

    return (
        f"**Insight:** Based on the current selection (n = {n}), the correlation between "
        f"total bill and tip percentage is **{r:+.2f}**, suggesting {trend} "
        f"{average_sentence}"
    )

# Summary numbers for the filtered rows. An empty selection gets 0.0
# placeholders so that no statistics are computed on an empty frame.
row_count = len(df)
if row_count == 0:
    bill_tip_corr, mean_tip_pct = 0.0, 0.0
else:
    bill_tip_corr = df["total_bill"].corr(df["tip_pct"])
    mean_tip_pct = df["tip_pct"].mean()

st.markdown(insight_text(row_count, bill_tip_corr, mean_tip_pct))


# 8) Footnote
footnote = (
    "Notes: correlation is computed with Pearson’s r. "
    "Extreme tip % values (top/bottom 1%) can be optionally removed for stability."
)
st.caption(footnote)