# File: 666 / W2_assignment_streamlit.py
# Uploaded by huohuobeixiaosile ("Upload 2 files"), commit e3823e4 (verified)
# Question: Does a higher bill amount lead to a lower tip percentage?
import numpy as np
import seaborn as sns
import streamlit as st
import altair as alt
# Configure the browser tab (title and icon) and use the wide page layout.
# The icon must be a real emoji character; the previous value was mis-decoded
# UTF-8 and is not a valid icon.
st.set_page_config(page_title="Tips Explorer: Bill vs Tip %", page_icon="💸", layout="wide")
# 1) Data loading
@st.cache_data
def load_data():
    """Load the seaborn ``tips`` dataset with an added ``tip_pct`` column.

    ``tip_pct`` is the tip expressed as a percentage of the total bill.
    Only the columns the app uses are kept, and rows containing missing
    values in any of them are dropped.

    Returns:
        The prepared DataFrame.
    """
    raw = sns.load_dataset("tips").copy()
    raw["tip_pct"] = raw["tip"] / raw["total_bill"] * 100
    wanted_columns = [
        "total_bill",
        "tip",
        "tip_pct",
        "sex",
        "smoker",
        "day",
        "time",
        "size",
    ]
    return raw[wanted_columns].dropna()


# Module-level dataset that every section below reads from.
tips = load_data()
# 2) Title & problem statement
# Page heading: title, one-line caption, and the user question as a block quote.
# The leading emoji was mis-decoded UTF-8 in the original string; restored here.
st.title("💸 Do Bigger Bills Mean Smaller Tip % ?")
st.caption("Explore whether higher bills are associated with lower tipping percentages.")
st.markdown(
    "> **User question:** Does a higher bill amount lead to a lower tip percentage?"
)
# 3) Sidebar controls (≥ 2)
st.sidebar.header("Filters")

# (a) Bill range.
# The slider bounds are snapped outward to the step grid. Rounding the data
# min/max to one decimal can move a bound inside the data range (e.g. 3.07
# rounds up to 3.1), which would silently exclude the smallest or largest bill
# from the default "full range" selection and leave bounds that are not
# multiples of the step.
bill_step = 0.5
bill_min = float(tips["total_bill"].min())
bill_max = float(tips["total_bill"].max())
slider_min = float(np.floor(bill_min / bill_step) * bill_step)
slider_max = float(np.ceil(bill_max / bill_step) * bill_step)
bill_range = st.sidebar.slider(
    "Total bill range ($)",
    min_value=slider_min,
    max_value=slider_max,
    value=(slider_min, slider_max),
    step=bill_step,
)

# (b) Days of the week (all selected by default).
days = ["Thur", "Fri", "Sat", "Sun"]
day_choice = st.sidebar.multiselect("Day(s) of week", days, default=days)

# (c) Meal time; "All" means no meal-time filtering.
time_choice = st.sidebar.radio("Meal", options=["All", "Lunch", "Dinner"], index=0)

# (d) Optional outlier removal (enabled by default).
clip_outliers = st.sidebar.checkbox("Remove extreme tip % (top/bottom 1%)", value=True)
# 4) Apply filters
# Both ends of the bill range are inclusive.
lo_bill, hi_bill = bill_range
in_bill_range = tips["total_bill"].between(lo_bill, hi_bill)
on_selected_day = tips["day"].isin(day_choice)
df = tips.loc[in_bill_range & on_selected_day].copy()

# "All" keeps both lunch and dinner rows.
if time_choice != "All":
    df = df[df["time"] == time_choice]

# Outlier removal (for more stable KPIs and visualizations): keep only rows
# whose tip % lies within the 1st-99th percentile band. Skipped for selections
# of 10 rows or fewer.
if clip_outliers and len(df) > 10:
    low, high = np.percentile(df["tip_pct"], [1, 99])
    df = df[df["tip_pct"].between(low, high)]
# 5) KPIs (≥ 1)
# Three headline metrics for the current selection: mean tip %, median tip %,
# and the Pearson correlation between bill size and tip %.
col1, col2, col3 = st.columns(3)
if len(df) > 0:
    avg_tip_pct = df["tip_pct"].mean()
    med_tip_pct = df["tip_pct"].median()
    corr = df["total_bill"].corr(df["tip_pct"])  # Pearson correlation
    col1.metric("Average Tip %", f"{avg_tip_pct:.1f}%")
    col2.metric("Median Tip %", f"{med_tip_pct:.1f}%")
    # The correlation is NaN when it cannot be computed (e.g. a single row or
    # no variation); show the placeholder dash instead of "+nan".
    col3.metric("Corr( Bill , Tip % )", "–" if np.isnan(corr) else f"{corr:+.2f}")
else:
    # Empty selection: show placeholder dashes for every metric.
    col1.metric("Average Tip %", "–")
    col2.metric("Median Tip %", "–")
    col3.metric("Corr( Bill , Tip % )", "–")
st.divider()
# 6) Visualization (≥ 1)
# Scatter plot of tip % against bill amount with a linear regression trend line.
st.subheader("Tip Percentage vs. Bill Amount")
if len(df) == 0:
    st.info("No data under current filters. Try expanding the bill range or selecting more days.")
else:
    base = alt.Chart(df).properties(width=800, height=420)
    scatter = (
        base.mark_circle(size=70, opacity=0.65, color="#4C78A8")
        .encode(
            x=alt.X("total_bill:Q", title="Total Bill ($)"),
            y=alt.Y("tip_pct:Q", title="Tip Percentage (%)"),
            tooltip=[
                alt.Tooltip("total_bill:Q", title="Bill ($)", format=".2f"),
                alt.Tooltip("tip_pct:Q", title="Tip %", format=".1f"),
                alt.Tooltip("day:N", title="Day"),
                alt.Tooltip("time:N", title="Meal"),
                alt.Tooltip("size:Q", title="Party Size"),
            ],
        )
    )
    # Altair's built-in regression transform fits the trend line of tip % on
    # total bill.
    reg = (
        base.transform_regression("total_bill", "tip_pct")
        .mark_line(color="#E45756", size=3)
        .encode(x="total_bill:Q", y="tip_pct:Q")
    )
    # Layer both marks on the shared y scale (the default for layered charts).
    # Resolving y as "independent" would give the trend line its own axis, so
    # it would no longer line up with the points it summarizes.
    chart = scatter + reg
    st.altair_chart(chart, use_container_width=True)
# 7) Dynamic insight text
def insight_text(n, r, avg):
    """Build the Markdown insight sentence for the current selection.

    Args:
        n: Number of rows in the selection.
        r: Pearson correlation between total bill and tip percentage. May be
            NaN (or None) when the correlation cannot be computed.
        avg: Mean tip percentage of the selection.

    Returns:
        A Markdown string describing the direction of the association and the
        average tip percentage, or a fixed "no data" message when ``n`` is 0.
    """
    import math

    if n == 0:
        return "No data available under the current filters."

    # An undefined correlation (e.g. one row, or no variation in a column)
    # fails every threshold comparison below and would be printed as "+nan",
    # so report it explicitly instead.
    if r is None or math.isnan(r):
        return (
            f"**Insight:** Based on the current selection (n = {n}), the correlation between "
            "total bill and tip percentage is undefined (too few points or no variation), "
            "so no trend can be stated. "
            f"The average tip percentage in this selection is **{avg:.1f}%**."
        )

    # Turn the correlation (r) into a plain-English explanation:
    #   r >= +0.20 → bigger bills usually mean higher tip percentages
    #   r <= -0.20 → bigger bills usually mean lower tip percentages
    #   otherwise  → little to no linear relationship
    if r <= -0.20:
        trend = "a **negative** association — larger bills tend to have **lower** tip percentages."
    elif r >= 0.20:
        trend = "a **positive** association — larger bills tend to have **higher** tip percentages."
    else:
        trend = "**little to no clear** linear association between bill size and tip percentage."
    return (
        f"**Insight:** Based on the current selection (n = {n}), the correlation between "
        f"total bill and tip percentage is **{r:+.2f}**, suggesting {trend} "
        f"The average tip percentage in this selection is **{avg:.1f}%**."
    )
# Render the insight sentence for the filtered data. An empty selection passes
# neutral 0.0 statistics; insight_text only needs the row count in that case.
selection_size = len(df)
if selection_size == 0:
    selection_corr, selection_avg = 0.0, 0.0
else:
    selection_corr = df["total_bill"].corr(df["tip_pct"])
    selection_avg = df["tip_pct"].mean()
st.markdown(insight_text(selection_size, selection_corr, selection_avg))
# 8) Footnote & performance hint
# Footnote explaining how the statistics are computed. The apostrophe in
# "Pearson’s" was mis-decoded UTF-8 in the original string; restored here.
st.caption(
    "Notes: correlation is computed with Pearson’s r. "
    "Extreme tip % values (top/bottom 1%) can be optionally removed for stability."
)