{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a1",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis: USDA Branded Food Products\n",
    "**Dataset:** `usda_branded_sample_175k.csv` (a 175,000-row random sample drawn from 1,026,891 master records)\n",
    "\n",
    "**Tools:** Python, Pandas, NumPy, Matplotlib, Seaborn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Let wide frames render without eliding columns or wrapping early.\n",
    "pd.set_option(\"display.max_columns\", 200)\n",
    "pd.set_option(\"display.width\", 160)\n",
    "\n",
    "# Notebook-wide figure styling, applied to every plot below.\n",
    "plt.rcParams['font.family'] = 'serif'\n",
    "plt.rcParams['font.size'] = 11\n",
    "plt.rcParams['axes.titlesize'] = 13\n",
    "plt.rcParams['axes.labelsize'] = 12\n",
    "plt.rcParams['figure.dpi'] = 150"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a2",
   "metadata": {},
   "source": [
    "## 1) Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file rather than chunk by chunk.\n",
    "df = pd.read_csv(\"usda_branded_sample_175k.csv\", low_memory=False)\n",
    "\n",
    "# Quick orientation: dimensions, column names, then the first ten rows.\n",
    "print(\"Shape:\", df.shape)\n",
    "print(\"Columns:\", list(df.columns))\n",
    "df.iloc[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3",
   "metadata": {},
   "source": [
    "## 2) Define Nutrient Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-serving nutrient columns analysed throughout the notebook.\n",
    "NUTRIENTS = [\n",
    "    \"calories_per_serving\",\n",
    "    \"protein_g_per_serving\",\n",
    "    \"carbs_g_per_serving\",\n",
    "    \"fat_g_per_serving\",\n",
    "    \"fiber_g_per_serving\",\n",
    "    \"sodium_mg_per_serving\",\n",
    "    \"cholesterol_mg_per_serving\",\n",
    "    \"satfat_g_per_serving\",\n",
    "    \"transfat_g_per_serving\",\n",
    "    \"total_sugar_g_per_serving\",\n",
    "    \"added_sugar_g_per_serving\",\n",
    "]\n",
    "\n",
    "# Fail fast, naming every absent column at once rather than only the first one hit.\n",
    "missing_nutrients = [c for c in NUTRIENTS if c not in df.columns]\n",
    "assert not missing_nutrients, f\"Missing: {missing_nutrients}\"\n",
    "print(\"All nutrient columns present.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a4",
   "metadata": {},
   "source": [
    "## 3) Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "nutrient_values = df[NUTRIENTS]\n",
    "quantile_levels = [0.05, 0.25, 0.5, 0.75, 0.95]\n",
    "\n",
    "# describe() with the 5th/95th percentiles added, transposed so each nutrient is one row.\n",
    "summary = nutrient_values.describe(percentiles=quantile_levels).transpose().round(3)\n",
    "\n",
    "# Skewness is rounded on its own and lines up with the nutrient rows by index label.\n",
    "summary['skewness'] = nutrient_values.skew().round(2)\n",
    "summary"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a5",
   "metadata": {},
   "source": [
    "## 4) Missing Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _nutrient_label(column):\n",
    "    \"\"\"Format a nutrient column name for display, e.g. 'sodium_mg_per_serving' -> 'Sodium (mg)'.\"\"\"\n",
    "    stem = column.replace('_per_serving', '')\n",
    "    # The unit is appended after title-casing so it keeps its lowercase spelling\n",
    "    # (title-casing the whole string would turn 'mg' into 'Mg' and 'g' into 'G').\n",
    "    for suffix, unit in (('_mg', 'mg'), ('_g', 'g')):\n",
    "        if stem.endswith(suffix):\n",
    "            stem = stem[:-len(suffix)]\n",
    "            return f\"{stem.replace('_', ' ').title()} ({unit})\"\n",
    "    return stem.replace('_', ' ').title()\n",
    "\n",
    "\n",
    "# Null count and percentage of rows per nutrient, most incomplete first.\n",
    "miss_count = df[NUTRIENTS].isnull().sum()\n",
    "miss_pct = (miss_count / len(df) * 100).round(2)\n",
    "miss_df = pd.DataFrame({'missing_count': miss_count, 'missing_pct': miss_pct}).sort_values('missing_pct', ascending=False)\n",
    "print(miss_df)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 5.5))\n",
    "# Ascending order so the most incomplete nutrient is drawn as the top bar.\n",
    "miss_sorted = miss_pct.sort_values(ascending=True)\n",
    "# Red highlights nutrients missing in more than 15% of rows.\n",
    "colors = ['#C44E52' if p > 15 else '#4C72B0' for p in miss_sorted.values]\n",
    "ax.barh(range(len(miss_sorted)), miss_sorted.values, color=colors, edgecolor='white')\n",
    "ax.set_yticks(range(len(miss_sorted)))\n",
    "labels = [_nutrient_label(c) for c in miss_sorted.index]\n",
    "ax.set_yticklabels(labels, fontsize=9)\n",
    "ax.set_xlabel('Missing (%)')\n",
    "# Report the row count of the loaded frame rather than a hard-coded sample size.\n",
    "ax.set_title(f'Figure 2. Missing Values by Nutrient Variable (n = {len(df):,})')\n",
    "for i, val in enumerate(miss_sorted.values):\n",
    "    ax.text(val + 0.5, i, f'{val:.1f}%', va='center', fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6",
   "metadata": {},
   "source": [
    "## 5) Distribution of Calories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "cal = df['calories_per_serving'].dropna()\n",
    "# Only the plotted data is trimmed at the 99.5th percentile; the statistics use every value.\n",
    "clip_val = cal.quantile(0.995)\n",
    "cal_clip = cal[cal <= clip_val]\n",
    "cal_median = cal.median()\n",
    "cal_mean = cal.mean()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 4.5))\n",
    "ax.hist(cal_clip, bins=60, color='#4C72B0', edgecolor='white', alpha=0.85)\n",
    "reference_lines = (\n",
    "    (cal_median, 'red', f'Median: {cal_median:.0f} kcal'),\n",
    "    (cal_mean, 'orange', f'Mean: {cal_mean:.0f} kcal'),\n",
    ")\n",
    "for position, line_color, caption in reference_lines:\n",
    "    ax.axvline(position, color=line_color, linestyle='--', linewidth=1.5, label=caption)\n",
    "ax.set(\n",
    "    xlabel='Calories per Serving (kcal)',\n",
    "    ylabel='Frequency',\n",
    "    title='Figure 1. Distribution of Calories per Serving',\n",
    ")\n",
    "ax.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "print(f\"Calorie stats: median={cal_median:.0f}, mean={cal_mean:.1f}, std={cal.std():.1f}, skewness={cal.skew():.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7",
   "metadata": {},
   "source": [
    "## 6) Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "corr = df[NUTRIENTS].corr()\n",
    "\n",
    "# Tick labels, listed in the same order as NUTRIENTS.\n",
    "short_labels = [\n",
    "    'Calories', 'Protein', 'Carbs', 'Fat', 'Fiber', 'Sodium',\n",
    "    'Cholesterol', 'Sat Fat', 'Trans Fat', 'Total Sugar', 'Added Sugar',\n",
    "]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(10, 8.5))\n",
    "# Hide the strict upper triangle (k=1) so each pair shows once and the diagonal stays visible.\n",
    "mask = np.triu(np.ones_like(corr, dtype=bool), k=1)\n",
    "heatmap_style = dict(\n",
    "    annot=True, fmt='.2f', cmap='RdBu_r', center=0,\n",
    "    vmin=-0.3, vmax=1, square=True, linewidths=0.5,\n",
    "    cbar_kws={'shrink': 0.7}, annot_kws={'size': 8},\n",
    ")\n",
    "sns.heatmap(corr, mask=mask, xticklabels=short_labels, yticklabels=short_labels, ax=ax, **heatmap_style)\n",
    "ax.set_title('Figure 3. Correlation Matrix of Nutrient Variables')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Key correlations\n",
    "key_pairs = [\n",
    "    ('Fat vs Sat Fat', 'fat_g_per_serving', 'satfat_g_per_serving'),\n",
    "    ('Calories vs Carbs', 'calories_per_serving', 'carbs_g_per_serving'),\n",
    "    ('Calories vs Fat', 'calories_per_serving', 'fat_g_per_serving'),\n",
    "    ('Total Sugar vs Added Sugar', 'total_sugar_g_per_serving', 'added_sugar_g_per_serving'),\n",
    "    ('Total Sugar vs Carbs', 'total_sugar_g_per_serving', 'carbs_g_per_serving'),\n",
    "    ('Carbs vs Fat', 'carbs_g_per_serving', 'fat_g_per_serving'),\n",
    "]\n",
    "print(\"Key correlations:\")\n",
    "for pair_name, first_col, second_col in key_pairs:\n",
    "    print(f\"  {pair_name}: {corr.loc[first_col, second_col]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8",
   "metadata": {},
   "source": [
    "## 7) Food Category Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "for noun, column in (('categories', 'branded_food_category'), ('brands', 'company_brand')):\n",
    "    print(f\"Number of unique {noun}: {df[column].nunique()}\")\n",
    "\n",
    "# Top 15 categories by count\n",
    "top_cats = df['branded_food_category'].value_counts().head(15)\n",
    "# barh stacks bars from the bottom up, so reverse to put the largest category on top.\n",
    "cats_bottom_up = top_cats.iloc[::-1]\n",
    "bar_rows = range(len(cats_bottom_up))\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 5.5))\n",
    "ax.barh(bar_rows, cats_bottom_up.values, color='#4C72B0', edgecolor='white')\n",
    "ax.set_yticks(bar_rows)\n",
    "ax.set_yticklabels(cats_bottom_up.index, fontsize=9)\n",
    "ax.set_xlabel('Number of Products')\n",
    "ax.set_title('Figure 4. Top 15 Food Categories by Product Count')\n",
    "# Write each count just past the end of its bar.\n",
    "for row, n_products in zip(bar_rows, cats_bottom_up.values):\n",
    "    ax.text(n_products + 50, row, f'{n_products:,}', va='center', fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9",
   "metadata": {},
   "source": [
    "## 8) Nutrient Boxplots by Category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "top_8_cats = df['branded_food_category'].value_counts().head(8).index.tolist()\n",
    "df_top = df[df['branded_food_category'].isin(top_8_cats)].copy()\n",
    "\n",
    "# One panel per nutrient: (column, axis label, upper cutoff applied before plotting).\n",
    "box_panels = [\n",
    "    ('calories_per_serving', 'Calories (kcal)', 1000),\n",
    "    ('fat_g_per_serving', 'Fat (g)', 50),\n",
    "    ('sodium_mg_per_serving', 'Sodium (mg)', 2000),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 5.5))\n",
    "for panel_ax, (col, label, clip_hi) in zip(axes, box_panels):\n",
    "    data_list, labels_list = [], []\n",
    "    for cat in top_8_cats:\n",
    "        vals = df_top.loc[df_top['branded_food_category'] == cat, col].dropna()\n",
    "        vals = vals[vals <= clip_hi]\n",
    "        if vals.empty:\n",
    "            continue  # skip both the box and its label so the two lists stay aligned\n",
    "        data_list.append(vals.values)\n",
    "        # Compact the category name and cap it at 22 characters for the tick label.\n",
    "        short = cat.replace(' & ', '/').replace(', ', ',')\n",
    "        if len(short) > 22:\n",
    "            short = short[:19] + '...'\n",
    "        labels_list.append(short)\n",
    "    # Vertical boxes are boxplot's default, so the explicit `vert=True` flag is omitted;\n",
    "    # recent Matplotlib releases are phasing that parameter out in favour of `orientation`.\n",
    "    bp = panel_ax.boxplot(data_list, patch_artist=True, showfliers=False)\n",
    "    for patch in bp['boxes']:\n",
    "        patch.set_facecolor('#4C72B0')\n",
    "        patch.set_alpha(0.6)\n",
    "    panel_ax.set_xticklabels(labels_list, rotation=50, ha='right', fontsize=7)\n",
    "    panel_ax.set_ylabel(label)\n",
    "    panel_ax.set_title(label)\n",
    "fig.suptitle('Figure 5. Nutrient Distribution Across Top 8 Food Categories', y=1.02, fontsize=13)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a10",
   "metadata": {},
   "source": [
    "## 9) Multivariate Scatter: Calories vs Fat vs Carbs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b10",
   "metadata": {},
   "outputs": [],
   "source": [
    "scatter_cols = ['calories_per_serving', 'fat_g_per_serving', 'carbs_g_per_serving']\n",
    "df_plot = df.dropna(subset=scatter_cols)\n",
    "# Keep servings with at most 1200 kcal and 80 g fat so extreme rows do not stretch the axes.\n",
    "within_range = df_plot['calories_per_serving'].le(1200) & df_plot['fat_g_per_serving'].le(80)\n",
    "df_plot = df_plot[within_range]\n",
    "# Thin to 8,000 points with a fixed seed so the figure is reproducible.\n",
    "if len(df_plot) > 8000:\n",
    "    df_plot = df_plot.sample(8000, random_state=42)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 5.5))\n",
    "sc = ax.scatter(\n",
    "    df_plot['fat_g_per_serving'],\n",
    "    df_plot['calories_per_serving'],\n",
    "    c=df_plot['carbs_g_per_serving'],\n",
    "    cmap='viridis',\n",
    "    alpha=0.35,\n",
    "    s=12,\n",
    "    edgecolors='none',\n",
    ")\n",
    "cbar = plt.colorbar(sc, ax=ax)\n",
    "cbar.set_label('Carbs (g)')\n",
    "ax.set(\n",
    "    xlabel='Fat (g per Serving)',\n",
    "    ylabel='Calories per Serving (kcal)',\n",
    "    title='Figure 6. Calories vs. Fat, Colored by Carbohydrate Content',\n",
    ")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a11",
   "metadata": {},
   "source": [
    "## 10) Macronutrient Caloric Contribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b11",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the frame in one chain: drop incomplete rows, keep positive-calorie servings,\n",
    "# then derive each macronutrient's share of calories (4 kcal/g carbs and protein, 9 kcal/g fat).\n",
    "# assign() returns a new frame, so no columns are written onto a filtered slice\n",
    "# (which is what triggers SettingWithCopyWarning).\n",
    "df_m = (\n",
    "    df.dropna(subset=['calories_per_serving', 'protein_g_per_serving', 'carbs_g_per_serving', 'fat_g_per_serving'])\n",
    "      .loc[lambda d: d['calories_per_serving'] > 0]\n",
    "      .assign(\n",
    "          carb_pct=lambda d: d['carbs_g_per_serving'] * 4 / d['calories_per_serving'] * 100,\n",
    "          fat_pct=lambda d: d['fat_g_per_serving'] * 9 / d['calories_per_serving'] * 100,\n",
    "          prot_pct=lambda d: d['protein_g_per_serving'] * 4 / d['calories_per_serving'] * 100,\n",
    "      )\n",
    ")\n",
    "\n",
    "# One shared set of 50 bin edges over the clipped range, so the three overlaid\n",
    "# histograms use identical bin widths and their bar heights are directly comparable.\n",
    "pct_bins = np.linspace(0, 150, 51)\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 5))\n",
    "for pct_col, legend_name, fill in (\n",
    "    ('carb_pct', 'Carbs', '#4C72B0'),\n",
    "    ('fat_pct', 'Fat', '#C44E52'),\n",
    "    ('prot_pct', 'Protein', '#55A868'),\n",
    "):\n",
    "    ax.hist(df_m[pct_col].clip(0, 150), bins=pct_bins, alpha=0.6, label=legend_name, color=fill)\n",
    "ax.set_xlabel('Percentage of Calories (%)')\n",
    "ax.set_ylabel('Frequency')\n",
    "ax.set_title('Figure 7. Macronutrient Contribution to Total Calories')\n",
    "ax.legend()\n",
    "# Values clipped to 150 pile up in the last bin, which lies beyond this x-limit.\n",
    "ax.set_xlim(0, 120)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Summary statistics use the unclipped percentages.\n",
    "print(f\"Protein: mean={df_m['prot_pct'].mean():.1f}%, median={df_m['prot_pct'].median():.1f}%\")\n",
    "print(f\"Carbs:   mean={df_m['carb_pct'].mean():.1f}%, median={df_m['carb_pct'].median():.1f}%\")\n",
    "print(f\"Fat:     mean={df_m['fat_pct'].mean():.1f}%, median={df_m['fat_pct'].median():.1f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a12",
   "metadata": {},
   "source": [
    "## 11) Sodium Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-serving sodium level the figures label as 'FDA High Sodium'; named once so the\n",
    "# reference lines, bar colours and the printed share cannot drift apart.\n",
    "HIGH_SODIUM_MG = 460\n",
    "high_sodium_label = f'FDA High Sodium ({HIGH_SODIUM_MG} mg)'\n",
    "\n",
    "sod = df['sodium_mg_per_serving'].dropna()\n",
    "# Only the histogram is trimmed at the 99th percentile; the median uses every value.\n",
    "sod_clip = sod[sod <= sod.quantile(0.99)]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 4.5))\n",
    "ax.hist(sod_clip, bins=60, color='#4C72B0', edgecolor='white', alpha=0.85)\n",
    "ax.axvline(HIGH_SODIUM_MG, color='red', linestyle='--', linewidth=2, label=high_sodium_label)\n",
    "ax.axvline(sod.median(), color='orange', linestyle='--', linewidth=1.5, label=f'Median: {sod.median():.0f} mg')\n",
    "ax.set_xlabel('Sodium per Serving (mg)')\n",
    "ax.set_ylabel('Frequency')\n",
    "ax.set_title('Figure 8. Distribution of Sodium per Serving')\n",
    "ax.legend(fontsize=9)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Use the NaN-free series: comparing the raw column would count rows with missing\n",
    "# sodium as 'not high' and understate the share.\n",
    "pct_high = (sod >= HIGH_SODIUM_MG).mean()\n",
    "print(f\"Products at or above {HIGH_SODIUM_MG}mg sodium: {pct_high:.1%}\")\n",
    "\n",
    "# Top sodium categories ('count' is the number of non-null sodium values per category)\n",
    "cat_sod = df.groupby('branded_food_category')['sodium_mg_per_serving'].agg(['median','count'])\n",
    "cat_sod = cat_sod[cat_sod['count'] >= 50].sort_values('median', ascending=False)\n",
    "print(\"\\nTop 12 categories by median sodium:\")\n",
    "print(cat_sod.head(12))\n",
    "\n",
    "# Bar chart (re-sorted ascending so the highest median is the top bar)\n",
    "cat_sod_top = cat_sod.head(12).sort_values('median', ascending=True)\n",
    "fig, ax = plt.subplots(figsize=(9, 5.5))\n",
    "colors = ['#C44E52' if m >= HIGH_SODIUM_MG else '#4C72B0' for m in cat_sod_top['median'].values]\n",
    "ax.barh(range(len(cat_sod_top)), cat_sod_top['median'].values, color=colors, edgecolor='white')\n",
    "ax.set_yticks(range(len(cat_sod_top)))\n",
    "ax.set_yticklabels([c[:40] for c in cat_sod_top.index], fontsize=8)\n",
    "ax.axvline(HIGH_SODIUM_MG, color='red', linestyle='--', linewidth=1.5, label=high_sodium_label)\n",
    "ax.set_xlabel('Median Sodium per Serving (mg)')\n",
    "ax.set_title('Figure 10. Top 12 Categories by Median Sodium Content')\n",
    "ax.legend(fontsize=9)\n",
    "for i, v in enumerate(cat_sod_top['median'].values):\n",
    "    ax.text(v + 50, i, f'{v:.0f} mg', va='center', fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a13",
   "metadata": {},
   "source": [
    "## 12) Sugar Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b13",
   "metadata": {},
   "outputs": [],
   "source": [
    "sug = df['total_sugar_g_per_serving'].dropna()\n",
    "# The histogram is trimmed at the 99th percentile; the reference lines use the untrimmed series.\n",
    "sug_clip = sug[sug <= sug.quantile(0.99)]\n",
    "sug_median, sug_mean = sug.median(), sug.mean()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 4.5))\n",
    "ax.hist(sug_clip, bins=60, color='#DD8452', edgecolor='white', alpha=0.85)\n",
    "ax.axvline(sug_median, color='red', linestyle='--', linewidth=1.5, label=f'Median: {sug_median:.1f}g')\n",
    "ax.axvline(sug_mean, color='orange', linestyle='--', linewidth=1.5, label=f'Mean: {sug_mean:.1f}g')\n",
    "ax.set(\n",
    "    xlabel='Total Sugar per Serving (g)',\n",
    "    ylabel='Frequency',\n",
    "    title='Figure 11. Distribution of Total Sugar per Serving',\n",
    ")\n",
    "ax.legend()\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# median()/mean() skip missing values, so the added-sugar column is used as-is.\n",
    "added = df['added_sugar_g_per_serving']\n",
    "print(f\"Total sugar: median={sug_median:.1f}g, mean={sug_mean:.1f}g\")\n",
    "print(f\"Added sugar: median={added.median():.1f}g, mean={added.mean():.1f}g\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b14",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sugar vs total sugar\n",
    "sugar_pair = ['total_sugar_g_per_serving', 'added_sugar_g_per_serving']\n",
    "df_sug = df.dropna(subset=sugar_pair)\n",
    "# Keep rows where both sugar values are at most 100 g.\n",
    "df_sug = df_sug[(df_sug[sugar_pair] <= 100).all(axis=1)]\n",
    "# Plot a fixed-seed subsample of 5,000 points; the correlation below uses all of df_sug.\n",
    "df_sug_plot = df_sug.sample(5000, random_state=42) if len(df_sug) > 5000 else df_sug\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(7, 5.5))\n",
    "ax.scatter(\n",
    "    df_sug_plot['total_sugar_g_per_serving'],\n",
    "    df_sug_plot['added_sugar_g_per_serving'],\n",
    "    alpha=0.25,\n",
    "    s=10,\n",
    "    color='#DD8452',\n",
    "    edgecolors='none',\n",
    ")\n",
    "diagonal = [0, 100]\n",
    "ax.plot(diagonal, diagonal, 'r--', linewidth=1, alpha=0.5, label='y = x (all sugar is added)')\n",
    "ax.set(\n",
    "    xlabel='Total Sugar per Serving (g)',\n",
    "    ylabel='Added Sugar per Serving (g)',\n",
    "    title='Figure 12. Added Sugar vs. Total Sugar per Serving',\n",
    ")\n",
    "ax.legend(fontsize=9)\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "sugar_corr = df_sug['total_sugar_g_per_serving'].corr(df_sug['added_sugar_g_per_serving'])\n",
    "print(f\"Correlation (total vs added sugar): {sugar_corr:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Top sugar categories\n",
    "sugar_by_cat = df.groupby('branded_food_category')['total_sugar_g_per_serving'].agg(['median', 'count'])\n",
    "# Require at least 100 non-null sugar values, then keep the 12 highest medians\n",
    "# (left in ascending order so the largest ends up as the top bar).\n",
    "well_sampled = sugar_by_cat[sugar_by_cat['count'] >= 100]\n",
    "cat_sug = well_sampled.sort_values('median', ascending=True).tail(12)\n",
    "sugar_medians = cat_sug['median'].values\n",
    "bar_rows = range(len(cat_sug))\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(9, 5.5))\n",
    "ax.barh(bar_rows, sugar_medians, color='#DD8452', edgecolor='white')\n",
    "ax.set_yticks(bar_rows)\n",
    "# Category names are cut to 40 characters for the tick labels.\n",
    "ax.set_yticklabels([name[:40] for name in cat_sug.index], fontsize=8)\n",
    "ax.set_xlabel('Median Total Sugar per Serving (g)')\n",
    "ax.set_title('Figure 13. Top 12 Categories by Median Sugar Content')\n",
    "for row, med in zip(bar_rows, sugar_medians):\n",
    "    ax.text(med + 0.5, row, f'{med:.1f}g', va='center', fontsize=8)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a14",
   "metadata": {},
   "source": [
    "## 13) Trans Fat Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b16",
   "metadata": {},
   "outputs": [],
   "source": [
    "tf = df['transfat_g_per_serving'].dropna()\n",
    "# Boolean flag per product with a reported, nonzero trans fat value.\n",
    "has_transfat = tf > 0\n",
    "print(f\"Trans fat: median={tf.median()}, mean={tf.mean():.3f}\")\n",
    "print(f\"Products with trans fat > 0: {has_transfat.sum():,} / {len(tf):,} = {has_transfat.mean():.1%}\")\n",
    "print(f\"Max trans fat: {tf.max():.1f}g\")\n",
    "\n",
    "# Of products with nonzero trans fat, what categories?\n",
    "df_tf = df.loc[df['transfat_g_per_serving'] > 0]\n",
    "print(\"\\nTop categories among products with trans fat > 0:\")\n",
    "print(df_tf['branded_food_category'].value_counts().head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a15",
   "metadata": {},
   "source": [
    "## 14) Serving Size Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map the unit codes onto 'g' / 'ml'; any other value passes through unchanged.\n",
    "unit_aliases = {'GRM': 'g', 'MLT': 'ml', 'GM': 'g'}\n",
    "df_ss = df.copy()\n",
    "df_ss['unit_clean'] = df_ss['serving_size_unit'].replace(unit_aliases)\n",
    "\n",
    "print(\"Serving size unit distribution:\")\n",
    "print(df_ss['unit_clean'].value_counts())\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(10, 4))\n",
    "df_g = df_ss[df_ss['unit_clean'] == 'g']\n",
    "df_ml = df_ss[df_ss['unit_clean'] == 'ml']\n",
    "\n",
    "# (axes, rows, clip ceiling, bin count, bar colour, panel title, unit text) per panel.\n",
    "size_panels = [\n",
    "    (axes[0], df_g, 400, 50, '#4C72B0', 'Solid Products (grams)', 'g'),\n",
    "    (axes[1], df_ml, 600, 40, '#C44E52', 'Liquid Products (mL)', 'mL'),\n",
    "]\n",
    "for panel_ax, subset, ceiling, n_bins, fill, heading, unit in size_panels:\n",
    "    sizes = subset['serving_size']\n",
    "    # Clipping folds oversized servings into the last bin; the median uses the raw values.\n",
    "    panel_ax.hist(sizes.clip(0, ceiling), bins=n_bins, color=fill, edgecolor='white', alpha=0.85)\n",
    "    panel_ax.set_title(heading)\n",
    "    panel_ax.set_xlabel(f'Serving Size ({unit})')\n",
    "    panel_ax.set_ylabel('Frequency')\n",
    "    panel_ax.axvline(sizes.median(), color='red', linestyle='--', label=f\"Median: {sizes.median():.0f}{unit}\")\n",
    "    panel_ax.legend(fontsize=9)\n",
    "\n",
    "fig.suptitle('Figure 9. Serving Size Distribution by Measurement Unit', y=1.02)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a16",
   "metadata": {},
   "source": [
    "## 15) Outlier Identification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b18",
   "metadata": {},
   "outputs": [],
   "source": [
    "outlier_cols = ('calories_per_serving', 'sodium_mg_per_serving', 'cholesterol_mg_per_serving')\n",
    "\n",
    "print(\"Top 5 extreme values by variable:\\n\")\n",
    "for col in outlier_cols:\n",
    "    # Five largest rows for this nutrient, shown with product name and category for context.\n",
    "    extremes = df.nlargest(5, col)\n",
    "    print(f\"=== {col} ===\")\n",
    "    print(extremes[['product', 'branded_food_category', col]].to_string())\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}