|
|
|
|
|
""" |
|
|
============================================================================= |
|
|
COMPREHENSIVE ACTIVATION FUNCTION TUTORIAL |
|
|
============================================================================= |
|
|
|
|
|
This script provides both THEORETICAL explanations and EMPIRICAL experiments |
|
|
to understand how different activation functions affect: |
|
|
|
|
|
1. GRADIENT FLOW: Do gradients vanish or explode? |
|
|
2. SPARSITY & DEAD NEURONS: How easily do units turn on/off? |
|
|
3. STABILITY: How robust is training under big learning rates / deep stacks? |
|
|
4. REPRESENTATIONAL CAPACITY: How well can the model represent functions? |
|
|
|
|
|
Activation Functions Studied: |
|
|
- Linear (Identity) |
|
|
- Sigmoid |
|
|
- Tanh |
|
|
- ReLU |
|
|
- Leaky ReLU |
|
|
- ELU |
|
|
- GELU |
|
|
- Swish/SiLU |
|
|
|
|
|
Author: Orchestra Research Assistant |
|
|
Date: 2024 |
|
|
============================================================================= |
|
|
""" |
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
import torch.nn.functional as F |
|
|
import numpy as np |
|
|
import matplotlib.pyplot as plt |
|
|
import matplotlib.gridspec as gridspec |
|
|
from collections import defaultdict |
|
|
import json |
|
|
import os |
|
|
import warnings |
|
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
|
|
|
torch.manual_seed(42) |
|
|
np.random.seed(42) |
|
|
|
|
|
|
|
|
os.makedirs('activation_functions', exist_ok=True) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
THEORETICAL_BACKGROUND = """ |
|
|
============================================================================= |
|
|
THEORETICAL BACKGROUND: ACTIVATION FUNCTIONS |
|
|
============================================================================= |
|
|
|
|
|
1. WHY DO WE NEED ACTIVATION FUNCTIONS? |
|
|
--------------------------------------- |
|
|
Without non-linear activations, a neural network of any depth is equivalent |
|
|
to a single linear transformation: |
|
|
|
|
|
f(x) = W_n @ W_{n-1} @ ... @ W_1 @ x = W_combined @ x |
|
|
|
|
|
Non-linear activations allow networks to approximate any continuous function |
|
|
(Universal Approximation Theorem). |
|
|
|
|
|
|
|
|
2. GRADIENT FLOW THEORY |
|
|
----------------------- |
|
|
During backpropagation, gradients flow through the chain rule: |
|
|
|
|
|
βL/βW_i = βL/βa_n Γ βa_n/βa_{n-1} Γ ... Γ βa_{i+1}/βa_i Γ βa_i/βW_i |
|
|
|
|
|
Each layer contributes a factor of Ο'(z) Γ W, where Ο' is the activation derivative. |
|
|
|
|
|
VANISHING GRADIENTS occur when |Ο'(z)| < 1 repeatedly: |
|
|
- Sigmoid: Ο'(z) β (0, 0.25], maximum at z=0 |
|
|
- Tanh: Ο'(z) β (0, 1], maximum at z=0 |
|
|
- For deep networks: gradient β (0.25)^n β 0 as n β β |
|
|
|
|
|
EXPLODING GRADIENTS occur when |Ο'(z) Γ W| > 1 repeatedly: |
|
|
- More common with ReLU (gradient = 1 for z > 0) |
|
|
- Mitigated by proper initialization and gradient clipping |
|
|
|
|
|
|
|
|
3. ACTIVATION FUNCTION PROPERTIES |
|
|
--------------------------------- |
|
|
|
|
|
| Function | Range | Ο'(z) Range | Zero-Centered | Saturates | |
|
|
|-------------|-------------|-------------|---------------|-----------| |
|
|
| Linear | (-β, β) | 1 | Yes | No | |
|
|
| Sigmoid | (0, 1) | (0, 0.25] | No | Yes | |
|
|
| Tanh | (-1, 1) | (0, 1] | Yes | Yes | |
|
|
| ReLU | [0, β) | {0, 1} | No | Half | |
|
|
| Leaky ReLU | (-β, β) | {Ξ±, 1} | No | No | |
|
|
| ELU | (-Ξ±, β) | (0, 1] | ~Yes | Half | |
|
|
| GELU | (-0.17, β) | smooth | No | Soft | |
|
|
| Swish | (-0.28, β) | smooth | No | Soft | |
|
|
|
|
|
|
|
|
4. DEAD NEURON PROBLEM |
|
|
---------------------- |
|
|
ReLU neurons can "die" when they always output 0: |
|
|
- If z < 0 for all inputs, gradient = 0, weights never update |
|
|
- Caused by: large learning rates, bad initialization, unlucky gradients |
|
|
- Solutions: Leaky ReLU, ELU, careful initialization |
|
|
|
|
|
|
|
|
5. REPRESENTATIONAL CAPACITY |
|
|
---------------------------- |
|
|
Different activations have different "expressiveness": |
|
|
- Smooth activations (GELU, Swish) β smoother decision boundaries |
|
|
- Piecewise linear (ReLU) β piecewise linear boundaries |
|
|
- Bounded activations (Sigmoid, Tanh) β can struggle with unbounded targets |
|
|
""" |
|
|
|
|
|
print(THEORETICAL_BACKGROUND) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ActivationFunctions: |
|
|
"""Collection of activation functions with their derivatives.""" |
|
|
|
|
|
@staticmethod |
|
|
def get_all(): |
|
|
"""Return dict of activation name -> (function, derivative, nn.Module)""" |
|
|
return { |
|
|
'Linear': ( |
|
|
lambda x: x, |
|
|
lambda x: torch.ones_like(x), |
|
|
nn.Identity() |
|
|
), |
|
|
'Sigmoid': ( |
|
|
torch.sigmoid, |
|
|
lambda x: torch.sigmoid(x) * (1 - torch.sigmoid(x)), |
|
|
nn.Sigmoid() |
|
|
), |
|
|
'Tanh': ( |
|
|
torch.tanh, |
|
|
lambda x: 1 - torch.tanh(x)**2, |
|
|
nn.Tanh() |
|
|
), |
|
|
'ReLU': ( |
|
|
F.relu, |
|
|
lambda x: (x > 0).float(), |
|
|
nn.ReLU() |
|
|
), |
|
|
'LeakyReLU': ( |
|
|
lambda x: F.leaky_relu(x, 0.01), |
|
|
lambda x: torch.where(x > 0, torch.ones_like(x), 0.01 * torch.ones_like(x)), |
|
|
nn.LeakyReLU(0.01) |
|
|
), |
|
|
'ELU': ( |
|
|
F.elu, |
|
|
lambda x: torch.where(x > 0, torch.ones_like(x), F.elu(x) + 1), |
|
|
nn.ELU() |
|
|
), |
|
|
'GELU': ( |
|
|
F.gelu, |
|
|
lambda x: _gelu_derivative(x), |
|
|
nn.GELU() |
|
|
), |
|
|
'Swish': ( |
|
|
F.silu, |
|
|
lambda x: torch.sigmoid(x) + x * torch.sigmoid(x) * (1 - torch.sigmoid(x)), |
|
|
nn.SiLU() |
|
|
), |
|
|
} |
|
|
|
|
|
def _gelu_derivative(x): |
|
|
"""Approximate GELU derivative.""" |
|
|
cdf = 0.5 * (1 + torch.erf(x / np.sqrt(2))) |
|
|
pdf = torch.exp(-0.5 * x**2) / np.sqrt(2 * np.pi) |
|
|
return cdf + x * pdf |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def experiment_1_gradient_flow(): |
|
|
""" |
|
|
EXPERIMENT 1: How do gradients flow through deep networks? |
|
|
|
|
|
Theory: |
|
|
- Sigmoid/Tanh: Ο'(z) β€ 0.25/1.0, gradients shrink exponentially |
|
|
- ReLU: Ο'(z) β {0, 1}, gradients preserved but can die |
|
|
- Modern activations: designed to maintain gradient flow |
|
|
|
|
|
We measure: |
|
|
- Gradient magnitude at each layer during forward/backward pass |
|
|
- How gradients change with network depth |
|
|
""" |
|
|
print("\n" + "="*80) |
|
|
print("EXPERIMENT 1: GRADIENT FLOW ANALYSIS") |
|
|
print("="*80) |
|
|
|
|
|
activations = ActivationFunctions.get_all() |
|
|
depths = [5, 10, 20, 50] |
|
|
width = 64 |
|
|
|
|
|
results = {name: {} for name in activations} |
|
|
|
|
|
for depth in depths: |
|
|
print(f"\n--- Testing depth = {depth} ---") |
|
|
|
|
|
for name, (func, deriv, module) in activations.items(): |
|
|
|
|
|
layers = [] |
|
|
for i in range(depth): |
|
|
layers.append(nn.Linear(width if i > 0 else 1, width)) |
|
|
layers.append(module if isinstance(module, nn.Identity) else type(module)()) |
|
|
layers.append(nn.Linear(width, 1)) |
|
|
|
|
|
model = nn.Sequential(*layers) |
|
|
|
|
|
|
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear): |
|
|
nn.init.xavier_uniform_(m.weight) |
|
|
nn.init.zeros_(m.bias) |
|
|
|
|
|
|
|
|
x = torch.randn(32, 1, requires_grad=True) |
|
|
y = model(x) |
|
|
loss = y.mean() |
|
|
loss.backward() |
|
|
|
|
|
|
|
|
grad_mags = [] |
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear) and m.weight.grad is not None: |
|
|
grad_mags.append(m.weight.grad.abs().mean().item()) |
|
|
|
|
|
results[name][depth] = { |
|
|
'grad_magnitudes': grad_mags, |
|
|
'grad_ratio': grad_mags[-1] / (grad_mags[0] + 1e-10) if grad_mags[0] > 1e-10 else float('inf'), |
|
|
'min_grad': min(grad_mags), |
|
|
'max_grad': max(grad_mags), |
|
|
} |
|
|
|
|
|
print(f" {name:12s}: grad_ratio={results[name][depth]['grad_ratio']:.2e}, " |
|
|
f"min={results[name][depth]['min_grad']:.2e}, max={results[name][depth]['max_grad']:.2e}") |
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(2, 2, figsize=(14, 10)) |
|
|
colors = plt.cm.tab10(np.linspace(0, 1, len(activations))) |
|
|
|
|
|
for idx, depth in enumerate(depths): |
|
|
ax = axes[idx // 2, idx % 2] |
|
|
for (name, data), color in zip(results.items(), colors): |
|
|
grads = data[depth]['grad_magnitudes'] |
|
|
ax.semilogy(range(1, len(grads)+1), grads, 'o-', label=name, color=color, markersize=4) |
|
|
|
|
|
ax.set_xlabel('Layer (from input to output)') |
|
|
ax.set_ylabel('Gradient Magnitude (log scale)') |
|
|
ax.set_title(f'Gradient Flow: Depth = {depth}') |
|
|
ax.legend(loc='best', fontsize=8) |
|
|
ax.grid(True, alpha=0.3) |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp1_gradient_flow.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
print("\nβ Saved: exp1_gradient_flow.png") |
|
|
|
|
|
|
|
|
with open('activation_functions/exp1_gradient_flow.json', 'w') as f: |
|
|
json.dump({k: {str(d): v for d, v in data.items()} for k, data in results.items()}, f, indent=2) |
|
|
|
|
|
return results |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def experiment_2_sparsity_dead_neurons(): |
|
|
""" |
|
|
EXPERIMENT 2: How do activation functions affect sparsity and dead neurons? |
|
|
|
|
|
Theory: |
|
|
- ReLU creates sparse activations (many zeros) - good for efficiency |
|
|
- But neurons can "die" (always output 0) - bad for learning |
|
|
- Leaky ReLU/ELU prevent dead neurons with small negative slope |
|
|
- Sigmoid/Tanh rarely have exactly zero activations |
|
|
|
|
|
We measure: |
|
|
- Activation sparsity (% of zeros or near-zeros) |
|
|
- Dead neuron rate (neurons that never activate across dataset) |
|
|
- Activation distribution statistics |
|
|
""" |
|
|
print("\n" + "="*80) |
|
|
print("EXPERIMENT 2: SPARSITY AND DEAD NEURONS") |
|
|
print("="*80) |
|
|
|
|
|
activations = ActivationFunctions.get_all() |
|
|
|
|
|
|
|
|
depth = 10 |
|
|
width = 128 |
|
|
n_samples = 1000 |
|
|
|
|
|
|
|
|
x_data = torch.randn(n_samples, 10) |
|
|
y_data = torch.sin(x_data.sum(dim=1, keepdim=True)) + 0.1 * torch.randn(n_samples, 1) |
|
|
|
|
|
results = {} |
|
|
activation_distributions = {} |
|
|
|
|
|
for name, (func, deriv, module) in activations.items(): |
|
|
print(f"\n--- Testing {name} ---") |
|
|
|
|
|
|
|
|
class NetworkWithHooks(nn.Module): |
|
|
def __init__(self): |
|
|
super().__init__() |
|
|
self.layers = nn.ModuleList() |
|
|
self.activations_list = nn.ModuleList() |
|
|
|
|
|
for i in range(depth): |
|
|
self.layers.append(nn.Linear(width if i > 0 else 10, width)) |
|
|
self.activations_list.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity()) |
|
|
self.layers.append(nn.Linear(width, 1)) |
|
|
|
|
|
self.activation_values = [] |
|
|
|
|
|
def forward(self, x): |
|
|
self.activation_values = [] |
|
|
for i, (layer, act) in enumerate(zip(self.layers[:-1], self.activations_list)): |
|
|
x = act(layer(x)) |
|
|
self.activation_values.append(x.detach().clone()) |
|
|
return self.layers[-1](x) |
|
|
|
|
|
model = NetworkWithHooks() |
|
|
|
|
|
|
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear): |
|
|
nn.init.xavier_uniform_(m.weight) |
|
|
nn.init.zeros_(m.bias) |
|
|
|
|
|
|
|
|
optimizer = torch.optim.SGD(model.parameters(), lr=0.1) |
|
|
|
|
|
for epoch in range(100): |
|
|
optimizer.zero_grad() |
|
|
pred = model(x_data) |
|
|
loss = F.mse_loss(pred, y_data) |
|
|
loss.backward() |
|
|
optimizer.step() |
|
|
|
|
|
|
|
|
model.eval() |
|
|
with torch.no_grad(): |
|
|
_ = model(x_data) |
|
|
|
|
|
layer_sparsity = [] |
|
|
layer_dead_neurons = [] |
|
|
all_activations = [] |
|
|
|
|
|
for layer_idx, acts in enumerate(model.activation_values): |
|
|
|
|
|
sparsity = (acts.abs() < 1e-6).float().mean().item() |
|
|
layer_sparsity.append(sparsity) |
|
|
|
|
|
|
|
|
neuron_activity = (acts.abs() > 1e-6).float().sum(dim=0) |
|
|
dead_neurons = (neuron_activity == 0).float().mean().item() |
|
|
layer_dead_neurons.append(dead_neurons) |
|
|
|
|
|
all_activations.extend(acts.flatten().numpy()) |
|
|
|
|
|
results[name] = { |
|
|
'avg_sparsity': np.mean(layer_sparsity), |
|
|
'layer_sparsity': layer_sparsity, |
|
|
'avg_dead_neurons': np.mean(layer_dead_neurons), |
|
|
'layer_dead_neurons': layer_dead_neurons, |
|
|
} |
|
|
|
|
|
activation_distributions[name] = np.array(all_activations) |
|
|
|
|
|
print(f" Avg Sparsity: {results[name]['avg_sparsity']*100:.1f}%") |
|
|
print(f" Avg Dead Neurons: {results[name]['avg_dead_neurons']*100:.1f}%") |
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(1, 2, figsize=(14, 5)) |
|
|
|
|
|
names = list(results.keys()) |
|
|
sparsities = [results[n]['avg_sparsity'] * 100 for n in names] |
|
|
dead_rates = [results[n]['avg_dead_neurons'] * 100 for n in names] |
|
|
|
|
|
colors = plt.cm.Set2(np.linspace(0, 1, len(names))) |
|
|
|
|
|
ax1 = axes[0] |
|
|
bars1 = ax1.bar(names, sparsities, color=colors) |
|
|
ax1.set_ylabel('Sparsity (%)') |
|
|
ax1.set_title('Activation Sparsity (% of near-zero activations)') |
|
|
ax1.set_xticklabels(names, rotation=45, ha='right') |
|
|
for bar, val in zip(bars1, sparsities): |
|
|
ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%', |
|
|
ha='center', va='bottom', fontsize=9) |
|
|
|
|
|
ax2 = axes[1] |
|
|
bars2 = ax2.bar(names, dead_rates, color=colors) |
|
|
ax2.set_ylabel('Dead Neuron Rate (%)') |
|
|
ax2.set_title('Dead Neurons (% never activating)') |
|
|
ax2.set_xticklabels(names, rotation=45, ha='right') |
|
|
for bar, val in zip(bars2, dead_rates): |
|
|
ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{val:.1f}%', |
|
|
ha='center', va='bottom', fontsize=9) |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp2_sparsity_dead_neurons.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(2, 4, figsize=(16, 8)) |
|
|
axes = axes.flatten() |
|
|
|
|
|
for idx, (name, acts) in enumerate(activation_distributions.items()): |
|
|
ax = axes[idx] |
|
|
|
|
|
acts_clean = acts[np.isfinite(acts)] |
|
|
if len(acts_clean) == 0: |
|
|
acts_clean = np.array([0.0]) |
|
|
acts_clipped = np.clip(acts_clean, -5, 5) |
|
|
ax.hist(acts_clipped, bins=100, density=True, alpha=0.7, color=colors[idx]) |
|
|
ax.set_title(f'{name}') |
|
|
ax.set_xlabel('Activation Value') |
|
|
ax.set_ylabel('Density') |
|
|
ax.axvline(x=0, color='red', linestyle='--', alpha=0.5) |
|
|
|
|
|
|
|
|
ax.text(0.95, 0.95, f'mean={np.nanmean(acts_clean):.2f}\nstd={np.nanstd(acts_clean):.2f}', |
|
|
transform=ax.transAxes, ha='right', va='top', fontsize=8, |
|
|
bbox=dict(boxstyle='round', facecolor='white', alpha=0.8)) |
|
|
|
|
|
plt.suptitle('Activation Value Distributions (after training)', fontsize=14) |
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp2_activation_distributions.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
print("\nβ Saved: exp2_sparsity_dead_neurons.png") |
|
|
print("β Saved: exp2_activation_distributions.png") |
|
|
|
|
|
return results |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def experiment_3_stability(): |
|
|
""" |
|
|
EXPERIMENT 3: How stable is training under stress conditions? |
|
|
|
|
|
Theory: |
|
|
- Large learning rates can cause gradient explosion |
|
|
- Deep networks amplify instability |
|
|
- Bounded activations (Sigmoid, Tanh) are more stable but learn slower |
|
|
- Unbounded activations (ReLU, GELU) can diverge but learn faster |
|
|
|
|
|
We test: |
|
|
- Training with increasingly large learning rates |
|
|
- Training with increasing depth |
|
|
- Measuring loss divergence and gradient explosion |
|
|
""" |
|
|
print("\n" + "="*80) |
|
|
print("EXPERIMENT 3: STABILITY UNDER STRESS") |
|
|
print("="*80) |
|
|
|
|
|
activations = ActivationFunctions.get_all() |
|
|
|
|
|
|
|
|
print("\n--- Test 3a: Learning Rate Stress ---") |
|
|
learning_rates = [0.001, 0.01, 0.1, 0.5, 1.0] |
|
|
depth = 10 |
|
|
width = 64 |
|
|
|
|
|
|
|
|
x_data = torch.linspace(-2, 2, 200).unsqueeze(1) |
|
|
y_data = torch.sin(x_data * np.pi) |
|
|
|
|
|
lr_results = {name: {} for name in activations} |
|
|
|
|
|
for name, (func, deriv, module) in activations.items(): |
|
|
print(f"\n {name}:") |
|
|
|
|
|
for lr in learning_rates: |
|
|
|
|
|
layers = [] |
|
|
for i in range(depth): |
|
|
layers.append(nn.Linear(width if i > 0 else 1, width)) |
|
|
layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity()) |
|
|
layers.append(nn.Linear(width, 1)) |
|
|
model = nn.Sequential(*layers) |
|
|
|
|
|
|
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear): |
|
|
nn.init.xavier_uniform_(m.weight) |
|
|
nn.init.zeros_(m.bias) |
|
|
|
|
|
optimizer = torch.optim.SGD(model.parameters(), lr=lr) |
|
|
|
|
|
|
|
|
losses = [] |
|
|
diverged = False |
|
|
|
|
|
for epoch in range(100): |
|
|
optimizer.zero_grad() |
|
|
pred = model(x_data) |
|
|
loss = F.mse_loss(pred, y_data) |
|
|
|
|
|
if torch.isnan(loss) or torch.isinf(loss) or loss.item() > 1e6: |
|
|
diverged = True |
|
|
break |
|
|
|
|
|
losses.append(loss.item()) |
|
|
loss.backward() |
|
|
|
|
|
|
|
|
max_grad = max(p.grad.abs().max().item() for p in model.parameters() if p.grad is not None) |
|
|
if max_grad > 1e6: |
|
|
diverged = True |
|
|
break |
|
|
|
|
|
optimizer.step() |
|
|
|
|
|
lr_results[name][lr] = { |
|
|
'diverged': diverged, |
|
|
'final_loss': losses[-1] if losses else float('inf'), |
|
|
'epochs_completed': len(losses), |
|
|
} |
|
|
|
|
|
status = "DIVERGED" if diverged else f"loss={losses[-1]:.4f}" |
|
|
print(f" lr={lr}: {status}") |
|
|
|
|
|
|
|
|
print("\n--- Test 3b: Depth Stress ---") |
|
|
depths = [5, 10, 20, 50, 100] |
|
|
lr = 0.01 |
|
|
|
|
|
depth_results = {name: {} for name in activations} |
|
|
|
|
|
for name, (func, deriv, module) in activations.items(): |
|
|
print(f"\n {name}:") |
|
|
|
|
|
for depth in depths: |
|
|
|
|
|
layers = [] |
|
|
for i in range(depth): |
|
|
layers.append(nn.Linear(width if i > 0 else 1, width)) |
|
|
layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity()) |
|
|
layers.append(nn.Linear(width, 1)) |
|
|
model = nn.Sequential(*layers) |
|
|
|
|
|
|
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear): |
|
|
nn.init.xavier_uniform_(m.weight) |
|
|
nn.init.zeros_(m.bias) |
|
|
|
|
|
optimizer = torch.optim.Adam(model.parameters(), lr=lr) |
|
|
|
|
|
|
|
|
losses = [] |
|
|
diverged = False |
|
|
|
|
|
for epoch in range(200): |
|
|
optimizer.zero_grad() |
|
|
pred = model(x_data) |
|
|
loss = F.mse_loss(pred, y_data) |
|
|
|
|
|
if torch.isnan(loss) or torch.isinf(loss) or loss.item() > 1e6: |
|
|
diverged = True |
|
|
break |
|
|
|
|
|
losses.append(loss.item()) |
|
|
loss.backward() |
|
|
optimizer.step() |
|
|
|
|
|
depth_results[name][depth] = { |
|
|
'diverged': diverged, |
|
|
'final_loss': losses[-1] if losses else float('inf'), |
|
|
'loss_history': losses, |
|
|
} |
|
|
|
|
|
status = "DIVERGED" if diverged else f"loss={losses[-1]:.4f}" |
|
|
print(f" depth={depth}: {status}") |
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(1, 2, figsize=(14, 5)) |
|
|
|
|
|
|
|
|
ax1 = axes[0] |
|
|
names = list(lr_results.keys()) |
|
|
x_pos = np.arange(len(learning_rates)) |
|
|
width_bar = 0.1 |
|
|
|
|
|
for idx, name in enumerate(names): |
|
|
final_losses = [] |
|
|
for lr in learning_rates: |
|
|
data = lr_results[name][lr] |
|
|
if data['diverged']: |
|
|
final_losses.append(10) |
|
|
else: |
|
|
final_losses.append(min(data['final_loss'], 10)) |
|
|
|
|
|
ax1.bar(x_pos + idx * width_bar, final_losses, width_bar, label=name) |
|
|
|
|
|
ax1.set_xlabel('Learning Rate') |
|
|
ax1.set_ylabel('Final Loss (capped at 10)') |
|
|
ax1.set_title('Stability vs Learning Rate (depth=10)') |
|
|
ax1.set_xticks(x_pos + width_bar * len(names) / 2) |
|
|
ax1.set_xticklabels([str(lr) for lr in learning_rates]) |
|
|
ax1.legend(loc='upper left', fontsize=7) |
|
|
ax1.set_yscale('log') |
|
|
ax1.axhline(y=10, color='red', linestyle='--', label='Diverged') |
|
|
|
|
|
|
|
|
ax2 = axes[1] |
|
|
colors = plt.cm.tab10(np.linspace(0, 1, len(names))) |
|
|
|
|
|
for idx, name in enumerate(names): |
|
|
final_losses = [] |
|
|
for depth in depths: |
|
|
data = depth_results[name][depth] |
|
|
if data['diverged']: |
|
|
final_losses.append(10) |
|
|
else: |
|
|
final_losses.append(min(data['final_loss'], 10)) |
|
|
|
|
|
ax2.semilogy(depths, final_losses, 'o-', label=name, color=colors[idx]) |
|
|
|
|
|
ax2.set_xlabel('Network Depth') |
|
|
ax2.set_ylabel('Final Loss (log scale)') |
|
|
ax2.set_title('Stability vs Network Depth (lr=0.01)') |
|
|
ax2.legend(loc='upper left', fontsize=7) |
|
|
ax2.grid(True, alpha=0.3) |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp3_stability.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
print("\nβ Saved: exp3_stability.png") |
|
|
|
|
|
return {'lr_results': lr_results, 'depth_results': depth_results} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def experiment_4_representational_capacity(): |
|
|
""" |
|
|
EXPERIMENT 4: How well can networks represent different functions? |
|
|
|
|
|
Theory: |
|
|
- Universal Approximation: Any continuous function can be approximated |
|
|
with enough neurons, but activation choice affects efficiency |
|
|
- Smooth activations β smoother approximations |
|
|
- Piecewise linear (ReLU) β piecewise linear approximations |
|
|
- Some functions are easier/harder for certain activations |
|
|
|
|
|
We test approximation of: |
|
|
- Smooth function: sin(x) |
|
|
- Sharp function: |x| |
|
|
- Discontinuous-like: step function (smoothed) |
|
|
- High-frequency: sin(10x) |
|
|
- Polynomial: x^3 |
|
|
""" |
|
|
print("\n" + "="*80) |
|
|
print("EXPERIMENT 4: REPRESENTATIONAL CAPACITY") |
|
|
print("="*80) |
|
|
|
|
|
activations = ActivationFunctions.get_all() |
|
|
|
|
|
|
|
|
target_functions = { |
|
|
'sin(x)': lambda x: torch.sin(x), |
|
|
'|x|': lambda x: torch.abs(x), |
|
|
'step': lambda x: torch.sigmoid(10 * x), |
|
|
'sin(10x)': lambda x: torch.sin(10 * x), |
|
|
'xΒ³': lambda x: x ** 3, |
|
|
} |
|
|
|
|
|
depth = 5 |
|
|
width = 64 |
|
|
epochs = 500 |
|
|
|
|
|
results = {name: {} for name in activations} |
|
|
predictions = {name: {} for name in activations} |
|
|
|
|
|
x_train = torch.linspace(-2, 2, 200).unsqueeze(1) |
|
|
x_test = torch.linspace(-2, 2, 500).unsqueeze(1) |
|
|
|
|
|
for func_name, func in target_functions.items(): |
|
|
print(f"\n--- Target: {func_name} ---") |
|
|
|
|
|
y_train = func(x_train) |
|
|
y_test = func(x_test) |
|
|
|
|
|
for name, (_, _, module) in activations.items(): |
|
|
|
|
|
layers = [] |
|
|
for i in range(depth): |
|
|
layers.append(nn.Linear(width if i > 0 else 1, width)) |
|
|
layers.append(type(module)() if not isinstance(module, nn.Identity) else nn.Identity()) |
|
|
layers.append(nn.Linear(width, 1)) |
|
|
model = nn.Sequential(*layers) |
|
|
|
|
|
|
|
|
for m in model.modules(): |
|
|
if isinstance(m, nn.Linear): |
|
|
nn.init.xavier_uniform_(m.weight) |
|
|
nn.init.zeros_(m.bias) |
|
|
|
|
|
optimizer = torch.optim.Adam(model.parameters(), lr=0.001) |
|
|
|
|
|
|
|
|
for epoch in range(epochs): |
|
|
optimizer.zero_grad() |
|
|
pred = model(x_train) |
|
|
loss = F.mse_loss(pred, y_train) |
|
|
loss.backward() |
|
|
optimizer.step() |
|
|
|
|
|
|
|
|
model.eval() |
|
|
with torch.no_grad(): |
|
|
pred_test = model(x_test) |
|
|
test_loss = F.mse_loss(pred_test, y_test).item() |
|
|
|
|
|
results[name][func_name] = test_loss |
|
|
predictions[name][func_name] = pred_test.numpy() |
|
|
|
|
|
print(f" {name:12s}: MSE = {test_loss:.6f}") |
|
|
|
|
|
|
|
|
fig, ax = plt.subplots(figsize=(10, 8)) |
|
|
|
|
|
act_names = list(results.keys()) |
|
|
func_names = list(target_functions.keys()) |
|
|
|
|
|
data = np.array([[results[act][func] for func in func_names] for act in act_names]) |
|
|
|
|
|
|
|
|
data_log = np.log10(data + 1e-10) |
|
|
|
|
|
im = ax.imshow(data_log, cmap='RdYlGn_r', aspect='auto') |
|
|
|
|
|
ax.set_xticks(range(len(func_names))) |
|
|
ax.set_xticklabels(func_names, rotation=45, ha='right') |
|
|
ax.set_yticks(range(len(act_names))) |
|
|
ax.set_yticklabels(act_names) |
|
|
|
|
|
|
|
|
for i in range(len(act_names)): |
|
|
for j in range(len(func_names)): |
|
|
text = f'{data[i, j]:.4f}' |
|
|
ax.text(j, i, text, ha='center', va='center', fontsize=8, |
|
|
color='white' if data_log[i, j] > -2 else 'black') |
|
|
|
|
|
ax.set_title('Representational Capacity: MSE by Activation Γ Target Function\n(lower is better)') |
|
|
plt.colorbar(im, label='log10(MSE)') |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp4_representational_heatmap.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
|
|
|
fig, axes = plt.subplots(len(target_functions), 1, figsize=(12, 3*len(target_functions))) |
|
|
|
|
|
colors = plt.cm.tab10(np.linspace(0, 1, len(activations))) |
|
|
x_np = x_test.numpy().flatten() |
|
|
|
|
|
for idx, (func_name, func) in enumerate(target_functions.items()): |
|
|
ax = axes[idx] |
|
|
y_true = func(x_test).numpy().flatten() |
|
|
|
|
|
ax.plot(x_np, y_true, 'k-', linewidth=3, label='Ground Truth', alpha=0.7) |
|
|
|
|
|
for act_idx, name in enumerate(activations.keys()): |
|
|
pred = predictions[name][func_name].flatten() |
|
|
ax.plot(x_np, pred, '--', color=colors[act_idx], label=name, alpha=0.7, linewidth=1.5) |
|
|
|
|
|
ax.set_title(f'Target: {func_name}') |
|
|
ax.set_xlabel('x') |
|
|
ax.set_ylabel('y') |
|
|
ax.legend(loc='best', fontsize=7, ncol=3) |
|
|
ax.grid(True, alpha=0.3) |
|
|
|
|
|
plt.tight_layout() |
|
|
plt.savefig('activation_functions/exp4_predictions.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
print("\nβ Saved: exp4_representational_heatmap.png") |
|
|
print("β Saved: exp4_predictions.png") |
|
|
|
|
|
return results |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(): |
|
|
"""Run all experiments and generate comprehensive report.""" |
|
|
|
|
|
print("\n" + "="*80) |
|
|
print("ACTIVATION FUNCTION COMPREHENSIVE TUTORIAL") |
|
|
print("="*80) |
|
|
|
|
|
|
|
|
exp1_results = experiment_1_gradient_flow() |
|
|
exp2_results = experiment_2_sparsity_dead_neurons() |
|
|
exp3_results = experiment_3_stability() |
|
|
exp4_results = experiment_4_representational_capacity() |
|
|
|
|
|
|
|
|
generate_summary_figure(exp1_results, exp2_results, exp3_results, exp4_results) |
|
|
|
|
|
|
|
|
generate_tutorial_report(exp1_results, exp2_results, exp3_results, exp4_results) |
|
|
|
|
|
print("\n" + "="*80) |
|
|
print("ALL EXPERIMENTS COMPLETE!") |
|
|
print("="*80) |
|
|
print("\nGenerated files:") |
|
|
print(" - exp1_gradient_flow.png") |
|
|
print(" - exp2_sparsity_dead_neurons.png") |
|
|
print(" - exp2_activation_distributions.png") |
|
|
print(" - exp3_stability.png") |
|
|
print(" - exp4_representational_heatmap.png") |
|
|
print(" - exp4_predictions.png") |
|
|
print(" - summary_figure.png") |
|
|
print(" - activation_tutorial.md") |
|
|
|
|
|
|
|
|
def generate_summary_figure(exp1, exp2, exp3, exp4): |
|
|
"""Generate a comprehensive summary figure.""" |
|
|
|
|
|
fig = plt.figure(figsize=(20, 16)) |
|
|
gs = gridspec.GridSpec(3, 3, figure=fig, hspace=0.3, wspace=0.3) |
|
|
|
|
|
activations = list(exp1.keys()) |
|
|
colors = plt.cm.tab10(np.linspace(0, 1, len(activations))) |
|
|
|
|
|
|
|
|
ax1 = fig.add_subplot(gs[0, 0]) |
|
|
for (name, data), color in zip(exp1.items(), colors): |
|
|
if 20 in data: |
|
|
grads = data[20]['grad_magnitudes'] |
|
|
ax1.semilogy(range(1, len(grads)+1), grads, 'o-', label=name, color=color, markersize=3) |
|
|
ax1.set_xlabel('Layer') |
|
|
ax1.set_ylabel('Gradient Magnitude') |
|
|
ax1.set_title('1. Gradient Flow (depth=20)') |
|
|
ax1.legend(fontsize=7) |
|
|
ax1.grid(True, alpha=0.3) |
|
|
|
|
|
|
|
|
ax2 = fig.add_subplot(gs[0, 1]) |
|
|
sparsities = [exp2[n]['avg_sparsity'] * 100 for n in activations] |
|
|
bars = ax2.bar(range(len(activations)), sparsities, color=colors) |
|
|
ax2.set_xticks(range(len(activations))) |
|
|
ax2.set_xticklabels(activations, rotation=45, ha='right', fontsize=8) |
|
|
ax2.set_ylabel('Sparsity (%)') |
|
|
ax2.set_title('2. Activation Sparsity') |
|
|
|
|
|
|
|
|
ax3 = fig.add_subplot(gs[0, 2]) |
|
|
dead_rates = [exp2[n]['avg_dead_neurons'] * 100 for n in activations] |
|
|
bars = ax3.bar(range(len(activations)), dead_rates, color=colors) |
|
|
ax3.set_xticks(range(len(activations))) |
|
|
ax3.set_xticklabels(activations, rotation=45, ha='right', fontsize=8) |
|
|
ax3.set_ylabel('Dead Neuron Rate (%)') |
|
|
ax3.set_title('3. Dead Neurons') |
|
|
|
|
|
|
|
|
ax4 = fig.add_subplot(gs[1, 0]) |
|
|
learning_rates = [0.001, 0.01, 0.1, 0.5, 1.0] |
|
|
for idx, name in enumerate(activations): |
|
|
final_losses = [] |
|
|
for lr in learning_rates: |
|
|
data = exp3['lr_results'][name][lr] |
|
|
if data['diverged']: |
|
|
final_losses.append(10) |
|
|
else: |
|
|
final_losses.append(min(data['final_loss'], 10)) |
|
|
ax4.semilogy(learning_rates, final_losses, 'o-', label=name, color=colors[idx], markersize=4) |
|
|
ax4.set_xlabel('Learning Rate') |
|
|
ax4.set_ylabel('Final Loss') |
|
|
ax4.set_title('4. Stability vs Learning Rate') |
|
|
ax4.legend(fontsize=6) |
|
|
ax4.grid(True, alpha=0.3) |
|
|
|
|
|
|
|
|
ax5 = fig.add_subplot(gs[1, 1]) |
|
|
depths = [5, 10, 20, 50, 100] |
|
|
for idx, name in enumerate(activations): |
|
|
final_losses = [] |
|
|
for depth in depths: |
|
|
data = exp3['depth_results'][name][depth] |
|
|
if data['diverged']: |
|
|
final_losses.append(10) |
|
|
else: |
|
|
final_losses.append(min(data['final_loss'], 10)) |
|
|
ax5.semilogy(depths, final_losses, 'o-', label=name, color=colors[idx], markersize=4) |
|
|
ax5.set_xlabel('Network Depth') |
|
|
ax5.set_ylabel('Final Loss') |
|
|
ax5.set_title('5. Stability vs Depth') |
|
|
ax5.legend(fontsize=6) |
|
|
ax5.grid(True, alpha=0.3) |
|
|
|
|
|
|
|
|
ax6 = fig.add_subplot(gs[1, 2]) |
|
|
func_names = list(exp4[activations[0]].keys()) |
|
|
data = np.array([[exp4[act][func] for func in func_names] for act in activations]) |
|
|
data_log = np.log10(data + 1e-10) |
|
|
im = ax6.imshow(data_log, cmap='RdYlGn_r', aspect='auto') |
|
|
ax6.set_xticks(range(len(func_names))) |
|
|
ax6.set_xticklabels(func_names, rotation=45, ha='right', fontsize=8) |
|
|
ax6.set_yticks(range(len(activations))) |
|
|
ax6.set_yticklabels(activations, fontsize=8) |
|
|
ax6.set_title('6. Representational Capacity (log MSE)') |
|
|
plt.colorbar(im, ax=ax6, shrink=0.8) |
|
|
|
|
|
|
|
|
ax7 = fig.add_subplot(gs[2, :]) |
|
|
ax7.axis('off') |
|
|
|
|
|
insights_text = """ |
|
|
KEY INSIGHTS FROM EXPERIMENTS |
|
|
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
|
|
|
|
|
1. GRADIENT FLOW: |
|
|
β’ Sigmoid/Tanh suffer severe vanishing gradients in deep networks (gradients shrink exponentially) |
|
|
β’ ReLU maintains gradient magnitude but can have zero gradients (dead neurons) |
|
|
β’ GELU/Swish provide smooth, well-behaved gradient flow |
|
|
|
|
|
2. SPARSITY & DEAD NEURONS: |
|
|
β’ ReLU creates highly sparse activations (~50% zeros) - good for efficiency, bad if neurons die |
|
|
β’ Leaky ReLU/ELU prevent dead neurons while maintaining some sparsity |
|
|
β’ Sigmoid/Tanh rarely have exact zeros but can saturate |
|
|
|
|
|
3. STABILITY: |
|
|
β’ Bounded activations (Sigmoid, Tanh) are more stable but learn slower |
|
|
β’ ReLU can diverge with large learning rates or deep networks |
|
|
β’ Modern activations (GELU, Swish) offer good stability-performance tradeoff |
|
|
|
|
|
4. REPRESENTATIONAL CAPACITY: |
|
|
β’ All activations can approximate smooth functions well (Universal Approximation) |
|
|
β’ ReLU excels at sharp/piecewise functions (|x|) |
|
|
β’ Smooth activations (GELU, Swish) better for smooth targets |
|
|
β’ High-frequency functions are challenging for all activations |
|
|
|
|
|
RECOMMENDATIONS: |
|
|
β’ Default choice: ReLU or LeakyReLU (simple, fast, effective) |
|
|
β’ For transformers/attention: GELU (standard in BERT, GPT) |
|
|
β’ For very deep networks: LeakyReLU, ELU, or use residual connections |
|
|
β’ Avoid: Sigmoid/Tanh in hidden layers of deep networks |
|
|
""" |
|
|
|
|
|
ax7.text(0.5, 0.5, insights_text, transform=ax7.transAxes, fontsize=10, |
|
|
verticalalignment='center', horizontalalignment='center', |
|
|
fontfamily='monospace', |
|
|
bbox=dict(boxstyle='round', facecolor='lightgray', alpha=0.8)) |
|
|
|
|
|
plt.suptitle('Comprehensive Activation Function Analysis', fontsize=16, fontweight='bold') |
|
|
plt.savefig('activation_functions/summary_figure.png', dpi=150, bbox_inches='tight') |
|
|
plt.close() |
|
|
|
|
|
print("\nβ Saved: summary_figure.png") |
|
|
|
|
|
|
|
|
def generate_tutorial_report(exp1, exp2, exp3, exp4): |
|
|
"""Generate comprehensive markdown tutorial.""" |
|
|
|
|
|
activations = list(exp1.keys()) |
|
|
|
|
|
report = """# Comprehensive Tutorial: Activation Functions in Deep Learning |
|
|
|
|
|
## Table of Contents |
|
|
1. [Introduction](#introduction) |
|
|
2. [Theoretical Background](#theoretical-background) |
|
|
3. [Experiment 1: Gradient Flow](#experiment-1-gradient-flow) |
|
|
4. [Experiment 2: Sparsity and Dead Neurons](#experiment-2-sparsity-and-dead-neurons) |
|
|
5. [Experiment 3: Training Stability](#experiment-3-training-stability) |
|
|
6. [Experiment 4: Representational Capacity](#experiment-4-representational-capacity) |
|
|
7. [Summary and Recommendations](#summary-and-recommendations) |
|
|
|
|
|
--- |
|
|
|
|
|
## Introduction |
|
|
|
|
|
Activation functions are a critical component of neural networks that introduce non-linearity, enabling networks to learn complex patterns. This tutorial provides both **theoretical explanations** and **empirical experiments** to understand how different activation functions affect: |
|
|
|
|
|
1. **Gradient Flow**: Do gradients vanish or explode during backpropagation? |
|
|
2. **Sparsity & Dead Neurons**: How easily do units turn on/off? |
|
|
3. **Stability**: How robust is training under stress (large learning rates, deep networks)? |
|
|
4. **Representational Capacity**: How well can the network approximate different functions? |
|
|
|
|
|
### Activation Functions Studied |
|
|
|
|
|
| Function | Formula | Range | Key Property | |
|
|
|----------|---------|-------|--------------| |
|
|
| Linear | f(x) = x | (-β, β) | No non-linearity | |
|
|
| Sigmoid | f(x) = 1/(1+eβ»Λ£) | (0, 1) | Bounded, saturates | |
|
|
| Tanh | f(x) = (eΛ£-eβ»Λ£)/(eΛ£+eβ»Λ£) | (-1, 1) | Zero-centered, saturates | |
|
|
| ReLU | f(x) = max(0, x) | [0, β) | Sparse, can die | |
|
|
| Leaky ReLU | f(x) = max(Ξ±x, x) | (-β, β) | Prevents dead neurons | |
|
|
| ELU | f(x) = x if x>0, Ξ±(eΛ£-1) otherwise | (-Ξ±, β) | Smooth negative region | |
|
|
| GELU | f(x) = xΒ·Ξ¦(x) | β(-0.17, β) | Smooth, probabilistic | |
|
|
| Swish | f(x) = xΒ·Ο(x) | β(-0.28, β) | Self-gated | |
|
|
|
|
|
--- |
|
|
|
|
|
## Theoretical Background |
|
|
|
|
|
### Why Non-linearity Matters |
|
|
|
|
|
Without activation functions, a neural network of any depth is equivalent to a single linear transformation: |
|
|
|
|
|
``` |
|
|
f(x) = Wβ Γ Wβββ Γ ... Γ Wβ Γ x = W_combined Γ x |
|
|
``` |
|
|
|
|
|
Non-linear activations allow networks to approximate **any continuous function** (Universal Approximation Theorem). |
|
|
|
|
|
### The Gradient Flow Problem |
|
|
|
|
|
During backpropagation, gradients flow through the chain rule: |
|
|
|
|
|
``` |
|
|
βL/βWα΅’ = βL/βaβ Γ βaβ/βaβββ Γ ... Γ βaα΅’ββ/βaα΅’ Γ βaα΅’/βWα΅’ |
|
|
``` |
|
|
|
|
|
Each layer contributes a factor of **Ο'(z) Γ W**, where Ο' is the activation derivative. |
|
|
|
|
|
**Vanishing Gradients**: When |Ο'(z)| < 1 repeatedly |
|
|
- Sigmoid: Ο'(z) β (0, 0.25], maximum at z=0 |
|
|
- For n layers: gradient β (0.25)βΏ β 0 as n β β |
|
|
|
|
|
**Exploding Gradients**: When |Ο'(z) Γ W| > 1 repeatedly |
|
|
- More common with unbounded activations |
|
|
- Mitigated by gradient clipping, proper initialization |
|
|
|
|
|
--- |
|
|
|
|
|
## Experiment 1: Gradient Flow |
|
|
|
|
|
### Question |
|
|
How do gradients propagate through deep networks with different activations? |
|
|
|
|
|
### Method |
|
|
- Built networks with depths [5, 10, 20, 50] |
|
|
- Measured gradient magnitude at each layer during backpropagation |
|
|
- Used Xavier initialization for fair comparison |
|
|
|
|
|
### Results |
|
|
|
|
|
 |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
report += "#### Gradient Ratio (Layer 10 / Layer 1) at Depth=20\n\n" |
|
|
report += "| Activation | Gradient Ratio | Interpretation |\n" |
|
|
report += "|------------|----------------|----------------|\n" |
|
|
|
|
|
for name in activations: |
|
|
if 20 in exp1[name]: |
|
|
ratio = exp1[name][20]['grad_ratio'] |
|
|
if ratio > 1e6: |
|
|
interp = "Severe vanishing gradients" |
|
|
elif ratio > 100: |
|
|
interp = "Significant gradient decay" |
|
|
elif ratio > 10: |
|
|
interp = "Moderate gradient decay" |
|
|
elif ratio > 0.1: |
|
|
interp = "Stable gradient flow" |
|
|
else: |
|
|
interp = "Gradient amplification" |
|
|
report += f"| {name} | {ratio:.2e} | {interp} |\n" |
|
|
|
|
|
report += """ |
|
|
### Theoretical Explanation |
|
|
|
|
|
**Sigmoid** shows the most severe gradient decay because: |
|
|
- Maximum derivative is only 0.25 (at z=0) |
|
|
- In deep networks: 0.25Β²β° β 10β»ΒΉΒ² (effectively zero!) |
|
|
|
|
|
**ReLU** maintains gradients better because: |
|
|
- Derivative is exactly 1 for positive inputs |
|
|
- But can be exactly 0 for negative inputs (dead neurons) |
|
|
|
|
|
**GELU/Swish** provide smooth gradient flow: |
|
|
- Derivatives are bounded but not as severely as Sigmoid |
|
|
- Smooth transitions prevent sudden gradient changes |
|
|
|
|
|
--- |
|
|
|
|
|
## Experiment 2: Sparsity and Dead Neurons |
|
|
|
|
|
### Question |
|
|
How do activations affect the sparsity of representations and the "death" of neurons? |
|
|
|
|
|
### Method |
|
|
- Trained 10-layer networks with high learning rate (0.1) to stress-test |
|
|
- Measured activation sparsity (% of near-zero activations) |
|
|
- Measured dead neuron rate (neurons that never activate) |
|
|
|
|
|
### Results |
|
|
|
|
|
 |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
report += "| Activation | Sparsity (%) | Dead Neurons (%) |\n" |
|
|
report += "|------------|--------------|------------------|\n" |
|
|
|
|
|
for name in activations: |
|
|
sparsity = exp2[name]['avg_sparsity'] * 100 |
|
|
dead = exp2[name]['avg_dead_neurons'] * 100 |
|
|
report += f"| {name} | {sparsity:.1f}% | {dead:.1f}% |\n" |
|
|
|
|
|
report += """ |
|
|
### Theoretical Explanation |
|
|
|
|
|
**ReLU creates sparse representations**: |
|
|
- Any negative input β output is exactly 0 |
|
|
- ~50% sparsity is typical with zero-mean inputs |
|
|
- Sparsity can be beneficial (efficiency, regularization) |
|
|
|
|
|
**Dead Neuron Problem**: |
|
|
- If a ReLU neuron's input is always negative, it outputs 0 forever |
|
|
- Gradient is 0, so weights never update |
|
|
- Caused by: bad initialization, large learning rates, unlucky gradients |
|
|
|
|
|
**Solutions**: |
|
|
- **Leaky ReLU**: Small gradient (0.01) for negative inputs |
|
|
- **ELU**: Smooth negative region with non-zero gradient |
|
|
- **Proper initialization**: Keep activations in a good range |
|
|
|
|
|
--- |
|
|
|
|
|
## Experiment 3: Training Stability |
|
|
|
|
|
### Question |
|
|
How stable is training under stress conditions (large learning rates, deep networks)? |
|
|
|
|
|
### Method |
|
|
- Tested learning rates: [0.001, 0.01, 0.1, 0.5, 1.0] |
|
|
- Tested depths: [5, 10, 20, 50, 100] |
|
|
- Measured whether training diverged (loss β β) |
|
|
|
|
|
### Results |
|
|
|
|
|
 |
|
|
|
|
|
### Key Observations |
|
|
|
|
|
**Learning Rate Stability**: |
|
|
- Sigmoid/Tanh: Most stable (bounded outputs prevent explosion) |
|
|
- ReLU: Can diverge at high learning rates |
|
|
- GELU/Swish: Good balance of stability and performance |
|
|
|
|
|
**Depth Stability**: |
|
|
- All activations struggle with depth > 50 without special techniques |
|
|
- Sigmoid fails earliest due to vanishing gradients |
|
|
- ReLU/LeakyReLU maintain trainability longer |
|
|
|
|
|
### Theoretical Explanation |
|
|
|
|
|
**Why bounded activations are more stable**: |
|
|
- Sigmoid outputs β (0, 1), so activations can't explode |
|
|
- But gradients can vanish, making learning very slow |
|
|
|
|
|
**Why ReLU can be unstable**: |
|
|
- Unbounded outputs: large inputs β large outputs β larger gradients |
|
|
- Positive feedback loop can cause explosion |
|
|
|
|
|
**Modern solutions**: |
|
|
- Batch Normalization: Keeps activations in good range |
|
|
- Residual Connections: Allow gradients to bypass layers |
|
|
- Gradient Clipping: Prevents explosion |
|
|
|
|
|
--- |
|
|
|
|
|
## Experiment 4: Representational Capacity |
|
|
|
|
|
### Question |
|
|
How well can networks with different activations approximate various functions? |
|
|
|
|
|
### Method |
|
|
- Target functions: sin(x), |x|, step, sin(10x), xΒ³ |
|
|
- 5-layer networks, 500 epochs training |
|
|
- Measured test MSE |
|
|
|
|
|
### Results |
|
|
|
|
|
 |
|
|
|
|
|
 |
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
report += "#### Test MSE by Activation Γ Target Function\n\n" |
|
|
func_names = list(exp4[activations[0]].keys()) |
|
|
|
|
|
report += "| Activation | " + " | ".join(func_names) + " |\n" |
|
|
report += "|------------|" + "|".join(["------" for _ in func_names]) + "|\n" |
|
|
|
|
|
for name in activations: |
|
|
values = [f"{exp4[name][f]:.4f}" for f in func_names] |
|
|
report += f"| {name} | " + " | ".join(values) + " |\n" |
|
|
|
|
|
report += """ |
|
|
### Theoretical Explanation |
|
|
|
|
|
**Universal Approximation Theorem**: |
|
|
- Any continuous function can be approximated with enough neurons |
|
|
- But different activations have different "inductive biases" |
|
|
|
|
|
**ReLU excels at piecewise functions** (like |x|): |
|
|
- ReLU networks compute piecewise linear functions |
|
|
- Perfect match for |x| which is piecewise linear |
|
|
|
|
|
**Smooth activations for smooth functions**: |
|
|
- GELU, Swish produce smoother decision boundaries |
|
|
- Better for smooth targets like sin(x) |
|
|
|
|
|
**High-frequency functions are hard**: |
|
|
- sin(10x) has 10 oscillations in [-2, 2] |
|
|
- Requires many neurons to capture all oscillations |
|
|
- All activations struggle without sufficient width |
|
|
|
|
|
--- |
|
|
|
|
|
## Summary and Recommendations |
|
|
|
|
|
### Comparison Table |
|
|
|
|
|
| Property | Best Activations | Worst Activations | |
|
|
|----------|------------------|-------------------| |
|
|
| Gradient Flow | LeakyReLU, GELU | Sigmoid, Tanh | |
|
|
| Avoids Dead Neurons | LeakyReLU, ELU, GELU | ReLU | |
|
|
| Training Stability | Sigmoid, Tanh, GELU | ReLU (high lr) | |
|
|
| Smooth Functions | GELU, Swish, Tanh | ReLU | |
|
|
| Sharp Functions | ReLU, LeakyReLU | Sigmoid | |
|
|
| Computational Speed | ReLU, LeakyReLU | GELU, Swish | |
|
|
|
|
|
### Practical Recommendations |
|
|
|
|
|
1. **Default Choice**: **ReLU** or **LeakyReLU** |
|
|
- Simple, fast, effective for most tasks |
|
|
- Use LeakyReLU if dead neurons are a concern |
|
|
|
|
|
2. **For Transformers/Attention**: **GELU** |
|
|
- Standard in BERT, GPT, modern transformers |
|
|
- Smooth gradients help with optimization |
|
|
|
|
|
3. **For Very Deep Networks**: **LeakyReLU** or **ELU** |
|
|
- Or use residual connections + batch normalization |
|
|
- Avoid Sigmoid/Tanh in hidden layers |
|
|
|
|
|
4. **For Regression with Bounded Outputs**: **Sigmoid** (output layer only) |
|
|
- Use for probabilities or [0, 1] outputs |
|
|
- Never in hidden layers of deep networks |
|
|
|
|
|
5. **For RNNs/LSTMs**: **Tanh** (traditional choice) |
|
|
- Zero-centered helps with recurrent dynamics |
|
|
- Modern alternative: use Transformers instead |
|
|
|
|
|
### The Big Picture |
|
|
|
|
|
``` |
|
|
ACTIVATION FUNCTION SELECTION GUIDE |
|
|
|
|
|
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
|
|
β Is it a hidden layer? β |
|
|
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ |
|
|
β |
|
|
βββββββββββββββββ΄ββββββββββββββββ |
|
|
βΌ βΌ |
|
|
YES NO (output layer) |
|
|
β β |
|
|
βΌ βΌ |
|
|
βββββββββββββββββββ βββββββββββββββββββββββ |
|
|
β Is it a β β What's the task? β |
|
|
β Transformer? β β β |
|
|
βββββββββββββββββββ β Binary class β Sigmoid |
|
|
β β Multi-class β Softmax |
|
|
βββββββββ΄ββββββββ β Regression β Linear β |
|
|
βΌ βΌ βββββββββββββββββββββββ |
|
|
YES NO |
|
|
β β |
|
|
βΌ βΌ |
|
|
GELU βββββββββββββββββββ |
|
|
β Worried about β |
|
|
β dead neurons? β |
|
|
βββββββββββββββββββ |
|
|
β |
|
|
βββββββββ΄ββββββββ |
|
|
βΌ βΌ |
|
|
YES NO |
|
|
β β |
|
|
βΌ βΌ |
|
|
LeakyReLU ReLU |
|
|
or ELU |
|
|
``` |
|
|
|
|
|
--- |
|
|
|
|
|
## Files Generated |
|
|
|
|
|
| File | Description | |
|
|
|------|-------------| |
|
|
| exp1_gradient_flow.png | Gradient magnitude across layers | |
|
|
| exp2_sparsity_dead_neurons.png | Sparsity and dead neuron rates | |
|
|
| exp2_activation_distributions.png | Activation value distributions | |
|
|
| exp3_stability.png | Stability vs learning rate and depth | |
|
|
| exp4_representational_heatmap.png | MSE heatmap for different targets | |
|
|
| exp4_predictions.png | Actual predictions vs ground truth | |
|
|
| summary_figure.png | Comprehensive summary visualization | |
|
|
|
|
|
--- |
|
|
|
|
|
## References |
|
|
|
|
|
1. Glorot, X., & Bengio, Y. (2010). Understanding the difficulty of training deep feedforward neural networks. |
|
|
2. He, K., et al. (2015). Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification. |
|
|
3. Hendrycks, D., & Gimpel, K. (2016). Gaussian Error Linear Units (GELUs). |
|
|
4. Ramachandran, P., et al. (2017). Searching for Activation Functions. |
|
|
5. Nwankpa, C., et al. (2018). Activation Functions: Comparison of trends in Practice and Research for Deep Learning. |
|
|
|
|
|
--- |
|
|
|
|
|
*Tutorial generated by Orchestra Research Assistant* |
|
|
*All experiments are reproducible with the provided code* |
|
|
""" |
|
|
|
|
|
with open('activation_functions/activation_tutorial.md', 'w') as f: |
|
|
f.write(report) |
|
|
|
|
|
print("\nβ Saved: activation_tutorial.md") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|