Bfgs

Day 19: Quasi-Newton — BFGS and L-BFGS — Companion Code
"""
Day 19: Quasi-Newton — BFGS and L-BFGS — Companion Code
Phase II, Week 3

Run: python bfgs.py
"""

import numpy as np
from collections import deque
import time


# =============================================================================
# Test Functions
# =============================================================================

def rosenbrock(x):
    """Rosenbrock function value."""
    return sum(100*(x[i+1] - x[i]**2)**2 + (1-x[i])**2
               for i in range(len(x)-1))


def rosenbrock_grad(x):
    """Rosenbrock gradient."""
    n = len(x)
    g = np.zeros(n)
    for i in range(n-1):
        g[i] += -2*(1-x[i]) + 200*(x[i+1]-x[i]**2)*(-2*x[i])
        g[i+1] += 200*(x[i+1]-x[i]**2)
    return g


# =============================================================================
# Wolfe Line Search
# =============================================================================

def wolfe_line_search(f, grad, x, d, c1=1e-4, c2=0.9, max_iter=25):
    """Strong Wolfe line search.

    Returns alpha satisfying:
      f(x + alpha*d) <= f(x) + c1*alpha*g'd   (Armijo)
      |g(x+alpha*d)'d| <= c2*|g(x)'d|          (curvature)
    """
    alpha = 1.0
    alpha_lo, alpha_hi = 0.0, float('inf')
    f0 = f(x)
    g0 = grad(x)
    dg0 = g0 @ d

    if dg0 >= 0:
        return 1e-4  # Not a descent direction, use tiny step

    for _ in range(max_iter):
        x_new = x + alpha * d
        f_new = f(x_new)

        # Armijo check
        if f_new > f0 + c1 * alpha * dg0:
            alpha_hi = alpha
            alpha = 0.5 * (alpha_lo + alpha_hi)
            continue

        g_new = grad(x_new)
        dg_new = g_new @ d

        # Strong Wolfe curvature check
        if abs(dg_new) <= c2 * abs(dg0):
            return alpha

        if dg_new > 0:
            alpha_hi = alpha
        else:
            alpha_lo = alpha

        if alpha_hi < float('inf'):
            alpha = 0.5 * (alpha_lo + alpha_hi)
        else:
            alpha *= 2.0

    return alpha


# =============================================================================
# Exercise 1: Full BFGS
# =============================================================================

def bfgs(f, grad, x0, max_iter=1000, tol=1e-8):
    """Full BFGS with inverse Hessian update.

    Stores the full n×n inverse Hessian approximation.
    """
    n = len(x0)
    x = x0.copy().astype(float)
    H = np.eye(n)  # Initial inverse Hessian approximation
    g = grad(x)

    history = {"f": [], "grad_norm": [], "n_evals": 0}

    for k in range(max_iter):
        fval = f(x)
        gnorm = np.linalg.norm(g)
        history["f"].append(fval)
        history["grad_norm"].append(gnorm)

        if gnorm < tol:
            break

        # Search direction
        d = -H @ g

        # Wolfe line search
        alpha = wolfe_line_search(f, grad, x, d)
        history["n_evals"] += 1

        # Step
        s = alpha * d
        x_new = x + s
        g_new = grad(x_new)
        y = g_new - g

        # BFGS inverse Hessian update
        sy = s @ y
        if sy > 1e-10:
            rho = 1.0 / sy
            I = np.eye(n)
            V = I - rho * np.outer(s, y)
            H = V @ H @ V.T + rho * np.outer(s, s)

        x = x_new
        g = g_new

    return x, history


# =============================================================================
# Exercise 2: L-BFGS Two-Loop Recursion
# =============================================================================

def lbfgs(f, grad, x0, m=10, max_iter=1000, tol=1e-8):
    """L-BFGS with two-loop recursion.

    Stores only the last m (s, y) pairs — O(mn) memory.
    """
    x = x0.copy().astype(float)
    g = grad(x)

    s_history = deque(maxlen=m)
    y_history = deque(maxlen=m)
    rho_history = deque(maxlen=m)

    history = {"f": [], "grad_norm": [], "n_evals": 0}

    for k in range(max_iter):
        fval = f(x)
        gnorm = np.linalg.norm(g)
        history["f"].append(fval)
        history["grad_norm"].append(gnorm)

        if gnorm < tol:
            break

        # Two-loop recursion to compute H_k @ g
        q = g.copy()
        alphas = []

        # First loop (backward)
        for i in range(len(s_history) - 1, -1, -1):
            alpha_i = rho_history[i] * (s_history[i] @ q)
            alphas.insert(0, alpha_i)
            q = q - alpha_i * y_history[i]

        # Initial scaling
        if len(s_history) > 0:
            s_last = s_history[-1]
            y_last = y_history[-1]
            gamma = (s_last @ y_last) / (y_last @ y_last)
        else:
            gamma = 1.0
        r = gamma * q

        # Second loop (forward)
        for i in range(len(s_history)):
            beta = rho_history[i] * (y_history[i] @ r)
            r = r + s_history[i] * (alphas[i] - beta)

        d = -r  # Search direction

        # Wolfe line search
        alpha = wolfe_line_search(f, grad, x, d)
        history["n_evals"] += 1

        # Update
        s = alpha * d
        x_new = x + s
        g_new = grad(x_new)
        y = g_new - g

        sy = s @ y
        if sy > 1e-10:
            s_history.append(s.copy())
            y_history.append(y.copy())
            rho_history.append(1.0 / sy)

        x = x_new
        g = g_new

    return x, history


# =============================================================================
# Exercise 3: Comparison Shootout
# =============================================================================

def gradient_descent(f, grad, x0, max_iter=50000, tol=1e-8):
    """Simple GD with backtracking."""
    x = x0.copy()
    history = {"f": [], "grad_norm": []}

    for k in range(max_iter):
        fval = f(x)
        g = grad(x)
        gnorm = np.linalg.norm(g)
        history["f"].append(fval)
        history["grad_norm"].append(gnorm)

        if gnorm < tol:
            break

        # Backtracking
        alpha = 1.0
        d = -g
        for _ in range(30):
            if f(x + alpha * d) <= fval + 1e-4 * alpha * (g @ d):
                break
            alpha *= 0.5

        x = x + alpha * d

    return x, history


def shootout():
    """Compare GD, BFGS, L-BFGS on 2D Rosenbrock."""
    print("=== Exercise 3: Method Shootout (2D Rosenbrock) ===")
    x0 = np.array([-1.0, 1.0])
    x_star = np.array([1.0, 1.0])

    methods = {
        "GD": lambda: gradient_descent(rosenbrock, rosenbrock_grad, x0),
        "BFGS": lambda: bfgs(rosenbrock, rosenbrock_grad, x0),
        "L-BFGS (m=10)": lambda: lbfgs(rosenbrock, rosenbrock_grad, x0, m=10),
        "L-BFGS (m=3)": lambda: lbfgs(rosenbrock, rosenbrock_grad, x0, m=3),
    }

    print(f"  {'Method':<15}  {'Iters':>6}  {'Error':>10}  {'Time (ms)':>10}")
    for name, method in methods.items():
        t0 = time.perf_counter()
        x, hist = method()
        elapsed = (time.perf_counter() - t0) * 1000
        err = np.linalg.norm(x - x_star)
        iters = len(hist["f"])
        print(f"  {name:<15}  {iters:>6}  {err:>10.2e}  {elapsed:>10.2f}")


# =============================================================================
# Exercise 4: High-Dimensional L-BFGS
# =============================================================================

def high_dimensional_test():
    """Test L-BFGS on n=1000 extended Rosenbrock."""
    print("\n=== Exercise 4: High-Dimensional (n=1000) ===")
    n = 1000
    x0 = -np.ones(n)
    x_star = np.ones(n)

    # L-BFGS should handle this easily
    t0 = time.perf_counter()
    x_lbfgs, hist_lbfgs = lbfgs(rosenbrock, rosenbrock_grad, x0, m=10,
                                  max_iter=2000, tol=1e-6)
    t_lbfgs = (time.perf_counter() - t0) * 1000
    err_lbfgs = np.linalg.norm(x_lbfgs - x_star)

    print(f"  L-BFGS (m=10): {len(hist_lbfgs['f']):>5} iters, "
          f"err = {err_lbfgs:.2e}, time = {t_lbfgs:.0f} ms")
    print(f"  Memory: L-BFGS = {2*10*n*8 / 1024:.0f} KB "
          f"vs BFGS = {n*n*8 / (1024*1024):.0f} MB")


# =============================================================================
# Memory comparison
# =============================================================================

def memory_comparison():
    """Show the memory advantage of L-BFGS."""
    print("\n=== Memory Comparison ===")
    print(f"  {'n':>8}  {'BFGS (MB)':>12}  {'L-BFGS m=10 (KB)':>18}")
    for n in [100, 1000, 10000, 100000]:
        bfgs_mb = n * n * 8 / (1024 * 1024)
        lbfgs_kb = 2 * 10 * n * 8 / 1024
        print(f"  {n:>8}  {bfgs_mb:>12.1f}  {lbfgs_kb:>18.0f}")


# =============================================================================
# Main
# =============================================================================

if __name__ == "__main__":
    shootout()
    high_dimensional_test()
    memory_comparison()