"""
Day 14: Week 2 Comprehensive Exercise Set — Companion Code
Phase I, Week 2

Covers: gradient checking, sparse tridiagonal, AD Jacobians,
condition number study, preconditioned CG, Schur complement.

Run: python exercise_set_2.py
"""

import numpy as np
import time
from scipy import sparse
from scipy.sparse import linalg as sp_linalg


# =============================================================================
# Problem 1: Gradient Checker
# =============================================================================

def check_gradient(f, grad_f, x, h=1e-7, tol=1e-5):
    """Check analytic gradient against central differences.
    
    Returns:
        passed: bool
        errors: array of relative errors per component
    """
    n = len(x)
    g_analytic = grad_f(x)
    g_numerical = np.zeros(n)

    for i in range(n):
        x_plus = x.copy()
        x_minus = x.copy()
        x_plus[i] += h
        x_minus[i] -= h
        g_numerical[i] = (f(x_plus) - f(x_minus)) / (2 * h)

    # Relative error with denominator safety
    denom = np.maximum(np.abs(g_analytic) + np.abs(g_numerical), 1e-15)
    errors = np.abs(g_analytic - g_numerical) / denom
    passed = np.all(errors < tol)

    return passed, errors


def problem_1():
    """Gradient checker demo."""
    print("=== Problem 1: Gradient Checker ===")

    rng = np.random.default_rng(42)
    A = rng.randn(5, 3)
    b = rng.randn(5)

    f = lambda x: np.linalg.norm(A @ x - b) ** 2
    grad_f_correct = lambda x: 2 * A.T @ (A @ x - b)
    grad_f_buggy = lambda x: -2 * A.T @ (A @ x - b)  # sign error

    x = rng.randn(3)

    passed, errors = check_gradient(f, grad_f_correct, x)
    print(f"  Correct gradient: PASS={passed}, max_err={np.max(errors):.2e}")

    passed, errors = check_gradient(f, grad_f_buggy, x)
    print(f"  Buggy gradient:   PASS={passed}, max_err={np.max(errors):.2e}")


# =============================================================================
# Problem 2: Sparse Tridiagonal System
# =============================================================================

def problem_2():
    """Sparse tridiagonal solve comparison."""
    print("\n=== Problem 2: Sparse Tridiagonal ===")
    n = 50000

    # Create tridiagonal: 4 on diag, -1 on off-diags
    A = sparse.diags([-1, 4, -1], [-1, 0, 1], shape=(n, n), format='csc')
    rng = np.random.default_rng(42)
    b = rng.randn(n)

    # (a) Sparse direct
    t0 = time.perf_counter()
    x_direct = sp_linalg.spsolve(A, b)
    t_direct = time.perf_counter() - t0

    # (b) SciPy CG
    t0 = time.perf_counter()
    x_cg, info_cg = sp_linalg.cg(A, b, tol=1e-10, maxiter=1000)
    t_cg = time.perf_counter() - t0
    err_cg = np.linalg.norm(x_cg - x_direct) / np.linalg.norm(x_direct)

    # (c) PCG with Jacobi
    diag_vals = A.diagonal()
    M_inv = sp_linalg.LinearOperator((n, n), matvec=lambda v: v / diag_vals)
    t0 = time.perf_counter()
    x_pcg, info_pcg = sp_linalg.cg(A, b, M=M_inv, tol=1e-10, maxiter=1000)
    t_pcg = time.perf_counter() - t0
    err_pcg = np.linalg.norm(x_pcg - x_direct) / np.linalg.norm(x_direct)

    print(f"  n = {n}")
    print(f"  Direct:  {t_direct*1000:>8.1f} ms")
    print(f"  CG:      {t_cg*1000:>8.1f} ms, info={info_cg}, err={err_cg:.2e}")
    print(f"  PCG:     {t_pcg*1000:>8.1f} ms, info={info_pcg}, err={err_pcg:.2e}")


# =============================================================================
# Problem 3: Forward-Mode AD Jacobian
# =============================================================================

class Dual:
    """Dual number for forward-mode AD."""
    def __init__(self, val, deriv=0.0):
        self.val = val
        self.deriv = deriv

    def __add__(self, other):
        other = other if isinstance(other, Dual) else Dual(other)
        return Dual(self.val + other.val, self.deriv + other.deriv)

    def __radd__(self, other):
        return self.__add__(other)

    def __mul__(self, other):
        other = other if isinstance(other, Dual) else Dual(other)
        return Dual(self.val * other.val,
                    self.val * other.deriv + self.deriv * other.val)

    def __rmul__(self, other):
        return self.__mul__(other)

    def __truediv__(self, other):
        other = other if isinstance(other, Dual) else Dual(other)
        return Dual(self.val / other.val,
                    (self.deriv * other.val - self.val * other.deriv) / other.val**2)

    def __pow__(self, n):
        return Dual(self.val ** n, n * self.val ** (n - 1) * self.deriv)

    def __neg__(self):
        return Dual(-self.val, -self.deriv)

    def __sub__(self, other):
        return self + (-other)

    def __rsub__(self, other):
        return (-self) + other


def dual_sin(d):
    return Dual(np.sin(d.val), np.cos(d.val) * d.deriv)

def dual_cos(d):
    return Dual(np.cos(d.val), -np.sin(d.val) * d.deriv)

def dual_exp(d):
    e = np.exp(d.val)
    return Dual(e, e * d.deriv)


def f_vec(x1, x2, x3):
    """Vector function for Jacobian test."""
    f1 = x1**2 + dual_sin(x2 * x3)
    f2 = dual_exp(x1) * dual_cos(x2)
    f3 = x3 / (1 + x1**2)
    return [f1, f2, f3]


def problem_3():
    """Forward-mode AD Jacobian."""
    print("\n=== Problem 3: AD Jacobian ===")
    x = [1.0, 2.0, 3.0]

    # Forward-mode AD: seed each input direction
    J_ad = np.zeros((3, 3))
    for j in range(3):
        duals = [Dual(x[i], 1.0 if i == j else 0.0) for i in range(3)]
        result = f_vec(duals[0], duals[1], duals[2])
        for i in range(3):
            J_ad[i, j] = result[i].deriv

    # Numerical Jacobian (central differences)
    def f_numpy(xv):
        return np.array([
            xv[0]**2 + np.sin(xv[1] * xv[2]),
            np.exp(xv[0]) * np.cos(xv[1]),
            xv[2] / (1 + xv[0]**2),
        ])

    J_num = np.zeros((3, 3))
    h = 1e-7
    xv = np.array(x)
    for j in range(3):
        ep = xv.copy(); em = xv.copy()
        ep[j] += h; em[j] -= h
        J_num[:, j] = (f_numpy(ep) - f_numpy(em)) / (2 * h)

    print("  AD Jacobian:")
    for row in J_ad:
        print(f"    [{', '.join(f'{v:>9.5f}' for v in row)}]")

    print("  Numerical Jacobian:")
    for row in J_num:
        print(f"    [{', '.join(f'{v:>9.5f}' for v in row)}]")

    print(f"  Max difference: {np.max(np.abs(J_ad - J_num)):.2e}")


# =============================================================================
# Problem 4: Condition Number Study
# =============================================================================

def simple_gd(H, b, max_iter=100000, tol=1e-6):
    """Gradient descent on 0.5 x^T H x - b^T x."""
    eigvals = np.linalg.eigvalsh(H)
    lr = 2.0 / (eigvals[-1] + eigvals[0])
    x_true = np.linalg.solve(H, b)

    x = np.zeros(len(b))
    for k in range(max_iter):
        g = H @ x - b
        if np.linalg.norm(x - x_true) / np.linalg.norm(x_true) < tol:
            return k
        x = x - lr * g
    return max_iter


def simple_cg(H, b, tol=1e-6):
    """CG iteration count to tolerance."""
    n = len(b)
    x_true = np.linalg.solve(H, b)
    x = np.zeros(n)
    r = b.copy()
    p = r.copy()
    rr = r @ r

    for k in range(2 * n):
        Hp = H @ p
        alpha = rr / (p @ Hp)
        x += alpha * p
        r -= alpha * Hp
        rr_new = r @ r
        if np.linalg.norm(x - x_true) / np.linalg.norm(x_true) < tol:
            return k + 1
        beta = rr_new / rr
        p = r + beta * p
        rr = rr_new
    return 2 * n


def problem_4():
    """Condition number vs convergence study."""
    print("\n=== Problem 4: Condition Number Study ===")
    n = 100
    rng = np.random.default_rng(42)
    Q, _ = np.linalg.qr(rng.randn(n, n))
    b = rng.randn(n)

    print(f"  {'κ':>10} {'Digits':>8} {'GD iters':>10} {'CG iters':>10}")
    print(f"  {'-'*10} {'-'*8} {'-'*10} {'-'*10}")

    for log_k in [1, 2, 3, 4, 5, 6]:
        kappa = 10 ** log_k
        eigvals = np.linspace(1, kappa, n)
        H = Q @ np.diag(eigvals) @ Q.T

        # Precision
        x_true = np.linalg.solve(H, b)
        x_check = np.linalg.solve(H, H @ x_true)
        rel_err = np.linalg.norm(x_check - x_true) / np.linalg.norm(x_true)
        digits = max(0, -np.log10(rel_err + 1e-20))

        # GD iterations (cap at 50000)
        gd_iters = simple_gd(H, b, max_iter=50000)

        # CG iterations
        cg_iters = simple_cg(H, b)

        print(f"  10^{log_k:<5} {digits:>8.1f} {gd_iters:>10} {cg_iters:>10}")


# =============================================================================
# Problem 5: Preconditioning Deep Dive
# =============================================================================

def problem_5():
    """PCG with various preconditioners."""
    print("\n=== Problem 5: Preconditioning Deep Dive ===")
    n = 100
    rng = np.random.default_rng(42)
    Q, _ = np.linalg.qr(rng.randn(n, n))
    eigvals = np.logspace(0, 4, n)  # κ = 10^4
    A = Q @ np.diag(eigvals) @ Q.T
    b = rng.randn(n)
    x_true = np.linalg.solve(A, b)

    # No preconditioning
    x1, info1 = sp_linalg.cg(A, b, tol=1e-10, maxiter=n)
    err1 = np.linalg.norm(x1 - x_true) / np.linalg.norm(x_true)

    # Jacobi preconditioning
    diag_A = np.diag(A)
    M_jacobi = sp_linalg.LinearOperator(
        (n, n), matvec=lambda v: v / diag_A)
    x2, info2 = sp_linalg.cg(A, b, M=M_jacobi, tol=1e-10, maxiter=n)
    err2 = np.linalg.norm(x2 - x_true) / np.linalg.norm(x_true)

    # "Perfect" preconditioning: M = A
    A_inv = np.linalg.inv(A)
    M_perfect = sp_linalg.LinearOperator(
        (n, n), matvec=lambda v: A_inv @ v)
    x3, info3 = sp_linalg.cg(A, b, M=M_perfect, tol=1e-10, maxiter=n)
    err3 = np.linalg.norm(x3 - x_true) / np.linalg.norm(x_true)

    # Effective condition numbers
    kappa_none = eigvals[-1] / eigvals[0]
    D_inv_sqrt = np.diag(1.0 / np.sqrt(diag_A))
    A_jacobi = D_inv_sqrt @ A @ D_inv_sqrt
    ev_jacobi = np.linalg.eigvalsh(A_jacobi)
    kappa_jacobi = ev_jacobi[-1] / ev_jacobi[0]
    kappa_perfect = 1.0  # M^{-1}A = I

    print(f"  κ(A)       = {kappa_none:.0f}")
    print(f"  No precond:  info={info1}, err={err1:.2e}")
    print(f"  Jacobi:      info={info2}, err={err2:.2e}, κ_eff={kappa_jacobi:.0f}")
    print(f"  Perfect:     info={info3}, err={err3:.2e}, κ_eff={kappa_perfect:.0f}")


# =============================================================================
# Problem 6: Schur Complement Solver
# =============================================================================

def problem_6():
    """Schur complement solve for BA-like structure."""
    print("\n=== Problem 6: Schur Complement ===")
    rng = np.random.default_rng(42)

    nc, nl = 10, 90  # camera params, landmark params
    n = nc + nl

    # B: camera-camera block (dense SPD)
    B_raw = rng.randn(nc, nc)
    B = B_raw @ B_raw.T + 5 * np.eye(nc)

    # C: block-diagonal landmark block (each 2x2 SPD)
    C = np.zeros((nl, nl))
    for i in range(0, nl, 2):
        c_raw = rng.randn(2, 2)
        C[i:i+2, i:i+2] = c_raw @ c_raw.T + 3 * np.eye(2)

    # E: camera-landmark coupling
    E = rng.randn(nc, nl) * 0.5

    # Full system
    H = np.block([[B, E], [E.T, C]])
    g = rng.randn(n)

    # (a) Full dense solve
    x_full = np.linalg.solve(H, g)

    # (b) Schur complement: eliminate landmarks
    # S = B - E C^{-1} E^T
    C_inv = np.linalg.inv(C)
    S = B - E @ C_inv @ E.T

    # Reduced RHS: g_c - E C^{-1} g_l
    g_c = g[:nc]
    g_l = g[nc:]
    g_reduced = g_c - E @ C_inv @ g_l

    # Solve for cameras
    x_c = np.linalg.solve(S, g_reduced)

    # Back-substitute for landmarks
    x_l = C_inv @ (g_l - E.T @ x_c)

    x_schur = np.concatenate([x_c, x_l])

    err = np.linalg.norm(x_schur - x_full) / np.linalg.norm(x_full)
    print(f"  Full system: {n}×{n}")
    print(f"  Schur complement: {nc}×{nc}")
    print(f"  Solutions match: {err < 1e-10} (err = {err:.2e})")


# =============================================================================
# Problem 8: Mini-SLAM Integration
# =============================================================================

def problem_8():
    """Mini-SLAM: assemble info matrix, fix gauge, solve."""
    print("\n=== Problem 8: Mini-SLAM Integration ===")
    rng = np.random.default_rng(42)

    n_poses = 5
    n_landmarks = 20
    pose_dim = 3  # [x, y, theta]
    lm_dim = 2    # [lx, ly]
    n_total = n_poses * pose_dim + n_landmarks * lm_dim

    # Generate random poses and landmarks
    poses = np.cumsum(rng.randn(n_poses, pose_dim) * [1.0, 1.0, 0.1], axis=0)
    landmarks = rng.randn(n_landmarks, 2) * 5

    # Build information matrix from observations
    H = np.zeros((n_total, n_total))
    g = np.zeros(n_total)

    # Odometry factors (between consecutive poses)
    sigma_odom = np.diag([0.1, 0.1, 0.01])
    info_odom = np.linalg.inv(sigma_odom @ sigma_odom)

    for i in range(n_poses - 1):
        idx = slice(i * pose_dim, (i + 2) * pose_dim)
        # Simplified: info matrix connects pose i and i+1
        J_local = np.zeros((pose_dim, 2 * pose_dim))
        J_local[:, :pose_dim] = -np.eye(pose_dim)
        J_local[:, pose_dim:] = np.eye(pose_dim)
        H[idx, idx] += J_local.T @ info_odom @ J_local

    # Observation factors (each pose observes some landmarks)
    sigma_obs = np.diag([0.05, 0.05])
    info_obs = np.linalg.inv(sigma_obs @ sigma_obs)

    for i in range(n_poses):
        pose_idx = i * pose_dim
        for j in range(n_landmarks):
            if rng.random() < 0.3:  # 30% visibility
                lm_idx = n_poses * pose_dim + j * lm_dim
                # Simplified bearing-range Jacobian
                J = np.zeros((lm_dim, pose_dim + lm_dim))
                J[:, :2] = -np.eye(2)  # d/d(pose x,y)
                J[:, pose_dim:pose_dim + lm_dim] = np.eye(2)  # d/d(landmark)

                # Add to H
                rows = list(range(pose_idx, pose_idx + 2)) + \
                       list(range(lm_idx, lm_idx + lm_dim))
                for ri, r in enumerate(rows):
                    for ci, c in enumerate(rows):
                        block = J.T @ info_obs @ J
                        if ri < block.shape[0] and ci < block.shape[1]:
                            H[r, c] += block[ri, ci]

    # Check conditioning before gauge fix
    eigvals_pre = np.linalg.eigvalsh(H)
    n_zero = np.sum(np.abs(eigvals_pre) < 1e-10)

    # Fix gauge: anchor pose 0
    for k in range(pose_dim):
        H[k, k] += 100.0  # strong prior on pose 0

    eigvals_post = np.linalg.eigvalsh(H)
    kappa_post = eigvals_post[-1] / max(eigvals_post[0], 1e-15)

    # Solve with CG
    g = rng.randn(n_total) * 0.1  # small perturbation as RHS
    x_direct = np.linalg.solve(H, g)
    x_cg, info_cg = sp_linalg.cg(H, g, tol=1e-10, maxiter=500)
    err = np.linalg.norm(x_cg - x_direct) / np.linalg.norm(x_direct)

    print(f"  System size: {n_total}×{n_total}")
    print(f"  Before gauge fix: {n_zero} near-zero eigenvalues")
    print(f"  After gauge fix: κ = {kappa_post:.0f}")
    print(f"  CG info={info_cg}, rel_error={err:.2e}")


# =============================================================================
# Main
# =============================================================================

if __name__ == "__main__":
    problem_1()
    problem_2()
    problem_3()
    problem_4()
    problem_5()
    problem_6()
    problem_8()