2023-09-22 Differentiation

2023-09-22 Differentiation#

Last time#

Newton’s method for systems
Bratu nonlinear PDE
p-Laplacian

Today#

NLsolve Newton solver library
- p-Laplacian robustness
- diagnostics
Algorithmic differentiation via Zygote
Symbolic differentiation
Structured by-hand differentiation
Concept of PDE-based inference (inverse problems)

using Plots
default(linewidth=3)
using LinearAlgebra
using SparseArrays

function vander(x, k=nothing)
    if k === nothing
        k = length(x)
    end
    V = ones(length(x), k)
    for j = 2:k
        V[:, j] = V[:, j-1] .* x
    end
    V
end

function fdstencil(source, target, k)
    "kth derivative stencil from source to target"
    x = source .- target
    V = vander(x)
    rhs = zero(x)'
    rhs[k+1] = factorial(k)
    rhs / V
end

function my_spy(A)
    cmax = norm(vec(A), Inf)
    s = max(1, ceil(120 / size(A, 1)))
    spy(A, marker=(:square, s), c=:diverging_rainbow_bgymr_45_85_c67_n256, clims=(-cmax, cmax))
end


function advdiff_sparse(n, kappa, wind, forcing)
    x = LinRange(-1, 1, n)
    xstag = (x[1:end-1] + x[2:end]) / 2
    rhs = forcing.(x)
    kappa_stag = kappa.(xstag)
    rows = [1, n]
    cols = [1, n]
    vals = [1., 1.] # diagonals entries (float)
    rhs[[1,n]] .= 0 # boundary condition
    for i in 2:n-1
        flux_L = -kappa_stag[i-1] * fdstencil(x[i-1:i], xstag[i-1], 1) +
            wind * (wind > 0 ? [1 0] : [0 1])
        flux_R = -kappa_stag[i] * fdstencil(x[i:i+1], xstag[i], 1) +
            wind * (wind > 0 ? [1 0] : [0 1])
        weights = fdstencil(xstag[i-1:i], x[i], 1)
        append!(rows, [i,i,i])
        append!(cols, i-1:i+1)
        append!(vals, weights[1] *  [flux_L..., 0] + weights[2] * [0, flux_R...])
    end
    L = sparse(rows, cols, vals)
    x, L, rhs
end

function newton(residual, jacobian, u0; maxits=20)
    u = u0
    uhist = [copy(u)]
    normhist = []
    for k in 1:maxits
        f = residual(u)
        push!(normhist, norm(f))
        J = jacobian(u)
        delta_u = - J \ f
        u += delta_u
        push!(uhist, copy(u))
    end
    uhist, normhist
end

newton (generic function with 1 method)

\(p\)-Laplacian#

\[ -\big(|u_x|^{p-2} u_x\big)_x = b(x) \text{ in } \Omega = (-1, 1), \quad u(-1) = 0, u(1) = 1 \]

plaplace_p = 1.5
plaplace_forcing = 0. # b(x) above
function plaplace_f(u)
    n = length(u)
    h = 2 / (n - 1)
    u = copy(u)
    f = copy(u)
    u[[1, n]] = [0, 1]
    f[n] -= 1
    for i in 2:n-1
        u_xstag = diff(u[i-1:i+1]) / h
        kappa_stag = abs.(u_xstag) .^ (plaplace_p - 2)
        f[i] = [1/h, -1/h]' *
            (kappa_stag .* u_xstag) - plaplace_forcing
    end
    f
end

plaplace_f (generic function with 1 method)

function plaplace_J(u)
    n = length(u)
    h = 2 / (n - 1)
    u = copy(u)
    u[[1, n]] = [0, 1]
    rows = [1, n]
    cols = [1, n]
    vals = [1., 1.] # diagonals entries (float)
    for i in 2:n-1
        js = i-1:i+1
        u_xstag = diff(u[js]) / h
        kappa_stag = abs.(u_xstag) .^ (plaplace_p - 2)
        fi = [h, -h]' * (kappa_stag .* u_xstag)
        fi_ujs = [-kappa_stag[1]/h^2,
                  sum(kappa_stag)/h^2,
                  -kappa_stag[2]/h^2]
        mask = 1 .< js .< n
        append!(rows, [i,i,i][mask])
        append!(cols, js[mask])
        append!(vals, fi_ujs[mask])
    end
    sparse(rows, cols, vals)
end

plaplace_J (generic function with 1 method)

Try solving#

n = 20
x = collect(LinRange(-1., 1, n))
u0 = (1 .+ x) / 2

plaplace_p = 1.5 # try different values
plaplace_forcing = 1
uhist, normhist = newton(plaplace_f, plaplace_J, u0; maxits=20);
plot(normhist, marker=:circle, yscale=:log10, ylims=(1e-10, 1e3))

plot(x, uhist[1:5:end], marker=:auto, legend=:bottomright)

What’s wrong?#

Using Zygote to differentiate#

using Zygote

gradient(x -> x^2, 3)

(6.0,)

function plaplace_fpoint(u, h)
    u_xstag = diff(u) / h
    kappa_stag = abs.(u_xstag) .^ (plaplace_p - 2)
    [1/h, -1/h]' * (kappa_stag .* u_xstag)
end

gradient(u -> plaplace_fpoint(u, .1), [0., .7, 1.])

([-18.898223650461365, 47.76573710994265, -28.867513459481284],)

p-Laplacian with Zygote#

function plaplace_fzygote(u)
    n = length(u)
    h = 2 / (n - 1)
    u = copy(u)
    f = copy(u)
    u[[1, n]] = [0, 1]
    f[n] -= 1
    for i in 2:n-1
        f[i] = plaplace_fpoint(u[i-1:i+1], h) - plaplace_forcing
    end
    f
end

plaplace_fzygote (generic function with 1 method)

function plaplace_Jzygote(u)
    n = length(u)
    h = 2 / (n - 1)
    u = copy(u)
    u[[1, n]] = [0, 1]
    rows = [1, n]
    cols = [1, n]
    vals = [1., 1.] # diagonals entries (float)
    for i in 2:n-1
        js = i-1:i+1
        fi_ujs = gradient(ujs -> plaplace_fpoint(ujs, h), u[js])[1]
        mask = 1 .< js .< n
        append!(rows, [i,i,i][mask])
        append!(cols, js[mask])
        append!(vals, fi_ujs[mask])
    end
    sparse(rows, cols, vals)
end

plaplace_Jzygote (generic function with 1 method)

Experiment with parameters#

plaplace_p = 5
plaplace_forcing = 1
u0 = (x .+ 1) / 2
uhist, normhist = newton(plaplace_fzygote, plaplace_Jzygote, u0; maxits=50);
plot(normhist, marker=:auto, yscale=:log10)

plot(x, uhist, marker=:auto, legend=:topleft)

Can you fix the plaplace_J to differentiate correctly (analytically)?
What is causing Newton to diverge?
How might we fix or work around it?

A model problem for Newton divergence; compare NLsolve#

k = 30.1 # what happens as this is changed
fk(x) = tanh.(k*x)
Jk(x) = reshape(k / cosh.(k*x).^2, 1, 1)
xhist, normhist = newton(fk, Jk, [1.], maxits=10)
plot(xhist, fk.(xhist), marker=:circle, legend=:bottomright)
plot!(fk, xlims=(-3, 3), linewidth=2)

# using Pkg; Pkg.add("NLsolve")
using NLsolve

nlsolve(x -> fk(x), Jk, [4.], show_trace=true)

Iter     f(x) inf-norm    Step 2-norm 
------   --------------   --------------

     0     1.000000e+00              NaN
     1     0.000000e+00     4.000000e+00

Results of Nonlinear Solver Algorithm
 * Algorithm: Trust-region with dogleg and autoscaling
 * Starting Point: [4.0]
 * Zero: [0.0]
 * Inf-norm of residuals: 0.000000
 * Iterations: 1
 * Convergence: true
   * |x - x'| < 0.0e+00: false
   * |f(x)| < 1.0e-08: true
 * Function Calls (f): 2
 * Jacobian Calls (df/dx): 2

NLsolve for p-Laplacian#

plaplace_p = 1.45
plaplace_forcing = 1
u0 = (x .+ 1) / 2
sol = nlsolve(plaplace_fzygote, plaplace_Jzygote, u0, 
    store_trace=true)
plaplace_p = 1.3
sol = nlsolve(plaplace_fzygote, plaplace_Jzygote, sol.zero, 
    store_trace=true)
fnorms = [sol.trace[i].fnorm for i in 1:sol.iterations]
plot(fnorms, marker=:circle, yscale=:log10)

plot(x, [sol.initial_x sol.zero], label=["initial" "solution"])

What about symbolic differentiation?#

import Symbolics: Differential, expand_derivatives, @variables
@variables x
Dx = Differential(x)
y = tanh(k*x)
Dx(y)

\[ \begin{equation} \frac{\mathrm{d} \tanh\left( 30.1 x \right)}{\mathrm{d}x} \end{equation} \]

expand_derivatives(Dx(y))

\[ \begin{equation} 30.1 \left( 1 - \tanh^{2}\left( 30.1 x \right) \right) \end{equation} \]

Cool, what about composition of functions#

y = x
for _ in 1:2
    y = cos(y^pi) * log(y)
end
expand_derivatives(Dx(y))

\[ \begin{equation} \frac{\left( \frac{\cos\left( x^{\pi} \right)}{x} - 3.1416 x^{2.1416} \log\left( x \right) \sin\left( x^{\pi} \right) \right) \cos\left( \cos^{3.1416}\left( x^{\pi} \right) \left( \log\left( x \right) \right)^{3.1416} \right)}{\log\left( x \right) \cos\left( x^{\pi} \right)} - \left( \frac{3.1416 \cos^{3.1416}\left( x^{\pi} \right) \left( \log\left( x \right) \right)^{2.1416}}{x} - 9.8696 \cos^{2.1416}\left( x^{\pi} \right) \left( \log\left( x \right) \right)^{3.1416} x^{2.1416} \sin\left( x^{\pi} \right) \right) \log\left( \log\left( x \right) \cos\left( x^{\pi} \right) \right) \sin\left( \cos^{3.1416}\left( x^{\pi} \right) \left( \log\left( x \right) \right)^{3.1416} \right) \end{equation} \]

The size of these expressions can grow exponentially.

Hand coding derivatives: it’s all chain rule and associativity#

\[ df = f'(x) dx \]

function f(x)
    y = x
    for _ in 1:2
        a = y^pi
        b = cos(a)
        c = log(y)
        y = b * c
    end
    y
end

f(1.9), gradient(f, 1.9)

(-1.5346823414986814, (-34.03241959914049,))

function df(x, dx)
    y = x
    dy = dx
    for _ in 1:2
        a = y^pi
        da = pi * y^(pi-1) * dy
        b = cos(a)
        db = -sin(a) * da
        c = log(y)
        dc = 1/y * dy
        y = b * c
        dy = db * c + b * dc
    end
    dy
end

df(1.9, 1)

-34.03241959914048

We can go the other way#

We can differentiate a composition \(h(g(f(x)))\) as

(15)#\[\begin{align} \operatorname{d} h &= h' \operatorname{d} g \\ \operatorname{d} g &= g' \operatorname{d} f \\ \operatorname{d} f &= f' \operatorname{d} x. \end{align}\]

What we’ve done above is called “forward mode”, and amounts to placing the parentheses in the chain rule like

\[ \operatorname d h = \frac{dh}{dg} \left(\frac{dg}{df} \left(\frac{df}{dx} \operatorname d x \right) \right) .\]

The expression means the same thing if we rearrange the parentheses,

\[ \operatorname d h = \left( \left( \left( \frac{dh}{dg} \right) \frac{dg}{df} \right) \frac{df}{dx} \right) \operatorname d x \]

which we can compute with in reverse order via

\[ \underbrace{\bar x}_{\frac{dh}{dx}} = \underbrace{\bar g \frac{dg}{df}}_{\bar f} \frac{df}{dx} .\]

A reverse mode example#