Efficient assembly

Copyright (C) 2022 Jørgen Schartum Dokken

This file is part of Oasisx
SPDX-License-Identifier: MIT

This demo illustrates the performance differences between matrix-free and cached assembly for the Crank-Nicolson time discretization with the implicit Adams-Bashforth linearization in the tentative velocity step of the Navier-Stokes equations.
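
To fix notation, here is a sketch of the linearized right-hand side that both strategies compute, consistent with the variational forms assembled in the code below: with \(M\) the mass matrix, \(K\) the stiffness matrix and \(C(\bar{u})\) the convection matrix assembled with the Adams-Bashforth extrapolated velocity \(\bar{u}\), the vector is

\[
b = \left(\frac{1}{\Delta t} M - \frac{\nu}{2} K - \frac{1}{2} C(\bar{u})\right) u^{n-1},
\]

where \(u^{n-1}\) is the velocity component from the previous time step and \(\Delta t\) is the time step. The two strategies differ only in how this matrix-vector product is evaluated.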

We start by importing the necessary modules

from mpi4py import MPI
from petsc4py import PETSc

import dolfinx
import dolfinx.fem.petsc
import numpy as np
import pandas
import seaborn
import ufl

We define a function that takes in a mesh, the order P of the Lagrange function space for the scalar component of the velocity field, the number of times we should time the assembly, and jit_options for just-in-time compilation of the variational forms.

For the matrix-vector strategy, we only time the modification of the pre-assembled convection matrix, the addition of the scaled mass and stiffness matrices, and the matrix-vector product, as the matrix A is needed anyway for the LHS assembly in the fractional step method.

For the action strategy we use ufl.action to create the variational form for the RHS vector and use the generated code for the assembly.
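
As a brief illustration of what ufl.action does, here is a minimal sketch using the modules imported above (the mesh, space and variable names are only for illustration and are not part of the benchmark): replacing the trial function of a bilinear form with a known Function turns it into a linear form.

example_mesh = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, 4, 4)
Q = dolfinx.fem.functionspace(example_mesh, ("Lagrange", 1))
p, q = ufl.TrialFunction(Q), ufl.TestFunction(Q)
a_mass = ufl.inner(p, q) * ufl.dx  # bilinear form in (p, q)
w = dolfinx.fem.Function(Q)  # known coefficient
L_mass = ufl.action(a_mass, w)  # linear form in q only: p is replaced by w
assert len(L_mass.arguments()) == 1  # only the test function remains

Assembling L_mass as a vector gives the same result as assembling the matrix of a_mass and multiplying it with w, which is exactly the equivalence checked with np.allclose in the function below.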

This demo does not consider any Dirichlet boundary conditions.

We add some arbitrary data to the variables dt, nu, u_1 and u_ab, as we are not solving a full problem here.

def assembly(mesh, P: int, repeats: int, jit_options: dict = None):
    V = dolfinx.fem.functionspace(mesh, ("Lagrange", int(P)))
    dt = 0.5
    nu = 0.3
    u = ufl.TrialFunction(V)
    v = ufl.TestFunction(V)

    # Solution from previous time step
    u_1 = dolfinx.fem.Function(V)
    u_1.interpolate(lambda x: np.sin(x[0]) * np.cos(x[1]))

    # Define variational forms
    mass = ufl.inner(u, v) * ufl.dx
    stiffness = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx

    u_ab = [dolfinx.fem.Function(V, name=f"u_ab{i}") for i in range(mesh.geometry.dim)]
    convection = ufl.inner(ufl.dot(ufl.as_vector(u_ab), ufl.nabla_grad(u)), v) * ufl.dx
    for u_abi in u_ab:
        u_abi.interpolate(lambda x: x[0])

    # Compile forms for matrix vector products
    jit_options = {} if jit_options is None else jit_options
    mass_form = dolfinx.fem.form(mass, jit_options=jit_options)
    stiffness_form = dolfinx.fem.form(stiffness, jit_options=jit_options)
    convection_form = dolfinx.fem.form(convection, jit_options=jit_options)

    # Compile form for vector assembly (action)
    dt_inv = dolfinx.fem.Constant(mesh, 1.0 / dt)
    dt_inv.name = "dt_inv"
    nu_c = dolfinx.fem.Constant(mesh, nu)
    nu_c.name = "nu"
    lhs = dt_inv * mass - 0.5 * nu_c * stiffness - 0.5 * convection
    lhs = dolfinx.fem.form(ufl.action(lhs, u_1), jit_options=jit_options)

    # Assemble time independent matrices
    # Mass matrix
    M = dolfinx.fem.petsc.create_matrix(mass_form)
    M.setOption(PETSc.Mat.Option.SYMMETRIC, True)
    M.setOption(PETSc.Mat.Option.SYMMETRY_ETERNAL, True)
    M.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, True)
    dolfinx.fem.petsc.assemble_matrix(M, mass_form)
    M.assemble()
    M.setOption(PETSc.Mat.Option.NEW_NONZERO_LOCATIONS, False)

    # Stiffness matrix
    K = dolfinx.fem.petsc.create_matrix(stiffness_form)
    K.setOption(PETSc.Mat.Option.SYMMETRIC, True)
    K.setOption(PETSc.Mat.Option.SYMMETRY_ETERNAL, True)
    K.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, True)
    dolfinx.fem.petsc.assemble_matrix(K, stiffness_form)
    K.assemble()
    K.setOption(PETSc.Mat.Option.NEW_NONZERO_LOCATIONS, False)

    # Create time dependent matrix (convection matrix)
    A = dolfinx.fem.petsc.create_matrix(mass_form)
    A.setOption(PETSc.Mat.Option.IGNORE_ZERO_ENTRIES, True)

    # Vector for matrix vector product
    b = dolfinx.fem.Function(V)

    t_matvec = np.zeros((repeats, mesh.comm.size), dtype=np.float64)
    t_action = np.zeros((repeats, mesh.comm.size), dtype=np.float64)
    for i in range(repeats):
        # Zero out time-dependent matrix
        A.zeroEntries()

        # Add convection term
        dolfinx.fem.petsc.assemble_matrix(A, convection_form)
        A.assemble()

        # Do mat-vec operations
        with dolfinx.common.Timer(f"~{P} {i} Matvec strategy") as _:
            A.scale(-0.5)
            A.axpy(1.0 / dt, M)
            A.axpy(-0.5 * nu, K)
            A.mult(u_1.x.petsc_vec, b.x.petsc_vec)
            b.x.scatter_forward()

        # Compute the vector without using pre-generated matrices
        b_d = dolfinx.fem.Function(V)
        with dolfinx.common.Timer(f"~{P} {i} Action strategy") as _:
            dolfinx.fem.petsc.assemble_vector(b_d.x.petsc_vec, lhs)
            b_d.x.scatter_reverse(dolfinx.la.InsertMode.add)
            b_d.x.scatter_forward()
        # Compare results
        assert np.allclose(b.x.array, b_d.x.array)

        # Get timings
        matvec = dolfinx.common.timing(f"~{P} {i} Matvec strategy")
        action = dolfinx.common.timing(f"~{P} {i} Action strategy")
        t_matvec[i, :] = mesh.comm.allgather(matvec[1])
        t_action[i, :] = mesh.comm.allgather(action[1])

    return V.dofmap.index_map_bs * V.dofmap.index_map.size_global, t_matvec, t_action
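
For reference, the function can be called on its own; a minimal sketch, assuming a small cube and the default JIT options (the names and parameters below are only illustrative):

if __name__ == "__main__":
    small_mesh = dolfinx.mesh.create_unit_cube(MPI.COMM_WORLD, 5, 5, 5)
    num_dofs, t_mv, t_ac = assembly(small_mesh, P=1, repeats=1)
    # t_mv and t_ac have shape (repeats, number of MPI processes)
    print(f"{num_dofs} dofs: matvec {t_mv.mean():.2e} s, action {t_ac.mean():.2e} s")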

We solve the problem on a unit cube that is split into tetrahedra, with Nx, Ny and Nz cells in the x-, y- and z-direction, respectively.

def run_parameter_sweep(
    Nx: int, Ny: int, Nz: int, repeats: int, min_degree: int, max_degree: int
) -> dict:
    # Information regarding optimization flags can be found at:
    # https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
    jit_options = {"cffi_extra_compile_args": ["-march=native", "-O3"]}

    mesh = dolfinx.mesh.create_unit_cube(MPI.COMM_WORLD, Nx, Ny, Nz)
    Ps = np.arange(min_degree, max_degree + 1, dtype=np.int32)
    j = 0
    results = {}
    for i, P in enumerate(Ps):
        dof, matvec, action = assembly(mesh, P, repeats=repeats, jit_options=jit_options)
        for row in matvec:
            for process in row:
                results[j] = {
                    "P": P,
                    "num_dofs": dof,
                    "method": "matvec",
                    "time (s)": process,
                    "procs": MPI.COMM_WORLD.size,
                }
                j += 1
        for row in action:
            for process in row:
                results[j] = {
                    "P": P,
                    "num_dofs": dof,
                    "method": "action",
                    "time (s)": process,
                    "procs": MPI.COMM_WORLD.size,
                }
                j += 1
    return results

We use pandas and seaborn to visualize the results

def create_plot(results: dict, outfile: str):
    if MPI.COMM_WORLD.rank == 0:
        df = pandas.DataFrame.from_dict(results, orient="index")
        df["label"] = (
            "P"
            + df["P"].astype(str)
            + " "
            + df["num_dofs"].astype(str)
            + " \n Comms: "
            + df["procs"].astype(str)
        )
        plot = seaborn.catplot(data=df, kind="swarm", x="label", y="time (s)", hue="method")
        plot.set(yscale="log")
        import matplotlib.pyplot as plt

        plt.grid()
        plt.savefig(outfile)

We start by running the comparison for an increasing number of degrees of freedom on a fixed grid.

if __name__ == "__main__":
    results_p = run_parameter_sweep(30, 25, 23, repeats=3, min_degree=1, max_degree=4)
    create_plot(results_p, "P_sweep.png")
(Figure: P_sweep.png, swarm plot of the assembly time for the matrix-vector and action strategies for P = 1 to 4.)

We observe that for all the runs, independent of the degree \(P\), the action strategy is significantly faster than the matrix-vector strategy.

We note that the run for \(P=1\) is relatively small, and we therefore rerun \(P=1\) on a larger mesh.

if __name__ == "__main__":
    results_p1 = run_parameter_sweep(50, 40, 45, 3, 1, 1)
    create_plot(results_p1, "P1.png")
(Figure: P1.png, swarm plot of the assembly time for the two strategies for \(P=1\) on the larger mesh.)

We observe that the run-times of both strategies for \(P = 1\) are more or less the same for larger matrices.