Cleanup and improve documentation

fc427d44 · René Heß · a7382743 · fc427d44
Commit fc427d44 authored 8 years ago by René Heß
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -339,12 +339,37 @@ def sum_factorization_kernel(a_matrices,
                             restriction=0,
                             direct_input=None,
                             ):
-    """Calculate a sum factorization tensor product.
+    """Create a sum factorization kernel

-    Y = A_{d-1}*...*A_0*X
+    Sum factorization can be written as

-    where X is the input tensor and Y is the output variable. This is
-    done using matrices and reinterpreting the data structures.
+    Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
+
+    with:
+    - X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
+    - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
+    - A_l: Values of 1D basis evaluations at quadrature points in l
+      direction, matrix of dimension m_l x n_l
+    - R_l: Transformation operator that permutes the underlying data
+      vector of the rank d tensor in such a way that the fastest
+      direction gets the slowest direction
+
+    In the l'th step we have the following setup:
+    - A_l: Matrix of dimensions m_l x n_l
+    - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
+    - R_l: Transformation operator
+
+    Looking at the indizes the following will happen:
+    X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
+    A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+    R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+
+    So the multiplication with A_l is reduction over one index and the
+    transformation brings the next reduction index in the fastest
+    position.
+
+    Note: In the code below the transformation step is directly done
+    in the reduction instruction by adapting the assignee!

    Arguments:
    ----------
@@ -366,6 +391,7 @@ def sum_factorization_kernel(a_matrices,
    restriction: Restriction for faces values.
    direct_input: Global data structure containing input for
        sumfactorization (e.g. when using FastDGGridOperator).
+
    """
    if get_global_context_value("dry_run", False):
        return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
@@ -396,12 +422,8 @@ def sum_factorization_kernel(a_matrices,
    for l, a_matrix in enumerate(a_matrices):
        # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
        # and get inames that realize the product.
-        # inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
-        # out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
        inp_shape = (a_matrix.cols,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
        out_shape = (a_matrix.rows,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
-        # i = sumfact_iname(out_shape[0], "row")
-        # j = sumfact_iname(out_shape[1], "col")
        out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
        vec_iname = ()
        if a_matrix.vectorized:
@@ -449,8 +471,13 @@ def sum_factorization_kernel(a_matrices,

        switch_base_storage(buf)

-        # Get a temporary that interprets the base storage of the output
-        # as row-major matrix
+        # Get a temporary that interprets the base storage of the output.
+        #
+        # Note: In this step the reordering of the fastest directions
+        # is happening. The new direction (out_inames[0]) and the
+        # corresponding shape (out_shape[0]) goes to the end (slowest
+        # direction) and everything stays column major (ftags->fortran
+        # style).
        out = get_buffer_temporary(buf,
                                   shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape,
                                   dim_tags=ftags)
@@ -467,10 +494,11 @@ def sum_factorization_kernel(a_matrices,
        if a_matrix.cols != 1:
            matprod = lp.Reduction("sum", k, matprod)

+        # Here we also move the new direction (out_inames[0]) to the end
+        assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname)

        # Issue the reduction instruction that implements the multiplication
        # at the same time store the instruction ID for the next instruction to depend on
-        assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname)
        insn_dep = frozenset({instruction(assignee=assignee,
                                          expression=matprod,
                                          forced_iname_deps=frozenset([iname for iname in out_inames]).union(additional_inames),