Skip to content
Snippets Groups Projects
Commit ead49c7e authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/tensor-product-sf' into 'master'

Feature/tensor product sf

See merge request !101
parents 3116c6a6 fc427d44
No related branches found
No related tags found
No related merge requests found
......@@ -339,12 +339,37 @@ def sum_factorization_kernel(a_matrices,
restriction=0,
direct_input=None,
):
"""
Calculate a sum factorization matrix product.
"""Create a sum factorization kernel
Sum factorization can be written as
Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
with:
- X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
- Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
- A_l: Values of 1D basis evaluations at quadrature points in l
direction, matrix of dimension m_l x n_l
- R_l: Transformation operator that permutes the underlying data
vector of the rank d tensor in such a way that the fastest
direction becomes the slowest direction
In the l'th step we have the following setup:
- A_l: Matrix of dimensions m_l x n_l
- X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
- R_l: Transformation operator
Y = A_{d-1}*...*A_0*X
Looking at the indices the following will happen:
X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
where X is the input matrix and Y is the output variable.
So the multiplication with A_l is reduction over one index and the
transformation brings the next reduction index in the fastest
position.
Note: In the code below the transformation step is directly done
in the reduction instruction by adapting the assignee!
Arguments:
----------
......@@ -353,22 +378,33 @@ def sum_factorization_kernel(a_matrices,
Order of application is from 0 up.
buf: A string identifying the flip flop buffer in use
for intermediate results. The memory is expected to be
pre-initialized with the input.
insn_dep: an instruction ID that the first issued instruction
pre-initialized with the input or you have to provide
direct_input (FastDGGridOperator).
insn_dep: An instruction ID that the first issued instruction
should depend upon. All following ones will depend on each
other.
additional_inames: Instructions will be executed within those
inames (needed for stage 3 in jacobians).
preferred_position: Will be used in the dry run to order kernels
when doing vectorization e.g. (dx u,dy u,dz u, u).
outshape: Shape of the output.
restriction: Restriction for face values.
direct_input: Global data structure containing input for
sumfactorization (e.g. when using FastDGGridOperator).
"""
if get_global_context_value("dry_run", False):
return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
ftags = "f,f"
ctags = "c,c"
ftags = ",".join(["f"]*len(a_matrices))
ctags = ",".join(["c"]*len(a_matrices))
vec_shape = ()
if next(iter(a_matrices)).vectorized:
ftags = ftags + ",vec"
ctags = ctags + ",vec"
vec_shape = (4,)
# Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4:
timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(stage)
post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
......@@ -377,17 +413,18 @@ def sum_factorization_kernel(a_matrices,
depends_on=insn_dep,
within_inames=additional_inames)})
# Put a barrier before the sumfactorization kernel
insn_dep = frozenset({barrier(depends_on=insn_dep,
within_inames=additional_inames,
)})
# Product of all matrices
for l, a_matrix in enumerate(a_matrices):
# Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
# and get inames that realize the product.
inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
i = sumfact_iname(out_shape[0], "row")
j = sumfact_iname(out_shape[1], "col")
inp_shape = (a_matrix.cols,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
out_shape = (a_matrix.rows,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
vec_iname = ()
if a_matrix.vectorized:
iname = sumfact_iname(4, "vec")
......@@ -404,19 +441,20 @@ def sum_factorization_kernel(a_matrices,
else:
k_expr = 0
# Setup the input of the sum factorization kernel. In the first matrix multiplication
# this can be taken from
# Setup the input of the sum factorization kernel. In the
# first matrix multiplication this can be taken from
# * an input temporary (default)
# * a global data structure (if FastDGGridOperator is in use)
# * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
if l == 0 and direct_input is not None:
globalarg(direct_input, dtype=np.float64, shape=inp_shape)
if a_matrix.vectorized:
input_summand = prim.Call(prim.Variable("Vec4d"), (prim.Subscript(prim.Variable(direct_input),
(k_expr, prim.Variable(j))),))
input_summand = prim.Call(prim.Variable("Vec4d"),
(prim.Subscript(prim.Variable(direct_input),
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])),))
else:
input_summand = prim.Subscript(prim.Variable(direct_input),
(k_expr, prim.Variable(j)) + vec_iname)
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
else:
# Get a temporary that interprets the base storage of the input
# as a column-major matrix. In later iteration of the amatrix loop
......@@ -429,35 +467,47 @@ def sum_factorization_kernel(a_matrices,
silenced_warning('read_no_write({})'.format(inp))
input_summand = prim.Subscript(prim.Variable(inp),
(k_expr, prim.Variable(j)) + vec_iname)
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
switch_base_storage(buf)
# Get a temporary that interprets the base storage of the output
# as row-major matrix
# Get a temporary that interprets the base storage of the output.
#
# Note: In this step the reordering of the fastest directions
# is happening. The new direction (out_inames[0]) and the
# corresponding shape (out_shape[0]) goes to the end (slowest
# direction) and everything stays column major (ftags->fortran
# style).
out = get_buffer_temporary(buf,
shape=out_shape + vec_shape,
dim_tags=ctags)
shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape,
dim_tags=ftags)
# Write the matrix-matrix multiplication expression
matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(i), k_expr) + vec_iname),
input_summand
))
# matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
# (prim.Variable(i), k_expr) + vec_iname),
# input_summand))
matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
(prim.Variable(out_inames[0]), k_expr) + vec_iname),
input_summand))
# ... which may be a reduction, if k>0
if a_matrix.cols != 1:
# ... which may be a reduction, if k>0
matprod = lp.Reduction("sum", k, matprod)
# Here we also move the new direction (out_inames[0]) to the end
assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname)
# Issue the reduction instruction that implements the multiplication
# at the same time store the instruction ID for the next instruction to depend on
assignee = prim.Subscript(prim.Variable(out), (prim.Variable(i), prim.Variable(j)) + vec_iname)
insn_dep = frozenset({instruction(assignee=assignee,
expression=matprod,
forced_iname_deps=frozenset({i, j}).union(additional_inames),
forced_iname_deps=frozenset([iname for iname in out_inames]).union(additional_inames),
forced_iname_deps_is_final=True,
depends_on=insn_dep,
)
})
# Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4:
insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
depends_on=insn_dep,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment