From c23c9fcf32a3ffd484ef35ce63c5b86926601a4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <rene.hess@iwr.uni-heidelberg.de>
Date: Tue, 24 Jan 2017 13:06:58 +0100
Subject: [PATCH] Directly accumulate the output in stage 3 of sf

Note: Right now this is only done for FastDGGridOperator.
---
 python/dune/perftool/sumfact/sumfact.py | 135 +++++++++++++++++-------
 1 file changed, 97 insertions(+), 38 deletions(-)

diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index d56e080b..65f26ece 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                                  name=inp,
                                                  )
 
-        # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron
+        # Those input fields, that are padded need to be set to zero
+        # in order to do a horizontal_add lateron
         for pad in padding:
             instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)),
                         expression=0,
@@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                               depends_on=insn_dep,
                                               within_inames=frozenset(visitor.inames))})
 
-        # Add a sum factorization kernel that implements the multiplication
-        # with the test function (stage 3)
-        pref_pos = i if accterm.argument.index else None
-        result, insn_dep = sum_factorization_kernel(a_matrices,
-                                                    buf,
-                                                    3,
-                                                    insn_dep=insn_dep,
-                                                    additional_inames=frozenset(visitor.inames),
-                                                    preferred_position=pref_pos,
-                                                    restriction=(accterm.argument.restriction, restriction),
-                                                    )
 
         inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices))
 
@@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
         rank = 2 if visitor.inames else 1
         if rank == 2:
-            # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now)
+            # TODO the next line should get its inames from
+            # elsewhere. This is *NOT* robust (but works right now)
             ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())),
                                              (basis_functions_per_direction(),) * dim,
                                              order="f"
@@ -251,6 +242,32 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # Construct the expression representing "{r,jac}.accumulate(..)"
         accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
 
+
+        direct_output = None
+        if get_option('fastdg'):
+            ft = get_global_context_value("form_type")
+            accum = accum + ".data()"
+            if ft == 'residual' or ft == 'jacobian_apply':
+                direct_output = accum
+            else:
+                assert ft == 'jacobian'
+                direct_output = accum
+
+
+        # Add a sum factorization kernel that implements the multiplication
+        # with the test function (stage 3)
+        pref_pos = i if accterm.argument.index else None
+        result, insn_dep = sum_factorization_kernel(a_matrices,
+                                                    buf,
+                                                    3,
+                                                    insn_dep=insn_dep,
+                                                    additional_inames=frozenset(visitor.inames),
+                                                    preferred_position=pref_pos,
+                                                    restriction=(accterm.argument.restriction, restriction),
+                                                    direct_output=direct_output,
+                                                    visitor=visitor
+                                                    )
+
         # Determine the expression to accumulate with. This depends on the vectorization strategy!
         result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
         vecinames = ()
@@ -271,30 +288,38 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         if get_option('fastdg'):
             ft = get_global_context_value("form_type")
             if ft == 'residual' or ft == 'jacobian_apply':
-                accum = accum + ".data()"
-                size = basis_functions_per_direction() ** world_dimension()
-                globalarg(accum, dtype=np.float64, shape=(size,), managed=False)
-                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,))
-                expression = prim.Sum((assignee, result))
-                instruction(assignee=assignee,
-                            expression=expression,
-                            forced_iname_deps=frozenset(inames),
-                            forced_iname_deps_is_final=True,
-                            depends_on=insn_dep,
-                            )
+                if get_global_context_value("dry_run", False):
+                    shape = (basis_functions_per_direction(),) * world_dimension()
+                    ftags = ",".join(["f"]*len(shape))
+                    globalarg(accum, dtype=np.float64, shape=shape, dim_tags=ftags)
+                    assignee = prim.Subscript(prim.Variable(accum), tuple(prim.Variable(i) for i in inames))
+
+                    expression = prim.Sum((assignee, result))
+                    instruction(assignee=assignee,
+                                expression=expression,
+                                forced_iname_deps=frozenset(inames),
+                                forced_iname_deps_is_final=True,
+                                depends_on=insn_dep,
+                    )
             else:
-                assert ft == 'jacobian'
-                accum = accum + ".data()"
-                size = basis_functions_per_direction() ** world_dimension()
-                globalarg(accum, dtype=np.float64, shape=(size, size), managed=True)
-                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index, ansatz_lfs.index))
-                expression = prim.Sum((assignee, result))
-                instruction(assignee=assignee,
-                            expression=expression,
-                            forced_iname_deps=frozenset(inames + visitor.inames),
-                            forced_iname_deps_is_final=True,
-                            depends_on=insn_dep,
-                            )
+                if get_global_context_value("dry_run", False):
+                    assert ft == 'jacobian'
+                    shape = (basis_functions_per_direction(),) * (world_dimension() * 2)
+                    ftags = ",".join(["f"] * len(shape))
+                    globalarg(accum, dtype=np.float64, shape=shape, dim_tags=ftags)
+                    _test_inames = tuple(prim.Variable(i) for i in inames)
+                    # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now)
+                    _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension()))
+                    assignee = prim.Subscript(prim.Variable(accum), _ansatz_inames + _test_inames)
+                    expression = prim.Sum((assignee, result))
+
+
+                    instruction(assignee=assignee,
+                                expression=expression,
+                                forced_iname_deps=frozenset(inames + visitor.inames),
+                                forced_iname_deps_is_final=True,
+                                depends_on=insn_dep,
+                    )
         # Default: Generate accumulation instructions
         else:
             expr = Call(PDELabAccumulationFunction(accum, rank),
@@ -368,7 +393,7 @@ def _sf_flop_cost(a_matrices):
 
 
 def _sf_permutation_strategy(a_matrices, stage):
-    """Choose permutation of a_matices list based on computational cost
+    """Choose permutation of a_matrices list based on computational cost
 
     Note: If there are multiple permutations with the same cost a
     heuristic is used to pick one.
@@ -418,6 +443,8 @@ def sum_factorization_kernel(a_matrices,
                              outshape=None,
                              restriction=0,
                              direct_input=None,
+                             direct_output=None,
+                             visitor=None,
                              ):
     """Create a sum factorization kernel
 
@@ -481,6 +508,9 @@ def sum_factorization_kernel(a_matrices,
     direct_input: Global data structure containing input for
         sumfactorization (e.g. when using FastDGGridOperator).
     """
+    # Return a pymbolic SumfactKernel node in the dry run. This will
+    # be used to decide on an appropriate vectorization strategy
+    # before we do the real thing.
     if get_global_context_value("dry_run", False):
         return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
 
@@ -614,7 +644,36 @@ def sum_factorization_kernel(a_matrices,
         output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
         if l == len(a_matrices)-1:
             output_inames = _permute_backward(output_inames, perm)
-        assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
+
+        # In case of direct output we directly accumulate the result
+        # of the Sumfactorization into some global data structure.
+        if l == len(a_matrices)-1 and direct_output is not None:
+            ft = get_global_context_value("form_type")
+            novec_ftags = ",".join(["f"]*len(a_matrices))
+            if ft == 'residual' or ft == 'jacobian_apply':
+                globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
+                assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
+            else:
+                assert ft == 'jacobian'
+                globalarg(direct_output,
+                          dtype=np.float64,
+                          shape=output_shape + output_shape,
+                          dim_tags = novec_ftags + "," + novec_ftags)
+                # TODO the next line should get its inames from
+                # elsewhere. This is *NOT* robust (but works right
+                # now)
+                _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension()))
+                assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
+
+            # In case of vectorization we need to apply a horizontal add
+            if a_matrix.vectorized:
+                matprod = prim.Call(prim.Variable("horizontal_add"),
+                                    (matprod,))
+
+            # We need to accumulate
+            matprod = prim.Sum((assignee, matprod))
+        else:
+            assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
-- 
GitLab