From de423230b1cd268309961cd62b6c7a4f0774bdf8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <rene.hess@iwr.uni-heidelberg.de>
Date: Fri, 16 Dec 2016 15:21:56 +0100
Subject: [PATCH] Towards fastdg accumulation

---
 python/dune/perftool/options.py               |  2 +-
 python/dune/perftool/sumfact/amatrix.py       |  1 -
 python/dune/perftool/sumfact/basis.py         | 12 +--
 python/dune/perftool/sumfact/sumfact.py       | 89 ++++++++++++++-----
 python/dune/perftool/sumfact/vectorization.py | 10 +--
 test/sumfact/poisson/CMakeLists.txt           |  7 ++
 test/sumfact/poisson/poisson_dg.mini          |  4 +-
 test/sumfact/poisson/poisson_fastdg.mini      | 21 +++++
 8 files changed, 106 insertions(+), 40 deletions(-)
 create mode 100644 test/sumfact/poisson/poisson_fastdg.mini

diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 4cb6e8e8..5f27ede9 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -42,7 +42,7 @@ def get_form_compiler_arguments():
     parser.add_argument("--timer", action="store_true", help="measure times")
     parser.add_argument("--opcounter", action="store_true", default=False, help="Count operations. Should only be used with yaspgrid. Timer should be set.")
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
-    parser.add_argument("--fasfdg", action="store_false", help="Use FastDGGridOperator from PDELab.")
+    parser.add_argument("--fastdg", action="store_true", help="Use FastDGGridOperator from PDELab.")
     # TODO at some point this help description should be updated
     parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
     parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization")
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index c13ab899..3822ba43 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -8,7 +8,6 @@ from dune.perftool.generation import (class_member,
                                       domain,
                                       function_mangler,
                                       get_global_context_value,
-                                      globalarg,
                                       iname,
                                       include_file,
                                       initializer_list,
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index e77001d4..8401ad18 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -75,29 +75,29 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor):
 
         # Get the vectorization info. If this happens during the dry run, we get dummies
         from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, restriction)
+        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
 
         # Initialize the buffer for the sum fact kernel
         shape = (product(mat.cols for mat in a_matrices),)
         if index is not None:
             shape = shape + (4,)
-        input = initialize_buffer(buffer,
+        inp = initialize_buffer(buf,
                                   base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                                   num=2
                                   ).get_temporary(shape=shape,
-                                                  name=input,
+                                                  name=inp,
                                                   )
         if insn_dep is None:
-            insn_dep = frozenset({Writes(input)})
+            insn_dep = frozenset({Writes(inp)})
 
         # Setup the input!
-        setup_theta(input, element, restriction, component, index)
+        setup_theta(inp, element, restriction, component, index)
 
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
         var, insn_dep = sum_factorization_kernel(a_matrices,
-                                                 buffer,
+                                                 buf,
                                                  1,
                                                  preferred_position=i,
                                                  insn_dep=insn_dep,
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 797a700f..86731a14 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -58,7 +58,6 @@ from pymbolic.primitives import (Call,
                                  )
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.vectorization import find_sumfact
-from loopy import Reduction, GlobalArg
 from loopy.symbolic import FunctionIdentifier, IdentityMapper
 
 import loopy as lp
@@ -142,7 +141,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
 
         # Get the vectorization info. If this happens during the dry run, we get dummies
         from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
+        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
 
         # Initialize a base storage for this buffer and get a temporay pointing to it
         shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1)
@@ -156,12 +155,12 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
             index = ()
             vectag = frozenset()
 
-        temp = initialize_buffer(buffer,
+        temp = initialize_buffer(buf,
                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                                  num=2
                                  ).get_temporary(shape=shape,
                                                  dim_tags=dim_tags,
-                                                 name=input,
+                                                 name=inp,
                                                  )
 
         # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron
@@ -203,7 +202,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # with the test function (stage 3)
         pref_pos = i if accterm.argument.index else None
         result, insn_dep = sum_factorization_kernel(a_matrices,
-                                                    buffer,
+                                                    buf,
                                                     3,
                                                     insn_dep=insn_dep,
                                                     additional_inames=frozenset(visitor.inames),
@@ -245,18 +244,57 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                (maybe_wrap_subscript(result, prim.Variable(iname)),),
                                )
 
-        expr = Call(PDELabAccumulationFunction(accum, rank),
-                    (ansatz_lfs.get_args() +
-                     test_lfs.get_args() +
-                     (result,)
-                     )
-                    )
-        instruction(assignees=(),
-                    expression=expr,
-                    forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
-                    forced_iname_deps_is_final=True,
-                    depends_on=insn_dep,
-                    )
+        if get_option('fastdg'):
+            ft = get_global_context_value("form_type")
+            if ft=='residual':
+                globalarg(accum, dtype=np.float64, shape=(ansatz_lfs.index,), managed=False)
+                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,))
+                expression = prim.Sum((assignee,result))
+                instruction(assignee=assignee,
+                            expression=expression,
+                            forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                            forced_iname_deps_is_final=True,
+                            depends_on=insn_dep,
+                            )
+            else:
+                assert ft=='jacobian'
+                # palpo TODO: think about it
+                globalarg(accum, dtype=np.float64, shape=(ansatz_lfs.index, test_lfs.index), managed=False)
+                assignee = prim.Subscript(prim.Variable(accum), (ansatz_lfs.index, test_lfs.index))
+                expression = prim.Sum((assignee,result))
+                instruction(assignee=assignee,
+                            expression=expression,
+                            forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                            forced_iname_deps_is_final=True,
+                            depends_on=insn_dep,
+                            )
+                # expr = Call(PDELabAccumulationFunction(accum, rank),
+                #             (ansatz_lfs.get_args() +
+                #              test_lfs.get_args() +
+                #              (result,)
+                #              )
+                #             )
+                # instruction(assignees=(),
+                #             expression=expr,
+                #             forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                #             forced_iname_deps_is_final=True,
+                #             depends_on=insn_dep,
+                #             )
+
+        else:
+            expr = Call(PDELabAccumulationFunction(accum, rank),
+                        (ansatz_lfs.get_args() +
+                         test_lfs.get_args() +
+                         (result,)
+                        )
+            )
+            instruction(assignees=(),
+                        expression=expr,
+                        forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                        forced_iname_deps_is_final=True,
+                        depends_on=insn_dep,
+            )
+
 
         # Mark the transformation that moves the quadrature loop inside the trialfunction loops for application
         transform(nest_quadrature_loops, visitor.inames)
@@ -357,21 +395,24 @@ def sum_factorization_kernel(a_matrices, buf, stage,
             # a code generation corner case producing way too complicated code. This
             # could be fixed upstream, but the loopy code realizing reductions is not
             # trivial and the priority is kind of low.
-            matprod = Product((Subscript(Variable(a_matrix.name), (Variable(i), 0) + vec_iname),
-                               Subscript(Variable(inp), (0, Variable(j)) + vec_iname)
+            matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(i), 0) + vec_iname),
+                               prim.Subscript(prim.Variable(inp), (0, prim.Variable(j)) + vec_iname)
                                ))
         else:
             k = sumfact_iname(a_matrix.cols, "red")
 
             # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-            prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
-                            Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
-                            ))
-            matprod = Reduction("sum", k, prod)
+            prod = prim.Product((prim.Subscript(prim.Variable(a_matrix.name),
+                                                (prim.Variable(i), prim.Variable(k)) + vec_iname),
+                                 prim.Subscript(prim.Variable(inp),
+                                                (prim.Variable(k), prim.Variable(j)) + vec_iname)
+                                ))
+            matprod = lp.Reduction("sum", k, prod)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
+        assignee = prim.Subscript(prim.Variable(out), (prim.Variable(i), prim.Variable(j)) + vec_iname)
+        insn_dep = frozenset({instruction(assignee=assignee,
                                           expression=matprod,
                                           forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                           forced_iname_deps_is_final=True,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 100f3b21..0adcc6ab 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -13,8 +13,8 @@ import loopy as lp
 
 
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, r, *args: (a, r))
-def vectorization_info(a_matrices, restriction, new_a_matrices, buffer, input, index, padding):
-    return (new_a_matrices, buffer, input, index, padding)
+def vectorization_info(a_matrices, restriction, new_a_matrices, buf, inp, index, padding):
+    return (new_a_matrices, buf, inp, index, padding)
 
 
 def get_vectorization_info(a_matrices, restriction):
@@ -63,8 +63,8 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
                 position_mapping[sumf] = available.pop()
 
         # Enable vectorization strategy:
-        input = get_counted_variable("joined_input")
-        buffer = get_counted_variable("joined_buffer")
+        inp = get_counted_variable("joined_input")
+        buf = get_counted_variable("joined_buffer")
 
         # Collect the large matrices!
         large_a_matrices = []
@@ -88,7 +88,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
             large_a_matrices.append(large)
 
         for sumf in stage_sumfacts:
-            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buffer, input, position_mapping[sumf], frozenset(available))
+            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buf, inp, position_mapping[sumf], frozenset(available))
     else:
         # Disable vectorization strategy
         no_vectorization(stage_sumfacts)
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 08ca1ca7..1dc2483f 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -33,3 +33,10 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
                                   BASENAME sumfact_poisson_dg
                                   INIFILE poisson_dg.mini
                                   )
+
+
+# Poisson DG using FastDGGridOperator
+dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+                                  BASENAME sumfact_poisson_fastdg
+                                  INIFILE poisson_fastdg.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_dg.mini b/test/sumfact/poisson/poisson_dg.mini
index 1e1127ec..747a5b7d 100644
--- a/test/sumfact/poisson/poisson_dg.mini
+++ b/test/sumfact/poisson/poisson_dg.mini
@@ -1,10 +1,9 @@
 __name = sumfact_poisson_dg_{__exec_suffix}
-__exec_suffix = {diff_suffix}_{quadvec_suffix}_{gradvec_suffix}_{fastdg_suffix}
+__exec_suffix = {diff_suffix}_{quadvec_suffix}_{gradvec_suffix}
 
 diff_suffix = numdiff, symdiff | expand num
 quadvec_suffix = quadvec, nonquadvec | expand quad
 gradvec_suffix = gradvec, nongradvec | expand grad
-fastdg_suffix = fastdg, standarddg | expand fastdg
 
 cells = 16 16
 extension = 1. 1.
@@ -20,4 +19,3 @@ exact_solution_expression = g
 compare_l2errorsquared = 1e-4
 vectorize_quad = 1, 0 | expand quad
 vectorize_grads = 1, 0 | expand grad
-fastdg = 1, 0 | expand fastdg
diff --git a/test/sumfact/poisson/poisson_fastdg.mini b/test/sumfact/poisson/poisson_fastdg.mini
new file mode 100644
index 00000000..ddd098e0
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg.mini
@@ -0,0 +1,21 @@
+__name = sumfact_poisson_fastdg_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}_{quadvec_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+quadvec_suffix = quadvec, nonquadvec | expand quadvec
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+numerical_jacobian = 0
+sumfact = 1
+exact_solution_expression = g
+compare_l2errorsquared = 1e-4
+vectorize_quad = 1, 0 | expand quadvec
+vectorize_grads = 1, 0 | expand gradvec
+fastdg = 1
-- 
GitLab