From 144d1c8143a085e40ab84763abce12796c2c202f Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Fri, 28 Oct 2016 13:41:49 +0200
Subject: [PATCH] Include stage 3, with a lot of dummies etc.

---
 python/dune/perftool/pdelab/localoperator.py |  7 ++-
 python/dune/perftool/sumfact/sumfact.py      | 50 ++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 48998b88..fca6efac 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -2,12 +2,14 @@ from __future__ import absolute_import
 from os.path import splitext
 
 from dune.perftool.options import get_option
-from dune.perftool.generation import (base_class,
+from dune.perftool.generation import (backend,
+                                      base_class,
                                       class_basename,
                                       class_member,
                                       constructor_parameter,
                                       domain,
                                       dump_accumulate_timer,
+                                      get_backend,
                                       global_context,
                                       iname,
                                       include_file,
@@ -361,6 +363,7 @@ def grad_iname(index, dim):
     return name
 
 
+@backend(interface="accum_insn")
 def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
     # First we do the tree traversal to get a pymbolic expression representing this expression
     pymbolic_expr = visitor(accterm.term)
@@ -467,7 +470,7 @@ def generate_kernel(integrals):
                 interface = PDELabInterface()
             from dune.perftool.ufl.visitor import UFL2LoopyVisitor
             visitor = UFL2LoopyVisitor(interface, measure, indexmap)
-            generate_accumulation_instruction(visitor, term, measure, subdomain_id)
+            get_backend(interface="accum_insn")(visitor, term, measure, subdomain_id)
 
     # Extract the information, which is needed to create a loopy kernel.
     # First extracting it, might be useful to alter it before kernel generation.
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 37be6d7d..a9c7271a 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -4,9 +4,12 @@ from dune.perftool.pdelab.argument import (name_coefficientcontainer,
 from dune.perftool.generation import (backend,
                                       domain,
                                       get_counter,
+                                      get_global_context_value,
+                                      globalarg,
                                       iname,
                                       instruction,
                                       silenced_warning,
+                                      temporary_variable,
                                       )
 from dune.perftool.loopy.buffer import (get_buffer_temporary,
                                         initialize_buffer,
@@ -82,6 +85,53 @@ def pymbolic_trialfunction(element, restriction, component):
     return Subscript(Variable(var), tuple(Variable(i) for i in quadrature_inames()))
 
 
+@backend(interface="accum_insn", name="sumfact")
+def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
+    pymbolic_expr = visitor(accterm.term)
+
+    if pymbolic_expr == 0:
+        return
+
+    # Get geometric dimension
+    formdata = get_global_context_value('formdata')
+    dim = formdata.geometric_dimension
+
+    # Get a matrix that holds the result of the pymbolic_expr at all quadrature points
+    name = "blubb"
+    temporary_variable(name, shape=(quadrature_points_per_direction(),) * dim, managed=True)
+    instruction(assignee=Subscript(Variable(name), tuple(Variable(i) for i in quadrature_inames())),
+                expression=pymbolic_expr,
+                forced_iname_deps=frozenset(quadrature_inames()),
+                forced_iname_deps_is_final=True,
+                depends_on=frozenset({stage_insn(1)}),
+                )
+
+    theta_transposed = name_theta_transposed()
+    rows = basis_functions_per_direction()
+    cols = quadrature_points_per_direction()
+    a_matrix = AMatrix(theta_transposed, rows, cols)
+    a_matrices = (a_matrix, a_matrix)
+
+    # Do stage 1
+    initialize_buffer("reffub",
+                      base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
+                      num=2
+                      )
+
+    result = sum_factorization_kernel(a_matrices, "reffub", 2)
+
+    from dune.perftool.pdelab.spaces import LFSLocalIndex
+    # Now write all this into the correct residual
+    inames = tuple(sumfact_iname(mat.rows, 'accum') for mat in a_matrices)
+    globalarg("r", shape=(basis_functions_per_direction() ^ dim,))
+    instruction(expression=Subscript(Variable(result), tuple(Variable(i) for i in inames)),
+#                 assignee=Subscript(Variable("r"), (Call(LFSLocalIndex("lfs"), tuple(Variable(i) for i in inames)),)),
+                assignee=Subscript(Variable("r"), (0,)),
+                forced_iname_deps=frozenset(inames),
+                forced_iname_deps_is_final=True,
+                depends_on=frozenset({stage_insn(3)}),
+                )
+
 #     # Do stage 3 (for f=u => mass matrix)
 #     theta_transposed = name_theta_transposed()
 #     a_matrix_transposed = AMatrix(theta_transposed, cols, rows)
-- 
GitLab