From 488dabdadab743aa16f0299c5ba00ab42873a431 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Fri, 2 Dec 2016 19:52:38 +0100
Subject: [PATCH] Implement large matrix construction

---
 python/dune/perftool/loopy/buffer.py          |  4 +-
 python/dune/perftool/sumfact/basis.py         | 30 ++++++++-----
 python/dune/perftool/sumfact/sumfact.py       | 36 ++++++++++------
 python/dune/perftool/sumfact/vectorization.py | 42 +++++++++++++++----
 4 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py
index 84076012..feef4570 100644
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
@@ -29,9 +29,9 @@ class FlipFlopBuffer(object):
         base = self.base_storage[self._current]
 
         # Construct a temporary name
-        name = kwargs.pop("name", get_counted_variable(self.identifier))
+        name = kwargs.pop("name", None)
         if name is None:
-            from pudb import set_trace; set_trace()
+            name = get_counted_variable(self.identifier)
 
         # Get geometric dimension
         formdata = get_global_context_value('formdata')
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index fb3c3ff3..8a451a55 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -75,14 +75,14 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
 
         # Initialize the buffer for the sum fact kernel
         shape = (product(mat.cols for mat in a_matrices),)
-        if index:
+        if index is not None:
             shape = shape + (4,)
-        initialize_buffer(buffer,
-                          base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
-                          num=2
-                          ).get_temporary(shape=shape,
-                                          name=input,
-                                          )
+        input = initialize_buffer(buffer,
+                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
+                                  num=2
+                                  ).get_temporary(shape=shape,
+                                                  name=input,
+                                                  )
 
         # Setup the input!
         setup_theta(input, element, restriction, component, index)
@@ -90,14 +90,22 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
-        var, _ = sum_factorization_kernel(a_matrices,
+        if index:
+            var, _ = sum_factorization_kernel(a_matrices,
                                           buffer,
                                           1,
                                           preferred_position=i,
                                           insn_dep=frozenset({Writes(input)}),
+                                          output_name=name,
                                           )
-
-        buffers.append(var)
+        else:
+            var, _ = sum_factorization_kernel(a_matrices,
+                                          buffer,
+                                          1,
+                                          preferred_position=i,
+                                          insn_dep=frozenset({Writes(input)}),
+                                          )
+            buffers.append(var)
 
     # TODO this should be quite conditional!!!
     for i, buf in enumerate(buffers):
@@ -139,7 +147,7 @@ def pymbolic_trialfunction(element, restriction, component):
 
     # Flip flop buffers for sumfactorization
     shape = (product(mat.cols for mat in a_matrices),)
-    if vec:
+    if index is not None:
         shape = shape + (4,)
     initialize_buffer(buffer,
                       base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 6cf4bd7b..7f32ed52 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -321,14 +321,20 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         a_matrices, buffer, input, index = get_vectorization_info(a_matrices)
 
         # Initialize a base storage for this buffer and get a temporay pointing to it
-#        shape = product(mat.cols for mat in a_matrices)
-#        if vec:
-#            shape = (shape, 4)
+        shape = tuple(mat.cols for mat in a_matrices)
+        dim_tags = ",".join(['f'] * dim)
+        if index is not None:
+            shape = shape + (4,)
+            dim_tags = dim_tags + ",c"
+            index = (index,)
+        else:
+            index = ()
+
         temp = initialize_buffer(buffer,
                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                                  num=2
-                                 ).get_temporary(shape=(quadrature_points_per_direction(),) * dim,
-                                                 dim_tags=",".join(['f'] * dim),
+                                 ).get_temporary(shape=shape,
+                                                 dim_tags=dim_tags,
                                                  name=input,
                                                  )
 
@@ -346,7 +352,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
 
         # Issue an instruction in the quadrature loop that fills the buffer
         # with the evaluation of the contribution at all quadrature points
-        assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()))
+        assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + index)
         contrib_dep = instruction(assignee=assignee,
                                   expression=expression,
                                   forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
@@ -403,7 +409,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         transform(nest_quadrature_loops, visitor.inames)
 
 
-def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False, preferred_position=None):
+def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), preferred_position=None, output_name=None):
     """
     Calculate a sum factorization matrix product.
 
@@ -426,9 +432,13 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
     if get_global_context_value("dry_run", False):
         return SumfactKernel(a_matrices, buf, stage, preferred_position), frozenset()
 
-    ftags = "f,f" + (",vec" if add_vec_tag else "")
-    ctags = "c,c" + (",vec" if add_vec_tag else "")
-    vec_shape = (4,) if add_vec_tag else ()
+    ftags = "f,f"
+    ctags = "c,c"
+    vec_shape = ()
+    if isinstance(next(iter(a_matrices)), LargeAMatrix):
+        ftags = ftags + ",vec"
+        ctags = ctags + ",vec"
+        vec_shape = (4,)
 
     insn_dep = frozenset({barrier(depends_on=insn_dep,
                                   within_inames=additional_inames,
@@ -495,13 +505,15 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
     out_shape = tuple(mat.rows for mat in a_matrices)
     dim_tags = ",".join(['f'] * dim)
 
-    if add_vec_tag:
+    if isinstance(next(iter(a_matrices)), LargeAMatrix):
         out_shape = out_shape + vec_shape
         dim_tags = dim_tags + ",c"
 
     out = get_buffer_temporary(buf,
                                shape=out_shape,
-                               dim_tags=dim_tags)
+                               dim_tags=dim_tags,
+                               name=output_name,
+                               )
     silenced_warning('read_no_write({})'.format(out))
 
     return prim.Variable(out), insn_dep
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 9e4f2eb8..9155fb27 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -10,8 +10,8 @@ import loopy as lp
 
 
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, *args: a)
-def vectorization_info(a_matrices, buffer, input, index):
-    return (a_matrices, buffer, input, index)
+def vectorization_info(a_matrices, new_a_matrices, buffer, input, index):
+    return (new_a_matrices, buffer, input, index)
 
 
 def get_vectorization_info(a_matrices):
@@ -20,20 +20,21 @@ def get_vectorization_info(a_matrices):
         # Return dummy data
         return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
     try:
-        return vectorization_info(a_matrices, None, None, None)
+        return vectorization_info(a_matrices, None, None, None, None)
     except TypeError:
         raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt")
 
 
 def no_vectorization(sumfacts):
     for sumf in sumfacts:
-        vectorization_info(sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
+        vectorization_info(sumf.a_matrices, sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
 
 
 def decide_stage_vectorization_strategy(sumfacts, stage):
     stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage])
     if len(stage_sumfacts) in (3, 4):
         # Map the sum factorization to their position in the joint kernel
+        position_mapping = {}
         available = set(range(4))
         for sf in stage_sumfacts:
             if sf.preferred_position is not None:
@@ -41,16 +42,41 @@ def decide_stage_vectorization_strategy(sumfacts, stage):
                 # Later on, more complicated stuff might be necessary here.
                 assert sf.preferred_position in available
                 available.discard(sf.preferred_position)
+                position_mapping[sf] = sf.preferred_position
+
+        # Choose a position for those that have no preferred one!
+        for sumf in stage_sumfacts:
+            if sumf.preferred_position is None:
+                position_mapping[sumf] = available.pop()
 
         # Enable vectorization strategy:
         input = get_counted_variable("joined_input")
         buffer = get_counted_variable("joined_buffer")
 
+        # Collect the large matrices!
+        large_a_matrices = []
+        for i in range(len(next(iter(stage_sumfacts)).a_matrices)):
+            # Assert that the matrices of all sum factorizations have the same size
+            assert len(set(tuple(sf.a_matrices[i].rows for sf in stage_sumfacts))) == 1
+            assert len(set(tuple(sf.a_matrices[i].cols for sf in stage_sumfacts))) == 1
+
+            # Collect the transpose/derivative information
+            transpose = [False] * 4
+            derivative = [False] * 4
+            for sf in stage_sumfacts:
+                transpose[position_mapping[sf]] = sf.a_matrices[i].transpose
+                derivative[position_mapping[sf]] = sf.a_matrices[i].derivative
+
+            from dune.perftool.sumfact.amatrix import LargeAMatrix
+            large = LargeAMatrix(rows=next(iter(stage_sumfacts)).a_matrices[i].rows,
+                                 cols=next(iter(stage_sumfacts)).a_matrices[i].cols,
+                                 transpose=tuple(transpose),
+                                 derivative=tuple(derivative),
+                                 )
+            large_a_matrices.append(large)
+
         for sumf in stage_sumfacts:
-            pref_pos = sumf.preferred_position
-            if pref_pos is None:
-                pref_pos = available.pop()
-            vectorization_info(sumf.a_matrices, buffer, input, pref_pos)
+            vectorization_info(sumf.a_matrices, tuple(large_a_matrices), buffer, input, position_mapping[sumf])
     else:
         # Disable vectorization strategy
         no_vectorization(stage_sumfacts)
-- 
GitLab