From 31e8390329bd8c8a87ef7d563f22764e5f552f33 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@web.de>
Date: Thu, 18 Jan 2018 15:41:28 +0100
Subject: [PATCH] use plain array while creating the kernel, not afterwards in
 the vectorization

---
 .../dune/perftool/blockstructured/__init__.py |  11 +-
 .../perftool/blockstructured/accumulation.py  |  51 +++++++
 .../dune/perftool/blockstructured/argument.py |  31 ++++-
 .../perftool/blockstructured/vectorization.py | 126 ------------------
 python/dune/perftool/pdelab/localoperator.py  |   8 +-
 5 files changed, 94 insertions(+), 133 deletions(-)
 create mode 100644 python/dune/perftool/blockstructured/accumulation.py

diff --git a/python/dune/perftool/blockstructured/__init__.py b/python/dune/perftool/blockstructured/__init__.py
index 9657e81c..830221bb 100644
--- a/python/dune/perftool/blockstructured/__init__.py
+++ b/python/dune/perftool/blockstructured/__init__.py
@@ -3,6 +3,7 @@ import dune.perftool.blockstructured.argument
 import dune.perftool.blockstructured.geometry
 import dune.perftool.blockstructured.spaces
 import dune.perftool.blockstructured.basis
+from dune.perftool.options import get_option
 
 from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position
 # from dune.perftool.pdelab.geometry import to_global
@@ -18,6 +19,7 @@ from dune.perftool.blockstructured.tools import sub_element_inames
 from dune.perftool.pdelab import PDELabInterface
 
 
+
 class BlockStructuredInterface(PDELabInterface):
     def __init__(self):
         PDELabInterface.__init__(self)
@@ -25,10 +27,17 @@ class BlockStructuredInterface(PDELabInterface):
     #
     # Local function space related generator functions
     #
+    def generate_accumulation_instruction(self, expr, visitor):
+        if get_option("vectorization_blockstructured"):
+            from dune.perftool.blockstructured.accumulation import generate_accumulation_instruction
+            return generate_accumulation_instruction(expr, visitor)
+        else:
+            from dune.perftool.pdelab.localoperator import generate_accumulation_instruction
+            return generate_accumulation_instruction(expr, visitor)
 
     # TODO current way to squeeze subelem iname in, not really ideal
     def lfs_inames(self, element, restriction, number=None, context=''):
-        return lfs_inames(element, restriction, number, context) + sub_element_inames()
+        return sub_element_inames() + lfs_inames(element, restriction, number, context)
 
     #
     # Test and trial function related generator functions
diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py
new file mode 100644
index 00000000..5a8a0232
--- /dev/null
+++ b/python/dune/perftool/blockstructured/accumulation.py
@@ -0,0 +1,51 @@
+from dune.perftool.generation import temporary_variable, instruction
+from dune.perftool.options import get_option
+from dune.perftool.pdelab.localoperator import determine_accumulation_space
+from dune.perftool.pdelab.argument import name_accumulation_variable
+from dune.perftool.pdelab.localoperator import boundary_predicates
+import pymbolic.primitives as prim
+
+
+def define_accumulation_alias(container, accumspace):
+    k = get_option("number_of_blocks")
+    p = accumspace.element.degree()
+    temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1),
+                       base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
+    code = "{} = &{}.container()({},0);".format(container + "_alias", container, accumspace.lfs.name)
+    instruction(within_inames=frozenset(),
+                code=code,
+                read_variables=frozenset({container}),
+                assignees=frozenset({container+"_alias"}))
+
+
+def name_accumulation_alias(accumvar, accumspace):
+    name = accumvar+"_alias"
+    define_accumulation_alias(accumvar, accumspace)
+    return name
+
+
+def generate_accumulation_instruction(expr, visitor):
+    # Collect the lfs and lfs indices for the accumulate call
+    test_lfs = determine_accumulation_space(visitor.test_info, 0)
+    # In the jacobian case, also determine the space for the ansatz space
+    ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1)
+
+    # Collect the lfs and lfs indices for the accumulate call
+    accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+    accumvar_alias = name_accumulation_alias(accumvar, test_lfs)
+
+    predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id)
+
+    quad_inames = visitor.interface.quadrature_inames()
+    lfs_inames = visitor.test_info.inames
+    if visitor.trial_info:
+        lfs_inames = lfs_inames + visitor.trial_info.inames
+
+    assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames))
+
+    instruction(assignee=assignee,
+                expression=prim.Sum((expr, assignee)),
+                forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
+                forced_iname_deps_is_final=True,
+                predicates=predicates
+                )
diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py
index 7ca3a94c..b40dcbac 100644
--- a/python/dune/perftool/blockstructured/argument.py
+++ b/python/dune/perftool/blockstructured/argument.py
@@ -1,10 +1,30 @@
 from dune.perftool.generation import (backend,
                                       kernel_cached,
-                                      valuearg)
+                                      valuearg, temporary_variable, instruction)
+from dune.perftool.options import get_option
 from dune.perftool.pdelab.argument import CoefficientAccess
-from dune.perftool.blockstructured.tools import micro_index_to_macro_index
+from dune.perftool.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
 from loopy.types import NumpyType
 import pymbolic.primitives as prim
+import loopy as lp
+
+
+def define_alias(container, lfs, element):
+    k = get_option("number_of_blocks")
+    p = element.degree()
+    temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1),
+                       base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
+    code = "{} = &{}({},0);".format(container+"_alias",container, lfs.name)
+    instruction(within_inames=frozenset(),
+                code=code,
+                read_variables=frozenset({container}),
+                assignees=frozenset({container+"_alias"}))
+
+
+def name_alias(container, lfs, element):
+    define_alias(container, lfs, element)
+    name = container+"_alias"
+    return name
 
 
 # TODO remove the need for element
@@ -20,4 +40,9 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
 
     # use higher order FEM index instead of Q1 index
-    return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    if get_option("vectorization_blockstructured"):
+        subelem_inames = sub_element_inames()
+        coeff_alias = name_alias(container, lfs, element)
+        return prim.Subscript(lp.TaggedVariable(coeff_alias, 'coeff_alias'), tuple(prim.Variable(i) for i in subelem_inames+index))
+    else:
+        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
index b6a83142..19112ab9 100644
--- a/python/dune/perftool/blockstructured/vectorization.py
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -1,128 +1,2 @@
-import loopy as lp
-import pymbolic.primitives as prim
-from dune.perftool.generation import temporary_variable, instruction, kernel_cached, get_counter
-from dune.perftool.options import get_option
-from dune.perftool.pdelab.argument import CoefficientAccess, PDELabAccumulationFunction
-
-vector_temporaries = set()
-
-
-def define_vector(name, code, inames, read_variables):
-    temporary_variable(name, base_storage='dummy'+name, managed=True, _base_storage_access_may_be_aliasing=True)
-    instruction(code=code,
-                inames=inames,
-                read_variables=read_variables,
-                assignees=frozenset({name}))
-
-
-def name_vector(expr, insn, iname):
-    func = expr.function
-    parameters = ', '.join(tuple(str(p) for p in expr.parameters[0:2]))
-    if isinstance(func, CoefficientAccess):
-        name = func.name
-    elif isinstance(func, PDELabAccumulationFunction):
-        name = func.accumobj
-    else:
-        return None
-    vector_name = name+"_vec"
-
-    vector_temporaries.add(vector_name)
-
-    c_alias = "{} = &{}({});".format(vector_name,name,parameters)
-    define_vector(vector_name, c_alias, insn.within_inames - frozenset({iname}), insn.read_dependency_names()-frozenset({iname}))
-    return vector_name
-
-
-def add_vectortype_alias(knl, iname):
-    assert(isinstance(knl, lp.LoopKernel))
-    from loopy.symbolic import WalkMapper
-    class SubstLocalVector(WalkMapper):
-        def __init__(self):
-            self.localvectors = ()
-
-        def add_localvector(self, expr):
-            if isinstance(expr, prim.Call):
-                if isinstance(expr.function, CoefficientAccess):
-                    localvector = expr.function.name
-                elif isinstance(expr.function, PDELabAccumulationFunction):
-                    localvector = expr.function.accumobj
-                else:
-                    return True
-                self.localvectors += ((localvector, expr.parameters[0:2]))
-            return True
-
-        post_visit = add_localvector
-
-    # find insn with localvectors in their expression
-    insn_with_localvectors = dict()
-    for insn in knl.instructions:
-        if isinstance(insn, lp.MultiAssignmentBase):
-            SM = SubstLocalVector()
-            SM(insn.expression)
-            if SM.localvectors:
-                insn_with_localvectors[insn.id] = SM.localvectors
-
-    # add temporaries
-    temporaries_names = set((lv[0]+"_vec" for lv in insn_with_localvectors.values()))
-    new_temporaries = dict()
-    for temporary_name in temporaries_names:
-        from dune.perftool.loopy.temporary import DuneTemporaryVariable
-        # temporary name unique?
-        k = get_option("number_of_blocks")
-        new_temporaries[temporary_name] = \
-            DuneTemporaryVariable(temporary_name, base_storage="dummy_"+temporary_name, managed=True,
-                                  _base_storage_access_may_be_aliasing=True, shape=(1,),
-                                  scope=lp.temp_var_scope.PRIVATE, read_only=True)
-    knl = knl.copy(temporary_variables={**knl.temporary_variables, **new_temporaries})
-
-    # set temporaries to localvector
-    temporaries_insns = []
-    for insn_id, localvector in insn_with_localvectors.items():
-        insn = knl.id_to_insn[insn_id]
-
-        if isinstance(insn, lp.CallInstruction):
-            code = "{} = &{}.container()({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0]))
-        else:
-            code = "{} = &{}({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0]))
-
-        temporaries_insns.append(lp.CInstruction(id='insn_{}'.format(str(get_counter('__insn_id')).zfill(4)),
-                                                 iname_exprs=frozenset(),
-                                                 within_inames=frozenset(),
-                                                 code=code,
-                                                 read_variables=frozenset({localvector[0]}),
-                                                 assignees=frozenset({localvector[0]+'_vec'})))
-    knl = knl.copy(instructions=knl.instructions+temporaries_insns)
-
-    new_insns = []
-    for insn in knl.instructions:
-        if insn.id in insn_with_localvectors:
-            localvector = insn_with_localvectors[insn.id]
-            if isinstance(insn, lp.CallInstruction):
-                expr = insn.expression.parameters[-1]
-                insn_dict = insn.__dict__
-                insn_dict['assignee'] = prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
-                insn_dict['expression'] = expr + prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
-                insn_dict['temp_var_type'] = ()
-                insn_dict.pop('assignees')
-                insn_dict.pop('temp_var_types')
-                new_insns.append(lp.Assignment(**insn_dict))
-            else:
-                from loopy.symbolic import SubstitutionMapper
-                class ReplaceCall(SubstitutionMapper):
-                    def map_call(self, expr):
-                        if isinstance(expr.function, CoefficientAccess):
-                            return prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
-                        else:
-                            return super.map_call(expr)
-                new_insns.append(insn.with_transformed_expressions(ReplaceCall(lambda e: None)))
-        else:
-            new_insns.append(insn.copy()) #vielleicht auf dependencies achten??
-    knl = knl.copy(instructions=new_insns)
-
-    return knl
-
-
 def vectorize_micro_elements(knl):
-    if "subel_x" in knl.all_inames():
-        knl = add_vectortype_alias(knl, "subel_x_inner")
     return knl
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 64a96ed3..78dda3e6 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -260,6 +260,7 @@ def determine_accumulation_space(info, number):
     return AccumulationSpace(lfs=lfs,
                              index=lfsi,
                              restriction=info.restriction,
+                             element=element
                              )
 
 
@@ -525,7 +526,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
                          silenced_warnings=silenced,
                          name=name,
                          )
-
     from loopy import make_reduction_inames_unique
     kernel = make_reduction_inames_unique(kernel)
 
@@ -551,8 +551,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     from dune.perftool.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
-    from dune.perftool.blockstructured.vectorization import vectorize_micro_elements
-    kernel = vectorize_micro_elements(kernel)
     # Maybe apply vectorization strategies
     if get_option("vectorization_quadloop"):
         if get_option("sumfact"):
@@ -560,6 +558,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
             kernel = vectorize_quadrature_loop(kernel)
         else:
             raise NotImplementedError("Only vectorizing sumfactorized code right now!")
+    if get_option("vectorization_blockstructured"):
+        from dune.perftool.blockstructured.vectorization import vectorize_micro_elements
+        kernel = vectorize_micro_elements(kernel)
+
 
     # Now add the preambles to the kernel
     preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))]
-- 
GitLab