From bea763e55677e55d79f8fcb5041b9e492f9a9fc4 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Fri, 6 May 2016 17:24:59 +0200
Subject: [PATCH] Poisson DG is working! + Shaped temporaries

---
 python/dune/perftool/loopy/transformer.py    | 94 +++++++++++++++-----
 python/dune/perftool/pdelab/basis.py         | 20 ++---
 python/dune/perftool/pdelab/localoperator.py |  8 ++
 python/dune/perftool/pymbolic/placeholder.py | 64 +++++++++++++
 4 files changed, 156 insertions(+), 30 deletions(-)
 create mode 100644 python/dune/perftool/pymbolic/placeholder.py

diff --git a/python/dune/perftool/loopy/transformer.py b/python/dune/perftool/loopy/transformer.py
index 70cea9f9..d5efa83b 100644
--- a/python/dune/perftool/loopy/transformer.py
+++ b/python/dune/perftool/loopy/transformer.py
@@ -8,6 +8,10 @@ from __future__ import absolute_import
 from dune.perftool import Restriction
 from dune.perftool.ufl.modified_terminals import ModifiedTerminalTracker
 from dune.perftool.pymbolic.uflmapper import UFL2PymbolicMapper
+from dune.perftool.pymbolic.placeholder import (IndexPlaceholder,
+                                                IndexPlaceholderExtraction,
+                                                IndexPlaceholderRemoval,
+                                                )
 from dune.perftool.pdelab.geometry import GeometryMapper
 from dune.perftool.generation import (domain,
                                       get_temporary_name,
@@ -35,24 +39,63 @@ def index_sum_iname(i):
     return name_index(i)
 
 
+_outerloop = None
+
+
+def set_outerloop(v):
+    global _outerloop
+    _outerloop = v
+
+
+def get_outerloop():
+    return _outerloop
+
+
 class UFL2LoopyVisitor(ModifiedTerminalTracker, UFL2PymbolicMapper, GeometryMapper):
     def __init__(self, measure):
-        self.iname_stack = [quadrature_iname()]
         self.measure = measure
         super(UFL2LoopyVisitor, self).__init__()
 
     def _assign(self, o):
         # In some corner cases we do not even need a temporary variable
-        if isinstance(o, int):
+        if isinstance(o, int) or isinstance(o, float):
             return o
 
         # Assign a name to the temporary variable we want our result in
         temp = get_temporary_name()
+        temp_shape = ()
+
+        # Determine which inames this assignment depends on and whether it should
+        # be merged into the main accumulation loop. Right now we apply the following
+        # procedure: All instructions that depend on all argument loop indices are
+        # merged into the main loop nest. Those instructions that depend on some
+        # argument loop indices but not all are merged into the kernel by fixing the
+        # loop ordering of the main loop (or are pulled outside if this already happened).
+        assignee = Variable(temp)
+        iname_deps = self.inames
+        merge_into_main_loopnest = True
+
+        if self.rank == 2 and len(set(iname_deps)) == 1:
+            if get_outerloop() is None:
+                set_outerloop(iname_deps[0].number)
+            if iname_deps[0].number != get_outerloop():
+                merge_into_main_loopnest = False
+
+        # Change the assignee!
+        if not merge_into_main_loopnest:
+            assignee_index_placeholder = IndexPlaceholderExtraction()(o).pop()
+            assignee_index = IndexPlaceholderRemoval(duplicate_inames=True)(assignee_index_placeholder)
+            assignee = Subscript(assignee, (assignee_index,))
+            temp_shape = (name_lfs_bound(name_lfs(assignee_index_placeholder.element, assignee_index_placeholder.restriction)),)
+
+        # Now introduce duplicate inames for the argument loop if necessary
+        replaced_iname_deps = [IndexPlaceholderRemoval(duplicate_inames=not merge_into_main_loopnest, wrap_in_variable=False)(i) for i in iname_deps]
+        replaced_expr = IndexPlaceholderRemoval(duplicate_inames=not merge_into_main_loopnest)(o)
 
         # Now we assign this expression to a new temporary variable
-        insn_id = instruction(assignee=Variable(temp),
-                              expression=o,
-                              forced_iname_deps=frozenset({i for i in self.iname_stack}),
+        insn_id = instruction(assignee=assignee,
+                              expression=replaced_expr,
+                              forced_iname_deps=frozenset({i for i in replaced_iname_deps}).union(frozenset({quadrature_iname()})),
                               forced_iname_deps_is_final=True,
                               )
 
@@ -60,12 +103,24 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker, UFL2PymbolicMapper, GeometryMapp
         from dune.perftool.generation import retrieve_cache_items
         temp = filter(lambda i: i.id == insn_id, retrieve_cache_items("instruction"))[0].assignee_name
 
+        retvar = Variable(temp)
+        if not merge_into_main_loopnest:
+            retvar_index = IndexPlaceholderRemoval()(assignee_index_placeholder)
+            retvar = Subscript(retvar, (retvar_index,))
+
         # Now that we know its exact name, declare the temporary
-        temporary_variable(temp)
+        temporary_variable(temp, shape=temp_shape)
 
-        return Variable(temp)
+        return retvar
 
     def __call__(self, o):
+        # Determine the rank of the term
+        from dune.perftool.ufl.rank import ufl_rank
+        self.rank = ufl_rank(o)
+
+        # And initialize a list of found inames
+        self.inames = []
+
         # First we do the tree traversal to get a pymbolic expression representing this expression
         ret = self.call(o)
 
@@ -81,15 +136,14 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker, UFL2PymbolicMapper, GeometryMapp
             restriction = Restriction.NEGATIVE
 
         # Have the issued instruction depend on the iname for this localfunction space
-        from dune.perftool.pdelab.basis import lfs_iname
-        self.iname_stack.append(lfs_iname(o.element(), restriction, argcount=o.number()))
+        self.inames.append(IndexPlaceholder(o.element(), restriction, o.number()))
 
         if self.grad:
             from dune.perftool.pdelab.argument import name_testfunction_gradient
-            return Subscript(Variable(name_testfunction_gradient(o, restriction)), (Variable(lfs_iname(o.element(), restriction, argcount=o.number())),))
+            return Subscript(Variable(name_testfunction_gradient(o, restriction)), (IndexPlaceholder(o.element(), restriction, o.number()),))
         else:
             from dune.perftool.pdelab.argument import name_testfunction
-            return Subscript(Variable(name_testfunction(o, restriction)), (Variable(lfs_iname(o.element(), restriction, argcount=o.number())),))
+            return Subscript(Variable(name_testfunction(o, restriction)), (IndexPlaceholder(o.element(), restriction, o.number()),))
 
     def coefficient(self, o):
         # If this is a trialfunction, we do something entirely different
@@ -132,17 +186,22 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker, UFL2PymbolicMapper, GeometryMapp
         from loopy import Reduction
         from dune.perftool.ufl.shape import determine_shape
 
-        inames = ()
+        oldinames = self.inames
+        self.inames = []
+
+        red_inames = ()
         # Define an iname for each of the indices in the multiindex
         for i in o.ufl_operands[1].indices():
-            inames = inames + (index_sum_iname(i),)
+            red_inames = red_inames + (index_sum_iname(i),)
             shape = determine_shape(o.ufl_operands[0], i)
             from dune.perftool.pdelab import name_index
             domain(name_index(i), shape)
 
         # Recurse to get the summation expression
         term = self.call(o.ufl_operands[0])
-        ret = self._assign(Reduction("sum", inames, term))
+        ret = self._assign(Reduction("sum", red_inames, term))
+
+        self.inames = self.inames + oldinames
 
         return ret
 
@@ -182,9 +241,7 @@ def transform_accumulation_term(term, measure, subdomain_id):
             icount = 0
 
         lfs = name_lfs(ma.argexpr.element(), ma.restriction)
-#         force_duplicate_iname = (count == 1)
-#         lfsi = lfs_iname(ma.argexpr.element(), ma.restriction, force_duplicate_iname=force_duplicate_iname)
-        lfsi = lfs_iname(ma.argexpr.element(), ma.restriction, argcount=count)
+        lfsi = lfs_iname(ma.argexpr.element(), ma.restriction, count=count)
 
         accumargs[2 * icount] = lfs
         accumargs[2 * icount + 1] = lfsi
@@ -196,9 +253,6 @@ def transform_accumulation_term(term, measure, subdomain_id):
     from dune.perftool.pdelab.argument import name_accumulation_variable
     accumvar = name_accumulation_variable(arg_restr)
 
-#     if len(test_ma) == 2:
-#         from IPython import embed; embed()
-
     # The residual/the jacobian should be represented through a loopy global argument
     # TODO this seems still a bit hacky, esp. w.r.t. systems
     for rs in residual_shape:
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index e94b0554..303e860f 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -132,19 +132,16 @@ def traverse_lfs_tree(arg):
 
 @generator_factory(item_tags=("iname",), cache_key_generator=lambda e, r, c: (e, c))
 def _lfs_iname(element, restriction, context):
-    name = name_lfs(element, restriction)
-    bound = name_lfs_bound(name)
-
-    if context:
-        context = '_' + context
+    lfs = name_lfs(element, restriction)
+    bound = name_lfs_bound(lfs)
 
-    name = name + context + '_index'
+    name = "{}_{}_index".format(lfs, context)
     domain(name, bound)
 
     return name
 
 
-def lfs_iname(element, restriction, context='', argcount=0):
+def lfs_iname(element, restriction, count=None, context=''):
     """ Get the iname to iterate over the local function space given by element
 
     Arguments:
@@ -158,9 +155,12 @@ def lfs_iname(element, restriction, context='', argcount=0):
         a given purpose, see the 'Loops and dependencies' of the loopy docs:
         https://documen.tician.de/loopy/tutorial.html#loops-and-dependencies
     """
-    if argcount != 0:
-        context = '{}{}_arg{}'.format(context, '_' if len(context) else '', argcount)
-
+    assert not ((context == '') and (count is None))
+    if count is not None:
+        if context != '':
+            context = "{}_{}".format(count, context)
+        else:
+            context = str(count)
     return _lfs_iname(element, restriction, context)
 
 
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 1162f9e8..8335f8b0 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -284,6 +284,10 @@ def generate_localoperator_kernels(formdata, namedata):
         with global_context(form_type='residual'):
             # Generate the necessary residual methods
             for integral in form.integrals():
+                # Reset the outer loop
+                from dune.perftool.loopy.transformer import set_outerloop
+                set_outerloop(None)
+
                 with global_context(integral_type=integral.integral_type()):
                     enum_pattern()
                     pattern_baseclass()
@@ -316,6 +320,10 @@ def generate_localoperator_kernels(formdata, namedata):
 
             with global_context(form_type="jacobian"):
                 for integral in jacform.integrals():
+                    # Reset the outer loop
+                    from dune.perftool.loopy.transformer import set_outerloop
+                    set_outerloop(None)
+
                     with global_context(integral_type=integral.integral_type()):
                         kernel = generate_kernel(integral)
                     operator_kernels[(integral.integral_type(), 'jacobian')] = kernel
diff --git a/python/dune/perftool/pymbolic/placeholder.py b/python/dune/perftool/pymbolic/placeholder.py
new file mode 100644
index 00000000..c96765b1
--- /dev/null
+++ b/python/dune/perftool/pymbolic/placeholder.py
@@ -0,0 +1,64 @@
+from pymbolic.mapper import Collector, IdentityMapper
+from pymbolic.primitives import Variable
+
+
+class IndexPlaceholder(object):
+    def __init__(self, element, restriction, number, context=''):
+        self.element = element
+        self.restriction = restriction
+        self.number = number
+        self.context = context
+
+    def __hash__(self):
+        return hash((self.element, self.restriction, self.number, self.context))
+
+    def __eq__(self, o):
+        return (self.element == o.element) and (self.restriction == o.restriction) and (self.number == o.number) and (self.context == o.context)
+
+
+class IndexPlaceholderRemoval(IdentityMapper):
+    def __init__(self, wrap_in_variable=True, duplicate_inames=False):
+        self.duplicate_inames = duplicate_inames
+        self.wrap_in_variable = wrap_in_variable
+        super(IndexPlaceholderRemoval, self).__init__()
+
+    def map_foreign(self, o):
+        # How do we map constants here? map_constant was not correct
+        if isinstance(o, int) or isinstance(o, float):
+            return o
+
+        # There might be tuples of indices where only one is a placeholder,
+        # so we recurse manually into the tuple.
+        if isinstance(o, tuple):
+            return tuple(self.rec(op) for op in o)
+
+        # We only handle IndexPlaceholder instances from now on
+        assert isinstance(o, IndexPlaceholder)
+
+        context = o.context
+        from dune.perftool.loopy.transformer import get_outerloop
+        if (self.duplicate_inames) and (o.number != get_outerloop()):
+            context = 'dup'
+
+        from dune.perftool.pdelab.basis import lfs_iname
+        i = lfs_iname(o.element, o.restriction, count=o.number, context=context)
+        if self.wrap_in_variable:
+            return Variable(i)
+        else:
+            return i
+
+    def map_reduction(self, o):
+        from loopy import Reduction
+        o.expr = self.rec(o.expr)
+        return o
+
+
+class IndexPlaceholderExtraction(Collector):
+    def map_foreign(self, o):
+        if isinstance(o, int) or isinstance(o, float):
+            return set()
+        assert isinstance(o, tuple)
+        return set(i for i in o if isinstance(i, IndexPlaceholder))
+
+    def map_reduction(self, o):
+        return self.rec(o.expr)
-- 
GitLab