First draft of sumfact kernels in separate functions

91e6c5c0 · Dominic Kempf · 13c37b3c · 13c37b3c · 91e6c5c0 · 91e6c5c0
Commit 91e6c5c0 authored 7 years ago by Dominic Kempf
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
-from dune.perftool.error import PerftoolLoopyError
-from dune.perftool.generation import (get_counted_variable,
-                                      kernel_cached,
-                                      temporary_variable,
-                                      )
-
-
-class FlipFlopBuffer(object):
-    def __init__(self, identifier):
-        self.identifier = identifier
-
-        # Initialize the counter that switches between the base storages!
-        self._current = 0
-
-        # Generate the base storage names
-        self.base_storage = tuple("{}_base_{}".format(self.identifier, i) for i in (0, 1))
-
-    def switch_base_storage(self):
-        self._current = (self._current + 1) % 2
-
-    def get_temporary(self, **kwargs):
-        assert("base_storage" not in kwargs)
-        assert("storage_shape" not in kwargs)
-
-        # Select the base storage and increase counter
-        base = self.base_storage[self._current]
-
-        # Construct a temporary name
-        name = kwargs.pop("name", None)
-        if name is None:
-            name = get_counted_variable(self.identifier)
-
-        # Construct the temporary and return it
-        temporary_variable(name,
-                           base_storage=base,
-                           managed=True,
-                           _base_storage_access_may_be_aliasing=True,
-                           **kwargs
-                           )
-
-        return name
-
-
-@kernel_cached
-def initialize_buffer(identifier):
-    assert isinstance(identifier, str)
-    return FlipFlopBuffer(identifier)
-
-
-def get_buffer_temporary(identifier, **kwargs):
-    return initialize_buffer(identifier).get_temporary(**kwargs)
-
-
-def switch_base_storage(identifier):
-    initialize_buffer(identifier).switch_base_storage()
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -473,7 +473,11 @@ def generate_kernel(integrals):
    delete_cache_items("kernel_default")
    for integral in integrals:
        visit_integral(integral)
-    knl = extract_kernel_from_cache("kernel_default")
+
+    from dune.perftool.pdelab.signatures import kernel_name, assembly_routine_signature
+    name = kernel_name()
+    signature = assembly_routine_signature()
+    knl = extract_kernel_from_cache("kernel_default", name, signature)
    delete_cache_items("kernel_default")

    # Reset the quadrature degree
@@ -491,7 +495,7 @@ def generate_kernels_per_integral(integrals):
    yield generate_kernel(integrals)


-def extract_kernel_from_cache(tag, wrap_in_cgen=True):
+def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True):
    # Now extract regular loopy kernel components
    from dune.perftool.loopy.target import DuneTarget
    domains = [i for i in retrieve_cache_items("{} and domain".format(tag))]
@@ -512,13 +516,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
                  check_dep_resolution=False,
                  )

-    # Find a name for the kernel
-    if wrap_in_cgen:
-        from dune.perftool.pdelab.signatures import kernel_name
-        name = kernel_name()
-    else:
-        name = "constructor_kernel"
-
    # Create the kernel
    from loopy import make_kernel, preprocess_kernel
    kernel = make_kernel(domains,
@@ -570,8 +567,9 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):

    if wrap_in_cgen:
        # Wrap the kernel in something which can generate code
-        from dune.perftool.pdelab.signatures import assembly_routine_signature
-        signature = assembly_routine_signature()
+        if signature is None:
+            from dune.perftool.pdelab.signatures import assembly_routine_signature
+            signature = assembly_routine_signature()
        kernel = LoopyKernelMethod(signature, kernel)

    return kernel
@@ -673,7 +671,7 @@ def cgen_class_from_cache(tag, members=[]):
    tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))]

    # Construct the constructor
-    constructor_knl = extract_kernel_from_cache(tag, wrap_in_cgen=False)
+    constructor_knl = extract_kernel_from_cache(tag, "constructor_kernel", None, wrap_in_cgen=False)
    from dune.perftool.loopy.target import DuneTarget
    constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
    signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params))
@@ -1035,6 +1033,13 @@ def generate_localoperator_file(kernels, filename):
    for k in kernels.values():
        operator_methods.extend(k)

+    # Generate all the realizations of sum factorization kernel objects needed in this operator
+    from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
+    for sf, qp in retrieve_cache_items("kernelimpl"):
+        from dune.perftool.sumfact.tabulation import set_quadrature_points
+        set_quadrature_points(qp)
+        operator_methods.append(realize_sumfact_kernel_function(sf))
+
    if get_option('instrumentation_level') >= 3:
        include_file('dune/perftool/common/timer.hh', filetag='operatorfile')
        operator_methods.append(TimerMethod())

--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -23,7 +23,6 @@ from dune.perftool.options import (get_form_option,
                                   get_option,
                                   )
 from dune.perftool.loopy.flatten import flatten_index
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.sumfact.quadrature import nest_quadrature_loops
 from dune.perftool.pdelab.localoperator import determine_accumulation_space
 from dune.perftool.pdelab.restriction import restricted_name
@@ -427,11 +426,15 @@ def generate_accumulation_instruction(expr, visitor):

    vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset()

-    temp = get_buffer_temporary(buffer,
-                                shape=vsf.quadrature_shape,
-                                dim_tags=vsf.quadrature_dimtags,
-                                name="input_{}".format(buffer),
-                                )
+    from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype
+    storage = name_buffer_storage(buffer, 0)
+    temp = "input_{}".format(buffer)
+    temporary_variable(temp,
+                       shape=vsf.quadrature_shape,
+                       dim_tags=vsf.quadrature_dimtags,
+                       decl_method=buffer_decl(storage, get_sumfact_dtype(sf)),
+                       managed=True,
+                       )

    # Those input fields, that are padded need to be set to zero
    # in order to do a horizontal_add later on

--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -32,7 +32,6 @@ from dune.perftool.pdelab.argument import name_coefficientcontainer
 from dune.perftool.pdelab.geometry import (local_dimension,
                                           world_dimension,
                                           )
-from dune.perftool.loopy.buffer import initialize_buffer, get_buffer_temporary
 from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase
 from dune.perftool.options import get_form_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
@@ -83,10 +82,14 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
        coeff = pc(container, lfs, basisiname)

        # Get the input temporary!
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype
+        storage = name_buffer_storage(sf.buffer, 0)
+        name = "input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(product(mat.basis_size for mat in sf.matrix_sequence), sf.vector_width),
+                           decl_method=buffer_decl(storage, get_sumfact_dtype(sf)),
+                           managed=True,
+                           )

        assignee = prim.Subscript(prim.Variable(name),
                                  (prim.Variable(basisiname),) + (index,))

--- a/python/dune/perftool/sumfact/geometry.py
+++ b/python/dune/perftool/sumfact/geometry.py
@@ -12,7 +12,6 @@ from dune.perftool.generation import (backend,
                                      temporary_variable,
                                      globalarg,
                                      )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.pdelab.geometry import (local_dimension,
                                           world_dimension,
                                           name_geometry,
@@ -41,10 +40,14 @@ class GeoCornersInput(SumfactKernelInputBase, ImmutableRecord):
        ImmutableRecord.__init__(self, dir=dir)

    def realize(self, sf, index, insn_dep):
-        name = get_buffer_temporary(sf.buffer,
-                                    shape=(2 ** local_dimension(), sf.vector_width),
-                                    name="input_{}".format(sf.buffer)
-                                    )
+        from dune.perftool.sumfact.realization import name_buffer_storage, buffer_decl, get_sumfact_dtype
+        storage = name_buffer_storage(sf.buffer, 0)
+        name = name="input_{}".format(sf.buffer)
+        temporary_variable(name,
+                           shape=(2 ** local_dimension(), sf.vector_width),
+                           decl_method=buffer_decl(storage, get_sumfact_dtype(sf)),
+                           managed=True,
+                           )

        ciname = corner_iname()
        geo = name_geometry()

--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -3,20 +3,19 @@ The code that triggers the creation of the necessary code constructs
 to realize a sum factorization kernel
 """
 from dune.perftool.generation import (barrier,
+                                      delete_cache_items,
                                      dump_accumulate_timer,
                                      generator_factory,
                                      get_global_context_value,
                                      globalarg,
                                      instruction,
+                                      kernel_cached,
                                      post_include,
                                      preamble,
                                      silenced_warning,
                                      temporary_variable,
                                      transform,
                                      )
-from dune.perftool.loopy.buffer import (get_buffer_temporary,
-                                        switch_base_storage,
-                                        )
 from dune.perftool.pdelab.argument import pymbolic_coefficient
 from dune.perftool.pdelab.basis import shape_as_pymbolic
 from dune.perftool.pdelab.geometry import world_dimension
@@ -40,6 +39,22 @@ import numpy as np
 import pymbolic.primitives as prim


+# Generator (used as a plain callable, no_deco=True) that stores its argument
+# in the cache under the "kernelimpl" tag. It is called below with
+# (sum factorization kernel, quadrature points) tuples.
+necessary_kernel_implementations = generator_factory(item_tags=("kernelimpl",), no_deco=True)
+
+
+@generator_factory(cache_key_generator=lambda s, qp: (s.function_key, qp))
+def _name_kernel_implementation_function(sf, qp):
+    """Return the name of the function that implements the kernel sf.
+
+    The name is "sfimpl_" followed by the string representations of the
+    entries of sf.matrix_sequence, joined by underscores. As a side effect
+    the tuple (sf, qp) is registered via necessary_kernel_implementations.
+
+    The cache key is (sf.function_key, qp).
+    NOTE(review): qp is part of the cache key but not of the returned name;
+    presumably str(m) already distinguishes quadrature setups - confirm.
+    """
+    name = "sfimpl_{}".format("_".join(str(m) for m in sf.matrix_sequence))
+    necessary_kernel_implementations((sf, qp))
+    return name
+
+
+def name_kernel_implementation_function(sf):
+    """Name the implementation function of sf for the current quadrature points.
+
+    Looks up quadrature_points_per_direction() and forwards it together
+    with sf to _name_kernel_implementation_function.
+    """
+    from dune.perftool.sumfact.quadrature import quadrature_points_per_direction
+    return _name_kernel_implementation_function(sf, quadrature_points_per_direction())
+
+
 def realize_sum_factorization_kernel(sf, **kwargs):
    if get_global_context_value("dry_run", False):
        return sf, sf.insn_dep
@@ -52,32 +67,96 @@ def alias_data_array(name, data):
    return "auto {} = {}.data();".format(name, data)


-@generator_factory(item_tags=("sumfactkernel",),
-                   context_tags=("kernel",),
-                   cache_key_generator=lambda s, **kw: s.cache_key)
+@preamble
+def declare_buffer_storage(name, size, alignment):
+    """Return the C declaration of an aligned char array called name.
+
+    The array has size * alignment elements and carries an
+    aligned(alignment) attribute.
+    """
+    num_chars = size * alignment
+    return "char {}[{}] __attribute__ ((aligned({})));".format(name, num_chars, alignment)
+
+
+def name_buffer_storage(buff, which):
+    """Return the storage name "<buff>_<which>" for buffer buff."""
+    return "{}_{}".format(buff, which)
+
+
+@kernel_cached
 def _realize_sum_factorization_kernel(sf):
    insn_dep = sf.insn_dep

-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        if sf.stage == 1:
-            setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
-            insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
-                                                             within_inames=frozenset(sf.within_inames),
-                                                             depends_on=insn_dep)}))
-
-        timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
-        post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-        dump_accumulate_timer(timer_name)
-        insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
-                                                         within_inames=frozenset(sf.within_inames),
-                                                         depends_on=insn_dep,
-                                                         ),
-                                             }))
+    # Get all the necessary pieces for a function call
+    funcname = name_kernel_implementation_function(sf)
+    # TODO: calculate the size and alignment correctly
+    size = 10000
+    alignment = 8
+    buffers = tuple(name_buffer_storage(sf.buffer, i) for i in range(2))
+
+    # Make sure that the storage is allocated
+    for buf in buffers:
+        declare_buffer_storage(buf, size, alignment)

+    # Realize the input if it is not direct
    if not sf.input.direct_input_is_possible:
        insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep))

+    # Call the function
+    code = "{}({}, {});".format(funcname, *buffers)
+    insn_dep = frozenset({instruction(code=code,
+                                      depends_on=insn_dep,
+                                      within_inames=frozenset(sf.within_inames))
+                          })
+
+    # Interpret the output as a temporary of correct shape
+    out = "{}_output".format(sf.buffer)
+    temporary_variable(out,
+                       shape=sf.output_shape,
+                       dim_tags=sf.output_dimtags,
+                       decl_method=buffer_decl(buffers[sf.length % 2], get_sumfact_dtype(sf)),
+                       managed=True,
+                       )
+    silenced_warning("read_no_write({})".format(out))
+
+    return lp.TaggedVariable(out, sf.tag), insn_dep
+
+
+def buffer_decl(buffer, dtype):
+    """Build a declaration callback that views buffer as a typed pointer.
+
+    The returned callable takes the name of the variable to declare (any
+    further positional arguments are ignored) and returns the C statement
+    "T *name = (T *)buffer;", where T is numpy_to_cpp_dtype(dtype).
+    """
+    def _buffer_decl(name, *a):
+        # Imported lazily, only when a declaration is actually produced
+        from dune.perftool.loopy.target import numpy_to_cpp_dtype
+        return "{0} *{1} = ({0} *){2};".format(numpy_to_cpp_dtype(dtype), name, buffer)
+
+    return _buffer_decl
+
+
+def get_sumfact_dtype(sf):
+    """Return the name of the numpy dtype used for the buffers of sf.
+
+    For non-vectorized kernels this is the name of the dtype given by
+    dtype_floatingpoint().
+    """
+    if sf.vectorized:
+        # NOTE(review): placeholder - this branch falls through, so the
+        # function returns None for vectorized kernels. Callers hand the
+        # result to buffer_decl; confirm the intended vector dtype.
+        pass
+    else:
+        from dune.perftool.loopy.target import dtype_floatingpoint
+        from loopy.types import NumpyType
+        return NumpyType(dtype_floatingpoint()).dtype.name
+
+
+class BufferSwitcher(object):
+    """Hands out temporaries declared on top of one of two named buffers.
+
+    The buffer in use is selected by self.current and toggled by switch().
+    """
+    def __init__(self, buffers=("buffer0", "buffer1")):
+        self.current = 0
+        self.buffers = buffers
+
+    def get_temporary(self, name=None, **kwargs):
+        """Register a managed temporary called name and return name.
+
+        The temporary is declared through buffer_decl on the currently
+        selected buffer. kwargs must contain "dtype"; all kwargs are
+        forwarded to temporary_variable.
+        """
+        active_buffer = self.buffers[self.current]
+        declaration = buffer_decl(active_buffer, kwargs["dtype"])
+        temporary_variable(name,
+                           managed=True,
+                           decl_method=declaration,
+                           **kwargs
+                           )
+
+        return name
+
+    def switch(self):
+        """Select the other buffer for subsequent temporaries."""
+        self.current += 1
+        self.current %= 2
+
+
+def realize_sumfact_kernel_function(sf):
+    # Get a buffer switcher instance
+    buffer = BufferSwitcher()
+    insn_dep = frozenset()
+
    # Prepare some dim_tags/shapes for later use
    ftags = ",".join(["f"] * sf.length)
    novec_ftags = ftags
@@ -147,9 +226,11 @@ def _realize_sum_factorization_kernel(sf):
            # Get a temporary that interprets the base storage of the input
            # as a column-major matrix. In later iteration of the matrix loop
            # this reinterprets the output of the previous iteration.
-            inp = get_buffer_temporary(sf.buffer,
+            inp = buffer.get_temporary("buff_step{}_in".format(l),
                                       shape=inp_shape + vec_shape,
-                                       dim_tags=ftags)
+                                       dim_tags=ftags,
+                                       dtype=get_sumfact_dtype(sf),
+                                       )

            # The input temporary will only be read from, so we need to silence the loopy warning
            silenced_warning('read_no_write({})'.format(inp))
@@ -157,7 +238,7 @@ def _realize_sum_factorization_kernel(sf):
            input_summand = prim.Subscript(prim.Variable(inp),
                                           input_inames + vec_iname)

-        switch_base_storage(sf.buffer)
+        buffer.switch()

        # Get a temporary that interprets the base storage of the output.
        #
@@ -171,9 +252,11 @@ def _realize_sum_factorization_kernel(sf):
        output_shape = tuple(out_shape[1:]) + (out_shape[0],)
        if l == len(matrix_sequence) - 1:
            output_shape = permute_backward(output_shape, perm)
-        out = get_buffer_temporary(sf.buffer,
+        out = buffer.get_temporary("buff_step{}_out".format(l),
                                   shape=output_shape + vec_shape,
-                                   dim_tags=ftags)
+                                   dim_tags=ftags,
+                                   dtype=get_sumfact_dtype(sf),
+                                   )

        # Write the matrix-matrix multiplication expression
        matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname),
@@ -217,22 +300,196 @@ def _realize_sum_factorization_kernel(sf):
                                              )
                                  })

-    # Measure times and count operations in c++ code
-    if get_option("instrumentation_level") >= 4:
-        stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                           depends_on=frozenset({lp.match.Tagged(tag)}),
-                                           within_inames=frozenset(sf.within_inames))})
-        if sf.stage == 1:
-            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
-            post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
-            dump_accumulate_timer(timer_name)
-            frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name),
-                                   depends_on=stop_insn)})
-
-    out = get_buffer_temporary(sf.buffer,
-                               shape=sf.output_shape,
-                               dim_tags=sf.output_dimtags,
-                               )
-    silenced_warning('read_no_write({})'.format(out))
-
-    return lp.TaggedVariable(out, sf.tag), insn_dep
+    # Construct a loopy kernel object
+    name = name_kernel_implementation_function(sf)
+    from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
+    signature = "void {}(const char* buffer0, const char* buffer1) const".format(name)
+    kernel = extract_kernel_from_cache("kernel_default", name, [signature])
+    delete_cache_items("kernel_default")
+    return kernel
+
+
+# @generator_factory(item_tags=("sumfactkernel",),
+#                    context_tags=("kernel",),
+#                    cache_key_generator=lambda s, **kw: s.cache_key)
+# def old_realize_sum_factorization_kernel(sf):
+#     insn_dep = sf.insn_dep
+# 
+#     # Measure times and count operations in c++ code
+#     if get_option("instrumentation_level") >= 4:
+#         if sf.stage == 1:
+#             setuptimer = '{}_kernel_setup'.format(assembler_routine_name())
+#             insn_dep = insn_dep.union(frozenset({instruction(code='HP_TIMER_STOP({});'.format(setuptimer),
+#                                                              within_inames=frozenset(sf.within_inames),
+#                                                              depends_on=insn_dep)}))
+#
+#         timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(sf.stage)
+#         post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+#         dump_accumulate_timer(timer_name)
+#         insn_dep = insn_dep.union(frozenset({instruction(code="HP_TIMER_START({});".format(timer_name),
+#                                                          within_inames=frozenset(sf.within_inames),
+#                                                          depends_on=insn_dep,
+#                                                          ),
+#                                              }))
+#
+#     if not sf.input.direct_input_is_possible:
+#         insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep))
+#
+#     # Prepare some dim_tags/shapes for later use
+#     ftags = ",".join(["f"] * sf.length)
+#     novec_ftags = ftags
+#     ctags = ",".join(["c"] * sf.length)
+#     vec_shape = ()
+#     if sf.vectorized:
+#         ftags = ftags + ",vec"
+#         ctags = ctags + ",vec"
+#         vec_shape = (sf.vector_width,)
+#
+#     # Decide in which order we want to process directions in the
+#     # sumfactorization. A clever ordering can lead to a reduced
+#     # complexity. This will e.g. happen at faces where we only have
+#     # one quadrature point m_l=1 if l is the normal direction of the
+#     # face.
+#     #
+#     # Rule of thumb: small m's early and large n's late.
+#     perm = sumfact_permutation_strategy(sf)
+#
+#     # Permute matrix sequence
+#     matrix_sequence = permute_forward(sf.matrix_sequence, perm)
+#
+#     # Product of all matrices
+#     for l, matrix in enumerate(matrix_sequence):
+#         # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
+#         # and get inames that realize the product.
+#         inp_shape = (matrix.cols,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l])
+#         out_shape = (matrix.rows,) + tuple(mat.cols for mat in matrix_sequence[l + 1:]) + tuple(mat.rows for mat in matrix_sequence[:l])
+#         out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
+#         vec_iname = ()
+#         if matrix.vectorized:
+#             iname = sumfact_iname(sf.vector_width, "vec")
+#             vec_iname = (prim.Variable(iname),)
+#             transform(lp.tag_inames, [(iname, "vec")])
+#
+#         # A trivial reduction is implemented as a product, otherwise we run into
+#         # a code generation corner case producing way too complicated code. This
+#         # could be fixed upstream, but the loopy code realizing reductions is not
+#         # trivial and the priority is kind of low.
+#         if matrix.cols != 1:
+#             k = sumfact_iname(matrix.cols, "red")
+#             k_expr = prim.Variable(k)
+#         else:
+#             k_expr = 0
+#
+#         # Setup the input of the sum factorization kernel. In the
+#         # first matrix multiplication this can be taken from
+#         # * an input temporary (default)
+#         # * a global data structure (if FastDGGridOperator is in use)
+#         # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
+#         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
+#         if l == 0 and sf.input.direct_input_is_possible:
+#             # See comment below
+#             input_inames = permute_backward(input_inames, perm)
+#             inp_shape = permute_backward(inp_shape, perm)
+#
+#             input_summand = sf.input.realize_direct(inp_shape, input_inames)
+#         else:
+#             # If we did permute the order of the matrices above we also
+#             # permuted the order of out_inames. Unfortunately the
+#             # order of our input is from 0 to d-1. This means we need
+#             # to permute _back_ to get the right coefficients.
+#             if l == 0:
+#                 inp_shape = permute_backward(inp_shape, perm)
+#                 input_inames = permute_backward(input_inames, perm)
+#
+#             # Get a temporary that interprets the base storage of the input
+#             # as a column-major matrix. In later iteration of the matrix loop
+#             # this reinterprets the output of the previous iteration.
+#             inp = get_buffer_temporary(sf.buffer,
+#                                        shape=inp_shape + vec_shape,
+#                                        dim_tags=ftags)
+#
+#             # The input temporary will only be read from, so we need to silence the loopy warning
+#             silenced_warning('read_no_write({})'.format(inp))
+#
+#             input_summand = prim.Subscript(prim.Variable(inp),
+#                                            input_inames + vec_iname)
+#
+#         switch_base_storage(sf.buffer)
+#
+#         # Get a temporary that interprets the base storage of the output.
+#         #
+#         # Note: In this step the reordering of the fastest directions
+#         # is happening. The new direction (out_inames[0]) and the
+#         # corresponding shape (out_shape[0]) goes to the end (slowest
+#         # direction) and everything stays column major (ftags->fortran
+#         # style).
+#         #
+#         # If we are in the last step we reverse the permutation.
+#         output_shape = tuple(out_shape[1:]) + (out_shape[0],)
+#         if l == len(matrix_sequence) - 1:
+#             output_shape = permute_backward(output_shape, perm)
+#         out = get_buffer_temporary(sf.buffer,
+#                                    shape=output_shape + vec_shape,
+#                                    dim_tags=ftags)
+#
+#         # Write the matrix-matrix multiplication expression
+#         matprod = prim.Product((matrix.pymbolic((prim.Variable(out_inames[0]), k_expr) + vec_iname),
+#                                 input_summand))
+#
+#         # ... which may be a reduction, if k>0
+#         if matrix.cols != 1:
+#             matprod = lp.Reduction("sum", k, matprod)
+#
+#         # Here we also move the new direction (out_inames[0]) to the
+#         # end and reverse permutation
+#         output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
+#         if l == len(matrix_sequence) - 1:
+#             output_inames = permute_backward(output_inames, perm)
+#
+#         tag = "sumfact_stage{}".format(sf.stage)
+#         if sf.stage == 3:
+#             tag = "{}_{}".format(tag, "_".join(sf.within_inames))
+#
+#         # Collect the key word arguments for the loopy instruction
+#         insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)),
+#                      "forced_iname_deps_is_final": True,
+#                      "depends_on": insn_dep,
+#                      "tags": frozenset({tag}),
+#                      "predicates": sf.predicates,
+#                      "groups": frozenset({sf.group_name}),
+#                      }
+#
+#         # In case of direct output we directly accumulate the result
+#         # of the Sumfactorization into some global data structure.
+#         if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3:
+#             if sf.vectorized:
+#                 insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name}))
+#             insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args)
+#         else:
+#             # Issue the reduction instruction that implements the multiplication
+#             # at the same time store the instruction ID for the next instruction to depend on
+#             insn_dep = frozenset({instruction(assignee=prim.Subscript(prim.Variable(out), output_inames + vec_iname),
+#                                               expression=matprod,
+#                                               **insn_args
+#                                               )
+#                                   })
+#
+#     # Measure times and count operations in c++ code
+#     if get_option("instrumentation_level") >= 4:
+#         stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
+#                                            depends_on=frozenset({lp.match.Tagged(tag)}),
+#                                            within_inames=frozenset(sf.within_inames))})
+#         if sf.stage == 1:
+#             qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
+#             post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
+#             dump_accumulate_timer(timer_name)
+#             frozenset({instruction(code="HP_TIMER_START({});".format(qp_timer_name),
+#                                    depends_on=stop_insn)})
+#
+#     out = get_buffer_temporary(sf.buffer,
+#                                shape=sf.output_shape,
+#                                dim_tags=sf.output_dimtags,
+#                                )
+#     silenced_warning('read_no_write({})'.format(out))
+#
+#     return lp.TaggedVariable(out, sf.tag), insn_dep
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -278,6 +278,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable):
    # Some cache key definitions
    # Watch out for the documentation to see which key is used unter what circumstances
    #
+    @property
+    def function_key(self):
+        """ Kernels sharing this key may use the same kernel implementation function
+
+        The key is the matrix_sequence of this kernel.
+        """
+        return self.matrix_sequence

    @property
    def parallel_key(self):

--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -18,7 +18,6 @@ from dune.perftool.generation import (class_member,
                                      transform,
                                      valuearg
                                      )
-from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.loopy.target import dtype_floatingpoint
 from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.perftool.pdelab.localoperator import (name_domain_field,