diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py
index e33cc1cb46915cc40ff766a0e38dce234601ed96..a0a7de0172f6310067ddd7a72160a595889fdcaa 100644
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -3,11 +3,11 @@
 Use this module to insert pymbolic nodes and the likes.
 """
 from dune.perftool.error import PerftoolError
+from dune.perftool.sumfact.symbolic import SumfactKernel
 from pymbolic.mapper.substitutor import make_subst_func
 
 import loopy as lp
 import pymbolic.primitives as prim
-from pytools import ImmutableRecord
 from six.moves import intern
 
 
@@ -16,246 +16,6 @@ from six.moves import intern
 #
 
 
-class SumfactKernel(ImmutableRecord, prim.Variable):
-    def __init__(self,
-                 a_matrices=None,
-                 buffer=None,
-                 stage=1,
-                 preferred_position=None,
-                 restriction=0,
-                 within_inames=(),
-                 input=None,
-                 padding=frozenset(),
-                 index=None,
-                 insn_dep=frozenset(),
-                 coeff_func=None,
-                 element=None,
-                 component=None,
-                 accumvar=None,
-                 ):
-        """Create a sum factorization kernel
-
-        Sum factorization can be written as
-
-        Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
-
-        with:
-        - X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
-        - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
-        - A_l: Values of 1D basis evaluations at quadrature points in l
-               direction, matrix of dimension m_l x n_l
-        - R_l: Transformation operator that permutes the underlying data
-               vector of the rank d tensor in such a way that the fastest
-               direction gets the slowest direction
-
-        In the l'th step we have the following setup:
-        - A_l: Matrix of dimensions m_l x n_l
-        - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
-        - R_l: Transformation operator
-
-        Looking at the indizes the following will happen:
-        X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
-        A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
-        R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
-
-        So the multiplication with A_l is a reduction over one index and
-        the transformation brings the next reduction index in the fastest
-        position.
-
-        It can make sense to permute the order of directions. If you have
-        a small m_l (e.g. stage 1 on faces) it is better to do direction l
-        first. This can be done by:
-
-        - Permuting the order of the A matrices.
-        - Permuting the input tensor.
-        - Permuting the output tensor (this assures that the directions of
-          the output tensor are again ordered from 0 to d-1).
-
-        Note, that you will typically *not* set all of the below arguments,
-        but only some. The vectorization strategy may set others for you.
-        The only argument really needed in all cases is a_matrices.
-
-        Arguments:
-        ----------
-        a_matrices: A tuple of AMatrix instances
-            The list of tensors to be applied to the input.
-            Order of application is from 0 up.
-        buffer: A string identifying the flip flop buffer in use
-            for intermediate results. The memory is expected to be
-            pre-initialized with the input or you have to provide
-            direct_input (FastDGGridOperator).
-        stage: 1 or 3
-        insn_dep: An instruction ID that the first issued instruction
-            should depend upon. All following ones will depend on each
-            other.
-        within_inames: Instructions will be executed within those
-            inames (needed for stage 3 in jacobians).
-        preferred_position: Will be used in the dry run to order kernels
-            when doing vectorization e.g. (dx u,dy u,dz u, u).
-        restriction: Restriction for faces values.
-        input: The name to use for the input temporary
-        padding: a set of slots in the input temporary to be padded
-        index: The slot in the input temporary dedicated to this kernel
-        coeff_func: The function to call to get the input coefficient
-        element: The UFL element
-        component: The treepath to the correct component of above element
-        accumvar: The accumulation variable to accumulate into
-    """
-        # Check the input and apply defaults where necessary
-        assert isinstance(a_matrices, tuple)
-        from dune.perftool.sumfact.amatrix import AMatrixBase
-        assert all(isinstance(m, AMatrixBase) for m in a_matrices)
-
-        assert stage in (1, 3)
-
-        if preferred_position is not None:
-            assert isinstance(preferred_position, int)
-
-        if stage == 3:
-            assert isinstance(restriction, tuple)
-
-        assert isinstance(within_inames, tuple)
-
-        assert isinstance(insn_dep, frozenset)
-
-        ImmutableRecord.__init__(self,
-                                 a_matrices=a_matrices,
-                                 buffer=buffer,
-                                 stage=stage,
-                                 preferred_position=preferred_position,
-                                 restriction=restriction,
-                                 within_inames=within_inames,
-                                 input=input,
-                                 padding=padding,
-                                 index=index,
-                                 insn_dep=insn_dep,
-                                 coeff_func=coeff_func,
-                                 element=element,
-                                 component=component,
-                                 accumvar=accumvar,
-                                 )
-
-        prim.Variable.__init__(self, "SUMFACT")
-
-    #
-    # The methods/fields needed to get a well-formed pymbolic node
-    #
-    def __getinitargs__(self):
-        return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction, self.within_inames, self.input, self.padding, self.index, self.insn_dep, self.coeff_func, self.element, self.component, self.accumvar)
-
-    def stringifier(self):
-        return lp.symbolic.StringifyMapper
-
-    init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction", "within_inames", "input", "padding", "index", "insn_dep", "coeff_func", "element", "component", "accumvar")
-
-    mapper_method = "map_sumfact_kernel"
-
-    #
-    # Some convenience methods to extract information about the sum factorization kernel
-    #
-
-    @property
-    def length(self):
-        return len(self.a_matrices)
-
-    @property
-    def vectorized(self):
-        return next(iter(self.a_matrices)).vectorized
-
-    @property
-    def transposed(self):
-        return next(iter(self.a_matrices)).transpose
-
-    @property
-    def cache_key(self):
-        """ The cache key that can be used in generation magic,
-        Any two sum factorization kernels having the same cache_key
-        are realized simulatneously!
-        """
-        return hash((self.a_matrices, self.restriction, self.stage, self.buffer))
-
-    @property
-    def vec_index(self):
-        """ A tuple with the vector index
-
-        Defaults to the empty tuple to allow addition to any
-        existing index tuple """
-        if self.index is not None:
-            return (self.index,)
-        else:
-            return ()
-
-    @property
-    def flat_input_shape(self):
-        """ The 'flat' input tensor shape """
-        from pytools import product
-        shape = (product(mat.cols for mat in self.a_matrices),)
-        if self.vectorized:
-            shape = shape + (4,)
-        return shape
-
-    @property
-    def quadrature_shape(self):
-        """ The shape of a temporary for the quadrature points
-
-        Takes into account the lower dimensionality of faces and vectorization.
-        """
-        if self.transposed:
-            shape = tuple(mat.cols for mat in self.a_matrices if mat.face is None)
-        else:
-            shape = tuple(mat.rows for mat in self.a_matrices if mat.face is None)
-        if self.vectorized:
-            shape = shape + (4,)
-        return shape
-
-    @property
-    def quadrature_dimtags(self):
-        """ The dim_tags of a temporary for the quadrature points
-
-        Takes into account the lower dimensionality of faces and vectorization.
-        """
-        tags = ["f"] * len(self.quadrature_shape)
-        if self.vectorized:
-            tags[-1] = 'c'
-        return ",".join(tags)
-
-    @property
-    def dof_shape(self):
-        """ The shape of a temporary for the degrees of freedom
-
-        Takes into account vectorization.
-        """
-        shape = tuple(mat.rows for mat in self.a_matrices)
-        if self.vectorized:
-            shape = shape + (4,)
-        return shape
-
-    @property
-    def dof_dimtags(self):
-        """ The dim_tags of a temporary for the degrees of freedom
-
-        Takes into account vectorization.
-        """
-        tags = ["f"] * len(self.dof_shape)
-        if self.vectorized:
-            tags[-1] = 'vec'
-        return ",".join(tags)
-
-    @property
-    def output_shape(self):
-        if self.stage == 1:
-            return self.quadrature_shape
-        else:
-            return self.dof_shape
-
-    @property
-    def output_dimtags(self):
-        if self.stage == 1:
-            return self.quadrature_dimtags
-        else:
-            return self.dof_dimtags
-
-
 class FusedMultiplyAdd(prim.Expression):
     """ Represents an FMA operation """
 
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index 2de5312691609ec50d3cac23b19b417a43dde838..d175f02585d886c4950cd25493f895b3a4ed6447 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -2,15 +2,9 @@
 Our extensions to the loopy type system
 """
 from dune.perftool.options import get_option
-from dune.perftool.generation import (function_mangler,
-                                      include_file,
-                                      )
-
-from loopy.symbolic import FunctionIdentifier
-from loopy.types import NumpyType
-from loopy import CallMangleInfo
-
+from dune.perftool.generation import function_mangler
 
+import loopy as lp
 import numpy as np
 
 
@@ -73,4 +67,4 @@ def vcl_function_mangler(knl, func, arg_dtypes):
 
     if func == "horizontal_add":
         vcl = lp.types.NumpyType(get_vcl_type(np.float64))
-        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,))
\ No newline at end of file
+        return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,))
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
index 0389416b5999e96b7e303434f07cab7e07a824b5..f7f108b311b2f17379e5e719730f75e46ff5c9f2 100644
--- a/python/dune/perftool/sumfact/__init__.py
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -12,6 +12,8 @@ from dune.perftool.sumfact.basis import (lfs_inames,
                                          pymbolic_coefficient,
                                          pymbolic_coefficient_gradient,
                                          )
+
+import dune.perftool.sumfact.accumulation
 import dune.perftool.sumfact.switch
 
 from dune.perftool.pdelab import PDELabInterface
diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index 93568b41b124f23c747d632ae8d150d8e06fa546..d908e5e2e545f85c6f7a415d6459834aee686403 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -1,4 +1,3 @@
-from dune.perftool.loopy.symbolic import substitute
 from dune.perftool.pdelab.argument import (name_accumulation_variable,
                                            PDELabAccumulationFunction,
                                            )
@@ -29,7 +28,7 @@ from dune.perftool.sumfact.amatrix import (basis_functions_per_direction,
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
                                           )
-from dune.perftool.loopy.symbolic import SumfactKernel
+from dune.perftool.sumfact.symbolic import SumfactKernel
 from dune.perftool.ufl.modified_terminals import extract_modified_arguments
 from dune.perftool.tools import get_pymbolic_basename
 from dune.perftool.error import PerftoolError
@@ -136,6 +135,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         replace_dict = {}
         for iname in additional_inames:
             replace_dict[prim.Variable(iname)] = i
+        from dune.perftool.loopy.symbolic import substitute
         expression = substitute(pymbolic_expr, replace_dict)
 
         # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1)
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 739f8a96d821365223f8e185e91c1589185ace90..3bfa80ebfbd73c0721179a7e2f8fe8c7c3c74e48 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -29,7 +29,7 @@ from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
 from dune.perftool.loopy.buffer import initialize_buffer
-from dune.perftool.loopy.symbolic import SumfactKernel
+from dune.perftool.sumfact.symbolic import SumfactKernel
 from dune.perftool.options import get_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py
index b15bbec27c340250660698e8420e93f0b4bd5857..129aa049eabbed04688709238e0875e767b197ea 100644
--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -117,7 +117,7 @@ def _realize_sum_factorization_kernel(sf):
     perm = sumfact_permutation_strategy(sf)
 
     # Permute a_matrices
-    a_matrices = _permute_forward(sf.a_matrices, perm)
+    a_matrices = permute_forward(sf.a_matrices, perm)
 
     # Product of all matrices
     for l, a_matrix in enumerate(a_matrices):
@@ -150,8 +150,8 @@ def _realize_sum_factorization_kernel(sf):
         input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
         if l == 0 and direct_input is not None:
             # See comment below
-            input_inames = _permute_backward(input_inames, perm)
-            inp_shape = _permute_backward(inp_shape, perm)
+            input_inames = permute_backward(input_inames, perm)
+            inp_shape = permute_backward(inp_shape, perm)
 
             globalarg(direct_input, dtype=np.float64, shape=inp_shape, dim_tags=novec_ftags)
             if a_matrix.vectorized:
@@ -167,8 +167,8 @@ def _realize_sum_factorization_kernel(sf):
             # order of our input is from 0 to d-1. This means we need
             # to permute _back_ to get the right coefficients.
             if l == 0:
-                inp_shape = _permute_backward(inp_shape, perm)
-                input_inames = _permute_backward(input_inames, perm)
+                inp_shape = permute_backward(inp_shape, perm)
+                input_inames = permute_backward(input_inames, perm)
 
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the amatrix loop
@@ -196,7 +196,7 @@ def _realize_sum_factorization_kernel(sf):
         # If we are in the last step we reverse the permutation.
         output_shape = tuple(out_shape[1:]) + (out_shape[0],)
         if l == len(a_matrices) - 1:
-            output_shape = _permute_backward(output_shape, perm)
+            output_shape = permute_backward(output_shape, perm)
         out = get_buffer_temporary(sf.buffer,
                                    shape=output_shape + vec_shape,
                                    dim_tags=ftags)
@@ -214,7 +214,7 @@ def _realize_sum_factorization_kernel(sf):
         # end and reverse permutation
         output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
         if l == len(a_matrices) - 1:
-            output_inames = _permute_backward(output_inames, perm)
+            output_inames = permute_backward(output_inames, perm)
 
         # In case of direct output we directly accumulate the result
         # of the Sumfactorization into some global data structure.
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
new file mode 100644
index 0000000000000000000000000000000000000000..8dbeb5278a43b47952b4f261a472b8feaea6344e
--- /dev/null
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -0,0 +1,245 @@
+""" A pymbolic node representing a sum factorization kernel """
+
+from pytools import ImmutableRecord
+
+import pymbolic.primitives as prim
+ 
+
+class SumfactKernel(ImmutableRecord, prim.Variable):
+    def __init__(self,
+                 a_matrices=None,
+                 buffer=None,
+                 stage=1,
+                 preferred_position=None,
+                 restriction=0,
+                 within_inames=(),
+                 input=None,
+                 padding=frozenset(),
+                 index=None,
+                 insn_dep=frozenset(),
+                 coeff_func=None,
+                 element=None,
+                 component=None,
+                 accumvar=None,
+                 ):
+        """Create a sum factorization kernel
+
+        Sum factorization can be written as
+
+        Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
+
+        with:
+        - X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
+        - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
+        - A_l: Values of 1D basis evaluations at quadrature points in l
+               direction, matrix of dimension m_l x n_l
+        - R_l: Transformation operator that permutes the underlying data
+               vector of the rank d tensor in such a way that the fastest
+               direction gets the slowest direction
+
+        In the l'th step we have the following setup:
+        - A_l: Matrix of dimensions m_l x n_l
+        - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
+        - R_l: Transformation operator
+
+        Looking at the indizes the following will happen:
+        X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
+        A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+        R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
+
+        So the multiplication with A_l is a reduction over one index and
+        the transformation brings the next reduction index in the fastest
+        position.
+
+        It can make sense to permute the order of directions. If you have
+        a small m_l (e.g. stage 1 on faces) it is better to do direction l
+        first. This can be done by:
+
+        - Permuting the order of the A matrices.
+        - Permuting the input tensor.
+        - Permuting the output tensor (this assures that the directions of
+          the output tensor are again ordered from 0 to d-1).
+
+        Note, that you will typically *not* set all of the below arguments,
+        but only some. The vectorization strategy may set others for you.
+        The only argument really needed in all cases is a_matrices.
+
+        Arguments:
+        ----------
+        a_matrices: A tuple of AMatrix instances
+            The list of tensors to be applied to the input.
+            Order of application is from 0 up.
+        buffer: A string identifying the flip flop buffer in use
+            for intermediate results. The memory is expected to be
+            pre-initialized with the input or you have to provide
+            direct_input (FastDGGridOperator).
+        stage: 1 or 3
+        insn_dep: An instruction ID that the first issued instruction
+            should depend upon. All following ones will depend on each
+            other.
+        within_inames: Instructions will be executed within those
+            inames (needed for stage 3 in jacobians).
+        preferred_position: Will be used in the dry run to order kernels
+            when doing vectorization e.g. (dx u,dy u,dz u, u).
+        restriction: Restriction for faces values.
+        input: The name to use for the input temporary
+        padding: a set of slots in the input temporary to be padded
+        index: The slot in the input temporary dedicated to this kernel
+        coeff_func: The function to call to get the input coefficient
+        element: The UFL element
+        component: The treepath to the correct component of above element
+        accumvar: The accumulation variable to accumulate into
+    """
+        # Check the input and apply defaults where necessary
+        assert isinstance(a_matrices, tuple)
+        from dune.perftool.sumfact.amatrix import AMatrixBase
+        assert all(isinstance(m, AMatrixBase) for m in a_matrices)
+
+        assert stage in (1, 3)
+
+        if preferred_position is not None:
+            assert isinstance(preferred_position, int)
+
+        if stage == 3:
+            assert isinstance(restriction, tuple)
+
+        assert isinstance(within_inames, tuple)
+
+        assert isinstance(insn_dep, frozenset)
+
+        ImmutableRecord.__init__(self,
+                                 a_matrices=a_matrices,
+                                 buffer=buffer,
+                                 stage=stage,
+                                 preferred_position=preferred_position,
+                                 restriction=restriction,
+                                 within_inames=within_inames,
+                                 input=input,
+                                 padding=padding,
+                                 index=index,
+                                 insn_dep=insn_dep,
+                                 coeff_func=coeff_func,
+                                 element=element,
+                                 component=component,
+                                 accumvar=accumvar,
+                                 )
+
+        prim.Variable.__init__(self, "SUMFACT")
+
+    #
+    # The methods/fields needed to get a well-formed pymbolic node
+    #
+    def __getinitargs__(self):
+        return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction, self.within_inames, self.input, self.padding, self.index, self.insn_dep, self.coeff_func, self.element, self.component, self.accumvar)
+
+    def stringifier(self):
+        return lp.symbolic.StringifyMapper
+
+    init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction", "within_inames", "input", "padding", "index", "insn_dep", "coeff_func", "element", "component", "accumvar")
+
+    mapper_method = "map_sumfact_kernel"
+
+    #
+    # Some convenience methods to extract information about the sum factorization kernel
+    #
+
+    @property
+    def length(self):
+        return len(self.a_matrices)
+
+    @property
+    def vectorized(self):
+        return next(iter(self.a_matrices)).vectorized
+
+    @property
+    def transposed(self):
+        return next(iter(self.a_matrices)).transpose
+
+    @property
+    def cache_key(self):
+        """ The cache key that can be used in generation magic,
+        Any two sum factorization kernels having the same cache_key
+        are realized simulatneously!
+        """
+        return hash((self.a_matrices, self.restriction, self.stage, self.buffer))
+
+    @property
+    def vec_index(self):
+        """ A tuple with the vector index
+
+        Defaults to the empty tuple to allow addition to any
+        existing index tuple """
+        if self.index is not None:
+            return (self.index,)
+        else:
+            return ()
+
+    @property
+    def flat_input_shape(self):
+        """ The 'flat' input tensor shape """
+        from pytools import product
+        shape = (product(mat.cols for mat in self.a_matrices),)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def quadrature_shape(self):
+        """ The shape of a temporary for the quadrature points
+
+        Takes into account the lower dimensionality of faces and vectorization.
+        """
+        if self.transposed:
+            shape = tuple(mat.cols for mat in self.a_matrices if mat.face is None)
+        else:
+            shape = tuple(mat.rows for mat in self.a_matrices if mat.face is None)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def quadrature_dimtags(self):
+        """ The dim_tags of a temporary for the quadrature points
+
+        Takes into account the lower dimensionality of faces and vectorization.
+        """
+        tags = ["f"] * len(self.quadrature_shape)
+        if self.vectorized:
+            tags[-1] = 'c'
+        return ",".join(tags)
+
+    @property
+    def dof_shape(self):
+        """ The shape of a temporary for the degrees of freedom
+
+        Takes into account vectorization.
+        """
+        shape = tuple(mat.rows for mat in self.a_matrices)
+        if self.vectorized:
+            shape = shape + (4,)
+        return shape
+
+    @property
+    def dof_dimtags(self):
+        """ The dim_tags of a temporary for the degrees of freedom
+
+        Takes into account vectorization.
+        """
+        tags = ["f"] * len(self.dof_shape)
+        if self.vectorized:
+            tags[-1] = 'vec'
+        return ",".join(tags)
+
+    @property
+    def output_shape(self):
+        if self.stage == 1:
+            return self.quadrature_shape
+        else:
+            return self.dof_shape
+
+    @property
+    def output_dimtags(self):
+        if self.stage == 1:
+            return self.quadrature_dimtags
+        else:
+            return self.dof_dimtags
\ No newline at end of file