diff --git a/python/dune/perftool/pdelab/quadrature.py b/python/dune/perftool/pdelab/quadrature.py
index 91a25b3318a047ccfb85a2be19dd5311d80cad1f..637d391c845bcd9c3af32e35cb8c6b47b46b7c4f 100644
--- a/python/dune/perftool/pdelab/quadrature.py
+++ b/python/dune/perftool/pdelab/quadrature.py
@@ -70,7 +70,7 @@ def name_order():
     # TODO how to determine integration order in a generic fashion here?
     # Or maybe just read it from the ini file? Or define some mapping from the
     # finite elements?
-    return "2"
+    return "4"
 
 
 def quadrature_loop_statement():
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index f1b8a6209591b08b543b3a906dec037bbecf1da4..637c77de560f83d63153cdf7b8f77eb6ec71e2c8 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -29,7 +29,7 @@ def quadrature_points_per_direction():
     # as soon as we have a generic implementation
 
     # Quadrature order
-    q = 2
+    q = 4
 
     # Quadrature points in per direction
     nb_qp = q // 2 + 1
@@ -159,6 +159,7 @@ def construct_theta(name):
 
 @class_member("operator")
 def typedef_theta(name):
+    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
     alignment = name_alignment()
     range_field = lop_template_range_field()
     return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
@@ -187,5 +188,17 @@ def name_theta():
 
 
 def evaluate_coefficient_gradient_sumfact():
-    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
-    name_theta()
+    # TODO this code is WIP and mainly used for experiments.
+
+    name = name_theta()
+    # TODO Hack
+    # m = name_number_of_quadrature_points_per_direction()
+    # n = name_number_of_basis_functions_per_direction()
+    m = quadrature_points_per_direction()
+    n = basis_functions_per_direction()
+    from dune.perftool.sumfact.sumfact import AMatrix, sumfact
+    a_matrix = AMatrix(name, m, n)
+    a_matrices = (a_matrix, a_matrix)
+    inp = 0
+    out = -1
+    sumfact(a_matrices, inp, out)
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
new file mode 100644
index 0000000000000000000000000000000000000000..47111314324546cc8555c1d6b19d1c851cb81033
--- /dev/null
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -0,0 +1,123 @@
+from dune.perftool.generation import (class_member,
+                                      constructor_block,
+                                      domain,
+                                      iname,
+                                      instruction,
+                                      no_caching,
+                                      preamble,
+                                      )
+
+from dune.perftool.sumfact.amatrix import name_alignment
+
+from dune.perftool.pdelab.localoperator import (
+    lop_template_range_field,
+)
+
+class AMatrix(object):
+    def __init__(self, a_matrix, m, n):
+        self.a_matrix = a_matrix
+        self.m = m
+        self.n = n
+
+    def __str__(self):
+        return "a_matrix: {}\nm: {}\nn: {}".format(self.a_matrix, self.m, self.n)
+
+
+class _GlobalBufferCounter(object):
+    counter = 0
+
+    def get(self):
+        _GlobalBufferCounter.counter = _GlobalBufferCounter.counter + 1
+        return _GlobalBufferCounter.counter
+
+
+@constructor_block("operator")
+def construct_buffer(name, a_matrices):
+    # buffer1 = (T*) aligned_malloc(8*std::max(static_cast<size_t>(M),static_cast<size_t>(N))*sizeof(T),alignment);
+    # if (buffer1==0) throw std::bad_alloc();
+    range_field = lop_template_range_field()
+
+    # Upper bound for the number of elements in the buffer. Note: This
+    # is usually much too large and could be improved.
+    max_size = 1
+    for mat in a_matrices:
+        max_size *= max(mat.n, mat.m)
+
+    alignment = name_alignment()
+    return ["{} = ({}*) aligned_malloc({}, {});".format(name, range_field, max_size, alignment),
+            "if ({}==0) throw std::bad_alloc();".format(name)]
+
+
+
+def type_buffer():
+    range_field = lop_template_range_field()
+    return "mutable {}*".format(range_field)
+
+
+@class_member("operator")
+def define_buffer(name, a_matrices):
+    buffer_type = type_buffer()
+    construct_buffer(name, a_matrices)
+    return "{} {};".format(buffer_type, name)
+
+
+def name_buffer(a_matrices):
+    name = "sumfact_buffer_" + str(_GlobalBufferCounter().get())
+    define_buffer(name, a_matrices)
+    return name
+
+
+def sumfact(a_matrices, inp, out):
+    # TODO: Think about caching.
+
+    # TODO: Some code suggests that the code works for different n/m
+    # per direction (eg. n_1 for x-direction and n_2 for y-direction,
+    # of course n!=m is allowed!). This is not tested at all
+    assert len(a_matrices) > 0
+    n = a_matrices[0].n
+    m = a_matrices[0].m
+    for mat in a_matrices:
+        assert n == mat.n
+        assert m == mat.m
+
+    # Special case for 1D
+    if len(a_matrices) == 1:
+        sumfact_kernel(a_matrices[0], inp, out)
+    else:
+        # Create buffer
+        buffer_1 = name_buffer(a_matrices)
+        if len(a_matrices) > 2:
+            buffer_2 = name_buffer(a_matrices)
+
+        if len(a_matrices)%2 == 0:
+            # First kernel
+            sumfact_kernel(a_matrices[0], inp, buffer_1)
+
+            # In between kernels
+            for mat_1, mat_2 in zip(a_matrices[1:-2:2], a_matrices[2:-1:2]):
+                sumfact_kernel(mat_1, buffer_1, buffer_2)
+                sumfact_kernel(mat_2, buffer_2, buffer_1)
+
+            # Last kernel
+            sumfact_kernel(a_matrices[-1], buffer_1, out)
+            None
+        else:
+            # First kernel
+            sumfact_kernel(a_matrices[0], inp, buffer_1)
+
+            # Second kernel
+            sumfact_kernel(a_matrices[1], buffer_1, buffer_2)
+
+            # In between kernels
+            for mat_2, mat_1 in zip(a_matrices[2:-2:2], a_matrices[3:-1:2]):
+                sumfact_kernel(mat_2, buffer_2, buffer_1)
+                sumfact_kernel(mat_1, buffer_1, buffer_2)
+
+            # Last kernel
+            sumfact_kernel(a_matrices[-1], buffer_2, out)
+
+
+@preamble(cache_key_generator=no_caching)
+def sumfact_kernel(a_matrix, inp, out):
+    print("sumfact kernel, inp: {}, out: {}".format(inp, out))
+    return "//// palpo"
diff --git a/test/sumfact/poisson/poisson.ufl b/test/sumfact/poisson/poisson.ufl
index 8c62dd3a24d2f64a3aec350c2096fd65967498d8..576c603e5c5f8b8d00bad8ea4cdcb6ec19ebd332 100644
--- a/test/sumfact/poisson/poisson.ufl
+++ b/test/sumfact/poisson/poisson.ufl
@@ -1,7 +1,9 @@
-f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());")
-g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());")
+cell = "quadrilateral"
 
-V = FiniteElement("CG", "triangle", 1, dirichlet_expression=g)
+f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());", cell=cell)
+g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());", cell=cell)
+
+V = FiniteElement("CG", cell, 1, dirichlet_expression=g)
 u = TrialFunction(V)
 v = TestFunction(V)