diff --git a/dune/perftool/sumfact/alignedmatvec.hh b/dune/perftool/sumfact/alignedmatvec.hh
new file mode 100644
index 0000000000000000000000000000000000000000..7f454fc3077bf9e08bbd7d1bcbc3abb0ffe1a5ed
--- /dev/null
+++ b/dune/perftool/sumfact/alignedmatvec.hh
@@ -0,0 +1,322 @@
+#ifndef __ALIGNEDMATVEC__
+#define __ALIGNEDMATVEC__
+
+#include <iostream>
+#include <iomanip>
+#include <tbb/scalable_allocator.h>
+
+/****************************************************************************/
+/****************************************************************************/
+// dynamic allocation of aligned memory blocks
+/****************************************************************************/
+/****************************************************************************/
+
+/** get a block of aligned memory
+
+    This function is based on malloc and allocates sufficient
+    extra memory to provide an aligned pointer;
+    use the correspondig free function to release the memory.
+*/
+void* aligned_malloc (size_t size, size_t alignment=16)
+{
+#if 1
+  // alignment must be reasonable
+  assert(alignment>=sizeof(void*));
+  assert(alignment%sizeof(void*)==0);
+
+  // get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic
+  void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size
+  if (p==0) return p; // not enough memory
+
+  // align the pointer
+  void **aligned_p = p;
+  while ( ((size_t)aligned_p)%alignment!=0)
+    {
+      aligned_p++;
+    }
+
+  // now put the original pointer just before that address
+  if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) )
+    {
+      void **pp = aligned_p;
+      pp--;
+      *pp = p;
+      return (void *) aligned_p;
+    }
+  else
+    {
+      aligned_p = aligned_p + (alignment/sizeof(void*));
+      void **pp = aligned_p;
+      pp--;
+      *pp = p;
+      return (void *) aligned_p;
+    }
+#else
+  return scalable_aligned_malloc(size,alignment);
+#endif
+}
+
+/** free a block of memory allocated through aligned_malloc
+ */
+void aligned_free (void* p)
+{
+#if 1
+  void **pp = (void**) p;
+  pp--;
+  free(*pp);
+#else
+  scalable_aligned_free(p);
+#endif
+}
+
+/****************************************************************************/
+/****************************************************************************/
+/* matrix class with
+   - aligned memory allocation
+   - possible padding of rows and columns
+   - row and column-wise access
+   - export of underlying pointer for BLAS-like operations
+*/
+/****************************************************************************/
+/****************************************************************************/
+
+/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout
+
+    - alignment is in bytes
+    - padding means allocated rows and columns is a multiple of padding
+    - padding=1 means that exactly m rows and m columns are allocated
+    - matrix allows row and columnwise access
+*/
+template<typename T=double, size_t alignment=16, size_t padding=1>
+class AlignedMatrix
+{
+  typedef T member_type;
+  typedef size_t size_type;
+
+public:
+  //! make a new matrix allocated on the heap
+  AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n)
+  {
+    // make padded_m, padded_n multiples of padding
+    if (alignment%sizeof(T)!=0) throw std::bad_alloc();
+    padded_m = m;
+    if (m%padding!=0) padded_m = (m/padding+1)*padding;
+    padded_n = n;
+    if (n%padding!=0) padded_n = (n/padding+1)*padding;
+
+    // allocate aligned array
+    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+    if (verbose) {
+      std::cout << "allocating aligned matrix" << std::endl;
+      std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl;
+      std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl;
+      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
+    }
+  }
+
+  //! copy constructor
+  AlignedMatrix (const AlignedMatrix& mat)
+  {
+    m = mat.m;
+    n = mat.n;
+    padded_m = m;
+    if (m%padding!=0) padded_m = (m/padding+1)*padding;
+    padded_n = n;
+    if (n%padding!=0) padded_n = (n/padding+1)*padding;
+
+    // allocate aligned array
+    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+  }
+
+  //! destroy the matrix
+  ~AlignedMatrix ()
+  {
+    aligned_free(p);
+  }
+
+  //! taking address returns the internal pointer
+  T* data () { return p; }
+
+  //! taking address returns the internal pointer
+  const T* data () const { return p; }
+
+  //! set new size
+  void resize (size_t newm, size_t newn)
+  {
+    if (newm*newn>padded_m*padded_n)
+      {
+        m = newm;
+        n = newn;
+      }
+    else
+      {
+        // throw away old matrix
+        aligned_free(p);
+
+        // set new size and reallocate
+        m = newm;
+        n = newn;
+
+        // make padded_m, padded_n multiples of padding
+        if (alignment%sizeof(T)!=0) throw std::bad_alloc();
+        padded_m = m;
+        if (m%padding!=0) padded_m = (m/padding+1)*padding;
+        padded_n = n;
+        if (n%padding!=0) padded_n = (n/padding+1)*padding;
+
+        // allocate aligned array
+        p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
+        if (p==0) throw std::bad_alloc();
+      }
+  }
+
+  //! get number of rows in use
+  size_t rows () const { return m;}
+
+  //! get number of columns in use
+  size_t cols () const { return n;}
+
+  //! get number of rows available
+  size_t padded_rows () const { return padded_m;}
+
+  //! get number of columns available
+  size_t padded_cols () const { return padded_n;}
+
+  //! access matrix elements as one big vector
+  const T& operator[] (size_t i) const {return p[i];}
+
+  //! access matrix elements as one big vector
+  T& operator[] (size_t i) {return p[i];}
+
+  //! access with j fastest; uses padding
+  const T& rowmajoraccess (size_t i, size_t j) const
+  {
+    return p[i*padded_n+j];
+  }
+
+  //! access with j fastest; uses padding
+  T& rowmajoraccess (size_t i, size_t j)
+  {
+    return p[i*padded_n+j];
+  }
+
+  //! access with i fastest; uses padding
+  const T& colmajoraccess (size_t i, size_t j) const
+  {
+    return p[j*padded_m+i];
+  }
+
+  //! access with i fastest; uses padding
+  T& colmajoraccess (size_t i, size_t j)
+  {
+    return p[j*padded_m+i];
+  }
+
+private:
+  size_t m,n;
+  size_t padded_m,padded_n;
+  T* p;
+};
+
+/****************************************************************************/
+/****************************************************************************/
+/* vector class with
+   - aligned memory allocation
+   - export of underlying pointer for BLAS-like operations
+*/
+/****************************************************************************/
+/****************************************************************************/
+
+/** Provide a memory aligned vector
+
+    - alignment is in bytes
+*/
+template<typename T=double, size_t alignment=16>
+class AlignedVector
+{
+  typedef T member_type;
+  typedef size_t size_type;
+
+public:
+  //! make a new vector
+  AlignedVector (size_t _n, bool verbose=false) : n(_n)
+  {
+    // allocate aligned array
+    p = (T*) aligned_malloc(n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+    if (verbose) {
+      std::cout << "allocating aligned vector" << std::endl;
+      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
+    }
+  }
+
+  AlignedVector()
+    : n(0)
+    , p(nullptr)
+  {}
+
+  //! copy constructor
+  AlignedVector (const AlignedVector& vec)
+  {
+    n = vec.n;
+    p = (T*) aligned_malloc(n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+  }
+
+  //! assignment operator
+  AlignedVector& operator=(const AlignedVector& vec)
+  {
+    if (p)
+      aligned_free(p);
+    n = vec.n;
+    p = (T*) aligned_malloc(n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+    std::copy(vec.p,vec.p+n,p);
+    return *this;
+  }
+
+  //! free vector
+  ~AlignedVector ()
+  {
+    if (p)
+      aligned_free(p);
+  }
+
+  //! resize vector
+  void resize (int _n)
+  {
+    if (_n<=n)
+      {
+        n = _n;
+        return;
+      }
+    if (p)
+      aligned_free(p);
+    n = _n;
+    p = (T*) aligned_malloc(n*sizeof(T),alignment);
+    if (p==0) throw std::bad_alloc();
+  }
+
+  //! taking address returns internal pointer to data
+  T* data () { return p; }
+
+  //! taking address returns internal pointer to data
+  const T* data () const { return p; }
+
+  //! get number of elements in vector
+  size_t size () const { return n;}
+
+  //! element access
+  const T& operator[] (size_t i) const {return p[i];}
+
+  //! element access
+  T& operator[] (size_t i) {return p[i];}
+
+private:
+  size_t n;
+  T* p;
+};
+
+#endif
diff --git a/dune/perftool/sumfact/onedquadrature.hh b/dune/perftool/sumfact/onedquadrature.hh
new file mode 100644
index 0000000000000000000000000000000000000000..72c4cde64cc7afb7e337b7bb7744ce33f8977e22
--- /dev/null
+++ b/dune/perftool/sumfact/onedquadrature.hh
@@ -0,0 +1,44 @@
+#ifndef DUNE_PERFTOOL_SUMFACT_ONEDQUADRATURE_HH
+#define DUNE_PERFTOOL_SUMFACT_ONEDQUADRATURE_HH
+
+
+#include <dune/geometry/quadraturerules.hh>
+
+
+//! Return sorted quadrature points and weight
+template <typename RF, typename DF, int m>
+void onedQuadraturePointsWeights(RF (&qp)[m], RF (&qw)[m]){
+  // Get oned quadrature rule
+  const int order = 2*m-2;
+  const auto& rule = Dune::QuadratureRules<DF, 1>::rule(Dune::GeometryType::cube,order);
+
+  // Save quadrature points and weight in qp and qp
+  size_t count=0;
+  for (const auto& ip : rule) {
+    size_t group=count/2;
+    size_t member=count%2;
+    size_t newj;
+    if (member==1) newj=group;
+    else newj=m-1-group;
+    qp[newj] = ip.position()[0];
+    qw[newj] = ip.weight();
+    // std::cout << "j=" << count << " newj=" << newj
+    //           << " qp=" << ip.position()[0]
+    //           << " qw=" << ip.weight()
+    //           << std::endl;
+    count++;
+  } // end 1D quadrature loop
+  // Order 1D quadrature points lexicographically
+  for (size_t j=0; j<m/2; j++){
+    if (qp[j]>0.5){
+      RF temp=qp[j];
+      qp[j] = qp[m-1-j];
+      qp[m-1-j] = temp;
+      temp=qw[j];
+      qw[j] = qw[m-1-j];
+      qw[m-1-j] = temp;
+    }
+  }
+}
+
+#endif
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 934208c8741376e7f205414e309b2ea79e59452f..1b084cbc18ac34b4b1193d3c45ed841d0766884a 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -12,6 +12,7 @@ from dune.perftool.generation.cache import (cached,
 from dune.perftool.generation.cpp import (base_class,
                                           class_basename,
                                           class_member,
+                                          constructor_block,
                                           constructor_parameter,
                                           dump_accumulate_timer,
                                           include_file,
diff --git a/python/dune/perftool/generation/cpp.py b/python/dune/perftool/generation/cpp.py
index 166dc9fe3fe94e824558c10dd5bffcfbc9b138d7..ea8727be7c700898724cc0e7ad903c0a1a1fdd63 100644
--- a/python/dune/perftool/generation/cpp.py
+++ b/python/dune/perftool/generation/cpp.py
@@ -75,6 +75,12 @@ def class_basename(classtag=None):
     return generator_factory(item_tags=("clazz", classtag, "basename"))
 
 
+def constructor_block(classtag=None):
+    assert classtag
+    from dune.perftool.generation import generator_factory
+    return generator_factory(item_tags=("clazz", classtag, "constructor_block"), counted=True)
+
+
 def dump_accumulate_timer(name):
     gen = generator_factory(item_tags=("dump_timers"), no_deco=True)
 
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 8ae4b6c5768931901df1a2812be9a08366525d3b..ef6db4e89d56b951b3272397991c237bcb43e6ef 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -36,6 +36,9 @@ def get_form_compiler_arguments():
     parser.add_argument("--ini-file", type=str, help="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)")
     parser.add_argument("--timer", action="store_true", help="measure times")
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
+    # TODO at some point this help description should be updated
+    parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
+    parser.add_argument("--sumfact-alignment", type=int, help="Alignment used in sumfactorization", default=64)
 
     # Modify the positional argument to not be a list
     args = vars(parser.parse_args())
diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py
index 7bba657e854589c5b872111d9cb48060e7bb85da..094d37b8a2f3dad5353b3a168b9b0a84068aa983 100644
--- a/python/dune/perftool/pdelab/argument.py
+++ b/python/dune/perftool/pdelab/argument.py
@@ -1,5 +1,7 @@
 """ Generator functions related to trial and test functions and the accumulation loop"""
 
+from dune.perftool.options import get_option
+
 from dune.perftool.generation import (domain,
                                       iname,
                                       pymbolic_expr,
@@ -39,6 +41,16 @@ def name_trialfunction_gradient(element, restriction, component):
     rawname = "gradu" + "_".join(str(c) for c in component)
     name = restricted_name(rawname, restriction)
     container = name_coefficientcontainer(restriction)
+
+    # TODO
+    #
+    # This is just a temporary test used to create an A-matrix as
+    # local operator class member. Right now it doesn't evaluate
+    # anything.
+    if get_option("sumfact"):
+        from dune.perftool.sumfact import evaluate_coefficient_gradient_sumfact
+        evaluate_coefficient_gradient_sumfact()
+
     evaluate_coefficient_gradient(element, name, container, restriction, component)
     return name
 
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index bc76d54bb196d781df1f9f12617fd2eb0cf6524d..5bf9b95c949e46abf6e88c9e982d0879c3ecedb2 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -623,7 +623,8 @@ def typedef_localoperator(name, formdata):
     include_file(filename, filetag="driver")
     from dune.perftool.pdelab.localoperator import localoperator_basename
     lopname = localoperator_basename(formdata, data)
-    return "using {} = {}<{}, {}>;".format(name, lopname, ugfs, vgfs)
+    range_type = type_range()
+    return "using {} = {}<{}, {}, {}>;".format(name, lopname, ugfs, vgfs, range_type)
 
 
 def type_localoperator(formdata):
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 475c052d7b5241a22eafdc366a32ecb389a06556..85f1f0a26d44deb57c357f635a24ef6b8e8bc42a 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -56,6 +56,24 @@ def lop_template_test_gfs():
     return "GFSV"
 
 
+@template_parameter("operator")
+def lop_template_range_field():
+    return "RF"
+
+
+@class_member("operator")
+def lop_domain_field(name):
+    # TODO: Rethink for not Galerkin Method
+    gfs = lop_template_ansatz_gfs()
+    return "using {} = typename {}::Traits::GridView::ctype;".format(name, gfs)
+
+
+def name_domain_field():
+    name = "DF"
+    lop_domain_field(name)
+    return name
+
+
 def lop_template_gfs(ma):
     from ufl.classes import Argument, Coefficient
     if isinstance(ma.argexpr, Argument):
@@ -383,12 +401,14 @@ def cgen_class_from_cache(tag, members=[]):
 
     base_classes = [bc for bc in retrieve_cache_items('{} and baseclass'.format(tag))]
     constructor_params = [bc for bc in retrieve_cache_items('{} and constructor_param'.format(tag))]
+    from cgen import Block
+    constructor_block = Block(contents=[i for i in retrieve_cache_items("{} and constructor_block".format(tag), make_generable=True)])
     il = [i for i in retrieve_cache_items('{} and initializer'.format(tag))]
     pm = [m for m in retrieve_cache_items('{} and member'.format(tag))]
     tparams = [i for i in retrieve_cache_items('{} and template_param'.format(tag))]
 
     from dune.perftool.cgen.clazz import Constructor
-    constructor = Constructor(arg_decls=constructor_params, clsname=basename, initializer_list=il)
+    constructor = Constructor(block=constructor_block, arg_decls=constructor_params, clsname=basename, initializer_list=il)
 
     from dune.perftool.cgen import Class
     return Class(basename, base_classes=base_classes, members=members + pm, constructors=[constructor], tparam_decls=tparams)
@@ -412,6 +432,7 @@ def generate_localoperator_kernels(formdata, data):
     localoperator_basename(formdata, data)
     lop_template_ansatz_gfs()
     lop_template_test_gfs()
+    lop_template_range_field()
     from dune.perftool.pdelab.parameter import parameterclass_basename
     parameterclass_basename(formdata, data)
 
@@ -424,8 +445,8 @@ def generate_localoperator_kernels(formdata, data):
     base_class('Dune::PDELab::LocalOperatorDefaultFlags', classtag="operator")
     from dune.perftool.pdelab.driver import is_stationary
     if not is_stationary():
-        # TODO replace double with clever typename stuff
-        base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<double>', classtag="operator")
+        rf = lop_template_range_field()
+        base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>'.format(rf), classtag="operator")
 
         # Create set time method in parameter class
         from dune.perftool.pdelab.parameter import define_set_time_method
diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..daaea90a264472f83ea13bd9a58fc3fc356a72c9
--- /dev/null
+++ b/python/dune/perftool/sumfact/__init__.py
@@ -0,0 +1 @@
+from dune.perftool.sumfact.amatrix import (evaluate_coefficient_gradient_sumfact)
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..b30b5b6e37b3d4592a6040d36324ff63a05155e0
--- /dev/null
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -0,0 +1,191 @@
+from dune.perftool.options import get_option
+
+from dune.perftool.generation import (class_member,
+                                      constructor_block,
+                                      get_global_context_value,
+                                      include_file,
+                                      initializer_list,
+                                      )
+
+from dune.perftool.pdelab.localoperator import (name_domain_field,
+                                                lop_template_range_field,
+                                                )
+
+
+@class_member("operator")
+def define_alignment(name):
+    alignment = get_option("sumfact_alignment")
+    return "enum {{ {} = {} }};".format(name, str(alignment))
+
+
+def name_alignment():
+    name = "alignment"
+    define_alignment(name)
+    return name
+
+
+def quadrature_points_per_direction():
+    # TODO use quadrature order from dune.perftool.pdelab.quadrature
+    # as soon as we have a generic implementation
+
+    # Quadrature order
+    q = 2
+
+    # Quadrature points in per direction
+    nb_qp = q / 2 + 1
+
+    return nb_qp
+
+
+@class_member("operator")
+def define_number_of_quadrature_points_per_direction(name):
+    number_qp = quadrature_points_per_direction()
+    return "enum {{ {} = {} }};".format(name, str(number_qp))
+
+
+def name_number_of_quadrature_points_per_direction():
+    name = "m"
+    define_number_of_quadrature_points_per_direction(name)
+    return name
+
+
+def polynomial_degree():
+    form = get_global_context_value("formdata").preprocessed_form
+    return form.coefficients()[0].ufl_element()._degree
+
+
+def basis_functions_per_direction():
+    return polynomial_degree() + 1
+
+
+@class_member("operator")
+def define_number_of_basis_functions_per_direction(name):
+    number_basis = basis_functions_per_direction()
+    return "enum {{ {} = {} }};".format(name, str(number_basis))
+
+
+def name_number_of_basis_functions_per_direction():
+    name = "n"
+    define_number_of_basis_functions_per_direction(name)
+    return name
+
+
+@class_member("operator")
+def define_oned_quadrature_weights(name):
+    range_field = lop_template_range_field()
+    number_qp = name_number_of_quadrature_points_per_direction()
+    return "{} {}[{}];".format(range_field, name, number_qp)
+
+
+def name_oned_quadrature_weights():
+    name = "qw"
+    define_oned_quadrature_weights(name)
+    return name
+
+
+@class_member("operator")
+def define_oned_quadrature_points(name):
+    range_field = lop_template_range_field()
+    number_qp = name_number_of_quadrature_points_per_direction()
+    return "{} {}[{}];".format(range_field, name, number_qp)
+
+
+def name_oned_quadrature_points():
+    name = "qp"
+    define_oned_quadrature_points(name)
+    return name
+
+
+@class_member("operator")
+def typedef_polynomials(name):
+    range_field = lop_template_range_field()
+    domain_field = name_domain_field()
+    degree = polynomial_degree()
+
+    include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="operatorfile")
+
+    # TODO: make switching polynomials possible
+    return "using {} = Dune::QkStuff::GaussLobattoLagrangePolynomials<{}, {}, {}>;".format(name,
+                                                                                           domain_field,
+                                                                                           range_field,
+                                                                                           degree)
+
+
+def type_polynomials():
+    name = "Polynomials1D"
+    typedef_polynomials(name)
+    return name
+
+
+@class_member("operator")
+def define_polynomials(name):
+    polynomials_type = type_polynomials()
+    return "{} {};".format(polynomials_type, name)
+
+
+def name_polynomials():
+    name = "poly"
+    define_polynomials(name)
+    return name
+
+
+@constructor_block("operator")
+def sort_quadrature_points_weights():
+    range_field = lop_template_range_field()
+    domain_field = name_domain_field()
+    number_qp = name_number_of_quadrature_points_per_direction()
+    qp = name_oned_quadrature_points()
+    qw = name_oned_quadrature_weights()
+    include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile")
+    return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});".format(range_field, domain_field, number_qp, qp, qw)
+
+
+@constructor_block("operator")
+def construct_theta(name):
+    # Make sure that the quadrature points are sorted
+    sort_quadrature_points_weights()
+
+    m = name_number_of_quadrature_points_per_direction()
+    n = name_number_of_basis_functions_per_direction()
+    polynomials = name_polynomials()
+    qp = name_oned_quadrature_points()
+
+    return ["for (std::size_t i=0; i<{}; i++){{".format(m),
+            "  for (std::size_t j=0; j<{}; j++){{".format(n),
+            "    {}.colmajoraccess(i,j) = {}.p(j,{}[i]);".format(name, polynomials, qp),
+            "  }",
+            "}"]
+
+
+@class_member("operator")
+def typedef_theta(name):
+    alignment = name_alignment()
+    range_field = lop_template_range_field()
+    return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
+
+
+def type_theta():
+    name = "AMatrix"
+    typedef_theta(name)
+    return name
+
+
+@class_member("operator")
+def define_theta(name):
+    theta_type = type_theta()
+    number_qp = name_number_of_quadrature_points_per_direction()
+    number_basis = name_number_of_basis_functions_per_direction()
+    initializer_list(name, [str(number_qp), str(number_basis)], classtag="operator")
+    construct_theta(name)
+    return "{} {};".format(theta_type, name)
+
+
+def name_theta():
+    name = "Theta"
+    define_theta(name)
+    return name
+
+
+def evaluate_coefficient_gradient_sumfact():
+    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
+    name_theta()
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d41b93944ed9bdad71cf18b0d3fbd337589c6012..f0812d3ee192ce780fc9288ca995f58d28c95475 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -3,3 +3,5 @@ add_subdirectory(laplace)
 add_subdirectory(nonlinear)
 add_subdirectory(poisson)
 add_subdirectory(stokes)
+
+add_subdirectory(sumfact)
diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6ef2765c02a5e22303da1bfe440916c3fce78d26
--- /dev/null
+++ b/test/sumfact/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(poisson)
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4620b9a53c6a052155d08b7a42dbdf94c6e9aba5
--- /dev/null
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -0,0 +1,11 @@
+# 1. Poisson Test Case: source f, bc g
+dune_add_formcompiler_system_test(UFLFILE poisson.ufl
+                                  BASENAME sumfact_poisson
+                                  INIFILE poisson.mini
+                                  )
+
+# # 2. Poisson Test Case: DG, f + pure dirichlet
+# dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+#                                   BASENAME sumfact_poisson_dg
+#                                   INIFILE poisson_dg.mini
+#                                   )
diff --git a/test/sumfact/poisson/poisson.mini b/test/sumfact/poisson/poisson.mini
new file mode 100644
index 0000000000000000000000000000000000000000..5f07c985767d2ef9bfbe5e5bebcf82d801222833
--- /dev/null
+++ b/test/sumfact/poisson/poisson.mini
@@ -0,0 +1,18 @@
+__name = sumfact_poisson_{__exec_suffix}
+__exec_suffix = numdiff, symdiff | expand num
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+numerical_jacobian = 1, 0 | expand num
+exact_solution_expression = g
+compare_l2errorsquared = 1e-6
+sumfact = 1
diff --git a/test/sumfact/poisson/poisson.ufl b/test/sumfact/poisson/poisson.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..8c62dd3a24d2f64a3aec350c2096fd65967498d8
--- /dev/null
+++ b/test/sumfact/poisson/poisson.ufl
@@ -0,0 +1,8 @@
+f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());")
+g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());")
+
+V = FiniteElement("CG", "triangle", 1, dirichlet_expression=g)
+u = TrialFunction(V)
+v = TestFunction(V)
+
+forms = [(inner(grad(u), grad(v)) - f*v)*dx]
diff --git a/test/sumfact/poisson/poisson_dg.mini b/test/sumfact/poisson/poisson_dg.mini
new file mode 100644
index 0000000000000000000000000000000000000000..8a2000a43db23777bd2a16ef034a794460bd3b98
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg.mini
@@ -0,0 +1,18 @@
+__name = sumfact_poisson_dg_{__exec_suffix}
+__exec_suffix = numdiff, symdiff | expand num
+
+lowerleft = 0.0 0.0
+upperright = 1.0 1.0
+elements = 32 32
+elementType = simplical
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_dg_ref
+extension = vtu
+
+[formcompiler]
+numerical_jacobian = 1, 0 | expand num
+exact_solution_expression = g
+compare_l2errorsquared = 2e-5
+sumfact = 1
diff --git a/test/sumfact/poisson/poisson_dg.ufl b/test/sumfact/poisson/poisson_dg.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..0ee5a857931d2ceea62b8cf5e52c5aed2b696851
--- /dev/null
+++ b/test/sumfact/poisson/poisson_dg.ufl
@@ -0,0 +1,26 @@
+f = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return 4.*(1.-c.two_norm2())*std::exp(-1.*c.two_norm2());")
+g = Expression("Dune::FieldVector<double,2> c(0.5); c-= x; return std::exp(-1.*c.two_norm2());", on_intersection=True)
+
+cell = triangle
+V = FiniteElement("DG", cell, 1)
+
+u = TrialFunction(V)
+v = TestFunction(V)
+
+n = FacetNormal(cell)('+')
+
+gamma = 1.0
+theta = 1.0
+
+r = inner(grad(u), grad(v))*dx \
+  + inner(n, avg(grad(u)))*jump(v)*dS \
+  + gamma*jump(u)*jump(v)*dS \
+  + theta*jump(u)*inner(avg(grad(v)), n)*dS \
+  - inner(n, grad(u))*v*ds \
+  + gamma*u*v*ds \
+  - theta*u*inner(grad(v), n)*ds \
+  - f*v*dx \
+  + theta*g*inner(grad(v), n)*ds \
+  - gamma*g*v*ds
+
+forms = [r]