diff --git a/patches/loopy/Current.patch b/patches/loopy/Current.patch
index e6bfaf884633839b9a0b4aa43c2c50bbeb27d2e5..0709ea528977ffcf0724283f7c22f5e96898ab12 100644
--- a/patches/loopy/Current.patch
+++ b/patches/loopy/Current.patch
@@ -1,12 +1,12 @@
-From d75091179a72f10c110276720355dbf82ed33ef3 Mon Sep 17 00:00:00 2001
+From 87331320981ca8f7419f32b48609e6dc465bbfff Mon Sep 17 00:00:00 2001
 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
-Date: Fri, 9 Dec 2016 14:57:44 +0100
-Subject: [PATCH] Current loopy patch
+Date: Tue, 20 Dec 2016 11:25:11 +0100
+Subject: [PATCH] Current patch
 
 ---
  loopy/check.py               | 10 +++++-----
- loopy/codegen/instruction.py |  6 +++---
- 2 files changed, 8 insertions(+), 8 deletions(-)
+ loopy/codegen/instruction.py | 43 ++++++++++++++++++++++---------------------
+ 2 files changed, 27 insertions(+), 26 deletions(-)
 
 diff --git a/loopy/check.py b/loopy/check.py
 index 7562eac..ac03be0 100644
@@ -37,10 +37,54 @@ index 7562eac..ac03be0 100644
          kernel.target.pre_codegen_check(kernel)
          check_that_shapes_and_strides_are_arguments(kernel)
 diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
-index c490abb..ec68921 100644
+index c490abb..138fd6f 100644
 --- a/loopy/codegen/instruction.py
 +++ b/loopy/codegen/instruction.py
-@@ -219,9 +219,9 @@ def generate_call_code(codegen_state, insn):
+@@ -105,24 +105,25 @@ def generate_assignment_instruction_code(codegen_state, insn):
+ 
+     # {{{ vectorization handling
+ 
+-    if codegen_state.vectorization_info:
+-        if insn.atomicity:
+-            raise Unvectorizable("atomic operation")
+-
+-        vinfo = codegen_state.vectorization_info
+-        vcheck = VectorizabilityChecker(
+-                kernel, vinfo.iname, vinfo.length)
+-        lhs_is_vector = vcheck(insn.assignee)
+-        rhs_is_vector = vcheck(insn.expression)
+-
+-        if not lhs_is_vector and rhs_is_vector:
+-            raise Unvectorizable(
+-                    "LHS is scalar, RHS is vector, cannot assign")
+-
+-        is_vector = lhs_is_vector
+-
+-        del lhs_is_vector
+-        del rhs_is_vector
++#    if codegen_state.vectorization_info:
++#        if insn.atomicity:
++#            raise Unvectorizable("atomic operation")
++#
++#        vinfo = codegen_state.vectorization_info
++#        vcheck = VectorizabilityChecker(
++#                kernel, vinfo.iname, vinfo.length)
++#        lhs_is_vector = vcheck(insn.assignee)
++#        rhs_is_vector = vcheck(insn.expression)
++#
++#        if not lhs_is_vector and rhs_is_vector:
++#            raise Unvectorizable(
++#                    "LHS is scalar, RHS is vector, cannot assign")
++#
++#        is_vector = lhs_is_vector
++#
++#        del lhs_is_vector
++#        del rhs_is_vector
++    is_vector = False
+ 
+     # }}}
+ 
+@@ -219,9 +220,9 @@ def generate_call_code(codegen_state, insn):
  
      # {{{ vectorization handling
  
diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py
index 131bed1c11c98bf51a12659fa507d3edc3eba778..7efae2d54331886f081fbf16102c6ddf77892763 100644
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -7,6 +7,7 @@ from pymbolic.mapper.substitutor import make_subst_func
 
 import loopy as lp
 import pymbolic.primitives as prim
+from six.moves import intern
 
 
 #
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index d1ed485043989dbae7e0225ace2eb0e8e22a1aea..5f27ede95ccb273d089f231fc64083cb0c854d51 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -42,6 +42,7 @@ def get_form_compiler_arguments():
     parser.add_argument("--timer", action="store_true", help="measure times")
     parser.add_argument("--opcounter", action="store_true", default=False, help="Count operations. Should only be used with yaspgrid. Timer should be set.")
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
+    parser.add_argument("--fastdg", action="store_true", help="Use FastDGGridOperator from PDELab.")
     # TODO at some point this help description should be updated
     parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
     parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization")
diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py
index 336eb60cbd221d72b3337833e63cdd183a14a178..39ef2e247a8206c78339d3e6c6dad0ec242d8c50 100644
--- a/python/dune/perftool/pdelab/driver.py
+++ b/python/dune/perftool/pdelab/driver.py
@@ -346,14 +346,34 @@ def name_fem(expr):
 
 
 @preamble
-def typedef_vectorbackend(name):
+def define_blocksize(name, expr):
+    assert isDG(expr)
+    assert isQuadrilateral(expr)
+    dimension = name_dimension()
+    degree = expr._degree
+    return "static const int {} = Dune::QkStuff::QkSize<{}, {}>::value;".format(name, degree, dimension)
+
+
+def name_blocksize(expr):
+    name = "blocksize"
+    define_blocksize(name, expr)
+    return name
+
+
+@preamble
+def typedef_vectorbackend(name, expr):
     include_file("dune/pdelab/backend/istl.hh", filetag="driver")
-    return "using {} = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none, 1>;".format(name)
+    if get_option("fastdg"):
+        blocksize = name_blocksize(expr)
+        return "using {} = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::fixed, {}>;".format(name, blocksize)
+    else:
+        return "using {} = Dune::PDELab::istl::VectorBackend<Dune::PDELab::istl::Blocking::none, 1>;".format(name)
 
 
-def type_vectorbackend():
-    typedef_vectorbackend("VectorBackend")
-    return "VectorBackend"
+def type_vectorbackend(expr):
+    name = "VectorBackend"
+    typedef_vectorbackend(name, expr)
+    return name
 
 
 def type_orderingtag():
@@ -497,7 +517,7 @@ def name_assembled_constraints(expr):
 
 @preamble
 def typedef_gfs(element, dirichlet, name):
-    vb = type_vectorbackend()
+    vb = type_vectorbackend(element)
     from ufl import FiniteElement, MixedElement, VectorElement, EnrichedElement, RestrictedElement, TensorElement
     if isinstance(element, FiniteElement):
         gv = type_leafview()
@@ -670,8 +690,14 @@ def typedef_gridoperator(name, formdata):
     mb = type_matrixbackend()
     df = type_domainfield()
     r = type_range()
-    include_file("dune/pdelab/gridoperator/gridoperator.hh", filetag="driver")
-    return "using {} = Dune::PDELab::GridOperator<{}, {}, {}, {}, {}, {}, {}, {}, {}>;".format(name, ugfs, vgfs, lop, mb, df, r, r, ucc, vcc)
+    if get_option("fastdg"):
+        if not get_option("sumfact"):
+            raise PerftoolCodegenError("FastDGGridOperator is only implemented for sumfactorization.")
+        include_file("dune/pdelab/gridoperator/fastdg.hh", filetag="driver")
+        return "using {} = Dune::PDELab::FastDGGridOperator<{}, {}, {}, {}, {}, {}, {}, {}, {}>;".format(name, ugfs, vgfs, lop, mb, df, r, r, ucc, vcc)
+    else:
+        include_file("dune/pdelab/gridoperator/gridoperator.hh", filetag="driver")
+        return "using {} = Dune::PDELab::GridOperator<{}, {}, {}, {}, {}, {}, {}, {}, {}>;".format(name, ugfs, vgfs, lop, mb, df, r, r, ucc, vcc)
 
 
 def type_gridoperator(formdata):
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index c13ab899aab3c75fdbc6997f39409e506a9019c4..3822ba43818301ea8a4240163cdefb0239f5ae7e 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -8,7 +8,6 @@ from dune.perftool.generation import (class_member,
                                       domain,
                                       function_mangler,
                                       get_global_context_value,
-                                      globalarg,
                                       iname,
                                       include_file,
                                       initializer_list,
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index e77001d44fa9a0dda206e41a6350604ede5a301a..d3452f31a0391493ce21a571a9c06c98fe4a6625 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -29,10 +29,12 @@ from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.switch import (get_facedir,
                                           get_facemod,
                                           )
+from dune.perftool.pdelab.argument import name_coefficientcontainer
 from dune.perftool.pdelab.geometry import (local_dimension,
                                            world_dimension,
                                            )
 from dune.perftool.loopy.buffer import initialize_buffer
+from dune.perftool.options import get_option
 from dune.perftool.pdelab.driver import FEM_name_mangling
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.tools import maybe_wrap_subscript
@@ -44,6 +46,7 @@ from loopy.match import Writes
 import pymbolic.primitives as prim
 
 
+
 def name_sumfact_base_buffer():
     count = get_counter('sumfact_base_buffer')
     name = "buffer_{}".format(str(count))
@@ -75,33 +78,39 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor):
 
         # Get the vectorization info. If this happens during the dry run, we get dummies
         from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, restriction)
+        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
 
         # Initialize the buffer for the sum fact kernel
         shape = (product(mat.cols for mat in a_matrices),)
         if index is not None:
             shape = shape + (4,)
-        input = initialize_buffer(buffer,
-                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
-                                  num=2
-                                  ).get_temporary(shape=shape,
-                                                  name=input,
-                                                  )
+        inp = initialize_buffer(buf,
+                                base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
+                                num=2
+                                ).get_temporary(shape=shape,
+                                                name=inp,
+                                                )
         if insn_dep is None:
-            insn_dep = frozenset({Writes(input)})
+            insn_dep = frozenset({Writes(inp)})
 
-        # Setup the input!
-        setup_theta(input, element, restriction, component, index)
+        if get_option('fastdg'):
+            # Name of direct input, shape and globalarg is set in sum_factorization_kernel
+            direct_input = name_coefficientcontainer(restriction)
+        else:
+            direct_input = None
+            # Setup the input!
+            setup_theta(inp, element, restriction, component, index)
 
         # Add a sum factorization kernel that implements the
         # evaluation of the gradients of basis functions at quadrature
         # points (stage 1)
         var, insn_dep = sum_factorization_kernel(a_matrices,
-                                                 buffer,
+                                                 buf,
                                                  1,
                                                  preferred_position=i,
                                                  insn_dep=insn_dep,
                                                  restriction=restriction,
+                                                 direct_input=direct_input,
                                                  )
         buffers.append(var)
 
@@ -139,31 +148,38 @@ def pymbolic_trialfunction(element, restriction, component, visitor):
 
     # Get the vectorization info. If this happens during the dry run, we get dummies
     from dune.perftool.sumfact.vectorization import get_vectorization_info
-    a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, restriction)
+    a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
 
     # Flip flop buffers for sumfactorization
     shape = (product(mat.cols for mat in a_matrices),)
     if index is not None:
         shape = shape + (4,)
-    initialize_buffer(buffer,
+    initialize_buffer(buf,
                       base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                       num=2
                       ).get_temporary(shape=shape,
-                                      name=input,
+                                      name=inp,
                                       )
 
-    # Setup the input!
-    setup_theta(input, element, restriction, component, index)
+    # TODO: fastdg and vectorization
+    if get_option('fastdg'):
+        # Name of direct input, shape and globalarg is set in sum_factorization_kernel
+        direct_input = name_coefficientcontainer(restriction)
+    else:
+        direct_input = None
+        # Setup the input!
+        setup_theta(inp, element, restriction, component, index)
 
     # Add a sum factorization kernel that implements the evaluation of
     # the basis functions at quadrature points (stage 1)
     var, _ = sum_factorization_kernel(a_matrices,
-                                      buffer,
+                                      buf,
                                       1,
                                       preferred_position=None,
-                                      insn_dep=frozenset({Writes(input)}),
+                                      insn_dep=frozenset({Writes(inp)}),
                                       outshape=tuple(mat.rows for mat in a_matrices if mat.rows != 1),
                                       restriction=restriction,
+                                      direct_input=direct_input,
                                       )
 
     if index:
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 797a700f32add368a171d7038707c0f9d31c4208..e135fb75a1a8650a088852f95d67d1da03a4b77e 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -58,7 +58,6 @@ from pymbolic.primitives import (Call,
                                  )
 from dune.perftool.sumfact.quadrature import quadrature_inames
 from dune.perftool.sumfact.vectorization import find_sumfact
-from loopy import Reduction, GlobalArg
 from loopy.symbolic import FunctionIdentifier, IdentityMapper
 
 import loopy as lp
@@ -142,7 +141,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
 
         # Get the vectorization info. If this happens during the dry run, we get dummies
         from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
+        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
 
         # Initialize a base storage for this buffer and get a temporay pointing to it
         shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1)
@@ -156,12 +155,12 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
             index = ()
             vectag = frozenset()
 
-        temp = initialize_buffer(buffer,
+        temp = initialize_buffer(buf,
                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                                  num=2
                                  ).get_temporary(shape=shape,
                                                  dim_tags=dim_tags,
-                                                 name=input,
+                                                 name=inp,
                                                  )
 
         # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron
@@ -203,7 +202,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
         # with the test function (stage 3)
         pref_pos = i if accterm.argument.index else None
         result, insn_dep = sum_factorization_kernel(a_matrices,
-                                                    buffer,
+                                                    buf,
                                                     3,
                                                     insn_dep=insn_dep,
                                                     additional_inames=frozenset(visitor.inames),
@@ -245,18 +244,53 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                (maybe_wrap_subscript(result, prim.Variable(iname)),),
                                )
 
-        expr = Call(PDELabAccumulationFunction(accum, rank),
-                    (ansatz_lfs.get_args() +
-                     test_lfs.get_args() +
-                     (result,)
-                     )
-                    )
-        instruction(assignees=(),
-                    expression=expr,
-                    forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
-                    forced_iname_deps_is_final=True,
-                    depends_on=insn_dep,
-                    )
+        # In the case of FastDGGridOperator we can write directly into the resiudal/jacobi
+        #
+        # TODO: At the moment this only works if we do not vectorize
+        # (over gradients) because loopy tries to acces a vectorclass
+        # variable.
+        if get_option('fastdg'):
+            ft = get_global_context_value("form_type")
+            if ft=='residual':
+                accum = accum + ".data()"
+                size = basis_functions_per_direction() ** world_dimension()
+                globalarg(accum, dtype=np.float64, shape=(size,), managed=False)
+                assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,))
+                expression = prim.Sum((assignee,result))
+                instruction(assignee=assignee,
+                            expression=expression,
+                            forced_iname_deps=frozenset(inames),
+                            forced_iname_deps_is_final=True,
+                            depends_on=insn_dep,
+                            )
+            else:
+                assert ft=='jacobian'
+                accum = accum + ".data()"
+                size = basis_functions_per_direction() ** world_dimension()
+                globalarg(accum, dtype=np.float64, shape=(size, size), managed=True)
+                assignee = prim.Subscript(prim.Variable(accum), (ansatz_lfs.index, test_lfs.index))
+                expression = prim.Sum((assignee,result))
+                instruction(assignee=assignee,
+                            expression=expression,
+                            forced_iname_deps=frozenset(inames + visitor.inames),
+                            forced_iname_deps_is_final=True,
+                            depends_on=insn_dep,
+                            )
+        # Default: Generate accumulation instructions
+        else:
+            expr = Call(PDELabAccumulationFunction(accum, rank),
+                        (ansatz_lfs.get_args() +
+                         test_lfs.get_args() +
+                         (result,)
+                        )
+            )
+            instruction(assignees=(),
+                        expression=expr,
+                        forced_iname_deps=frozenset(inames + visitor.inames + vecinames),
+                        forced_iname_deps_is_final=True,
+                        depends_on=insn_dep,
+            )
+
 
         # Mark the transformation that moves the quadrature loop inside the trialfunction loops for application
         transform(nest_quadrature_loops, visitor.inames)
@@ -278,12 +312,15 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
 
 
 @generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0)))
-def sum_factorization_kernel(a_matrices, buf, stage,
+def sum_factorization_kernel(a_matrices,
+                             buf,
+                             stage,
                              insn_dep=frozenset({}),
                              additional_inames=frozenset({}),
                              preferred_position=None,
                              outshape=None,
                              restriction=0,
+                             direct_input=None,
                              ):
     """
     Calculate a sum factorization matrix product.
@@ -320,58 +357,75 @@ def sum_factorization_kernel(a_matrices, buf, stage,
                                   )})
 
     for l, a_matrix in enumerate(a_matrices):
-        # Get a temporary that interprets the base storage of the input
-        # as a column-major matrix. In later iteration of the amatrix loop
-        # this reinterprets the output of the previous iteration.
+        # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
+        # and get inames that realize the product.
         inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
-        inp = get_buffer_temporary(buf,
-                                   shape=inp_shape + vec_shape,
-                                   dim_tags=ftags)
-
-        # The input temporary will only be read from, so we need to silence the loopy warning
-        silenced_warning('read_no_write({})'.format(inp))
-
-        switch_base_storage(buf)
-
-        # Get a temporary that interprets the base storage of the output
-        # as row-major matrix
         out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
-
-        out = get_buffer_temporary(buf,
-                                   shape=out_shape + vec_shape,
-                                   dim_tags=ctags)
-
-        # Get the inames needed for one matrix-matrix multiplication
         i = sumfact_iname(out_shape[0], "row")
         j = sumfact_iname(out_shape[1], "col")
-
-        # Maybe introduce a vectorization iname for this matrix-matrix multiplication
         vec_iname = ()
         if a_matrix.vectorized:
             iname = sumfact_iname(4, "vec")
             vec_iname = (prim.Variable(iname),)
             transform(lp.tag_inames, [(iname, "vec")])
 
-        if a_matrix.cols == 1:
-            # A trivial reduction is implemented as a product, otherwise we run into
-            # a code generation corner case producing way too complicated code. This
-            # could be fixed upstream, but the loopy code realizing reductions is not
-            # trivial and the priority is kind of low.
-            matprod = Product((Subscript(Variable(a_matrix.name), (Variable(i), 0) + vec_iname),
-                               Subscript(Variable(inp), (0, Variable(j)) + vec_iname)
-                               ))
-        else:
+        # A trivial reduction is implemented as a product, otherwise we run into
+        # a code generation corner case producing way too complicated code. This
+        # could be fixed upstream, but the loopy code realizing reductions is not
+        # trivial and the priority is kind of low.
+        if a_matrix.cols != 1:
             k = sumfact_iname(a_matrix.cols, "red")
+            k_expr = prim.Variable(k)
+        else:
+            k_expr = 0
+
+        # Setup the input of the sum factorization kernel. In the first matrix multiplication
+        # this can be taken from
+        # * an input temporary (default)
+        # * a global data structure (if FastDGGridOperator is in use)
+        # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
+        if l==0 and direct_input is not None:
+            globalarg(direct_input, dtype=np.float64, shape=inp_shape)
+            if a_matrix.vectorized:
+                input_summand = prim.Call(prim.Variable("Vec4d"), (prim.Subscript(prim.Variable(direct_input),
+                                                                                  (k_expr, prim.Variable(j))),))
+            else:
+                input_summand = prim.Subscript(prim.Variable(direct_input),
+                                               (k_expr, prim.Variable(j)) + vec_iname)
+        else:
+            # Get a temporary that interprets the base storage of the input
+            # as a column-major matrix. In later iteration of the amatrix loop
+            # this reinterprets the output of the previous iteration.
+            inp = get_buffer_temporary(buf,
+                                       shape=inp_shape + vec_shape,
+                                       dim_tags=ftags)
+
+            # The input temporary will only be read from, so we need to silence the loopy warning
+            silenced_warning('read_no_write({})'.format(inp))
+
+            input_summand = prim.Subscript(prim.Variable(inp),
+                                           (k_expr, prim.Variable(j)) + vec_iname)
+
+        switch_base_storage(buf)
+
+        # Get a temporary that interprets the base storage of the output
+        # as row-major matrix
+        out = get_buffer_temporary(buf,
+                                   shape=out_shape + vec_shape,
+                                   dim_tags=ctags)
 
-            # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-            prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
-                            Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
-                            ))
-            matprod = Reduction("sum", k, prod)
+        # Write the matrix-matrix multiplication expression
+        matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(i), k_expr) + vec_iname),
+                           input_summand
+                           ))
+        if a_matrix.cols != 1:
+            # ... which may be a reduction, if k>0
+            matprod = lp.Reduction("sum", k, matprod)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
-        insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
+        assignee = prim.Subscript(prim.Variable(out), (prim.Variable(i), prim.Variable(j)) + vec_iname)
+        insn_dep = frozenset({instruction(assignee=assignee,
                                           expression=matprod,
                                           forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                           forced_iname_deps_is_final=True,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index 100f3b2149d0630acaccaa6c6347bc9f2bae4849..0adcc6ab3b833fe010f36afea3ea17b087f60a37 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -13,8 +13,8 @@ import loopy as lp
 
 
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, r, *args: (a, r))
-def vectorization_info(a_matrices, restriction, new_a_matrices, buffer, input, index, padding):
-    return (new_a_matrices, buffer, input, index, padding)
+def vectorization_info(a_matrices, restriction, new_a_matrices, buf, inp, index, padding):
+    return (new_a_matrices, buf, inp, index, padding)
 
 
 def get_vectorization_info(a_matrices, restriction):
@@ -63,8 +63,8 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
                 position_mapping[sumf] = available.pop()
 
         # Enable vectorization strategy:
-        input = get_counted_variable("joined_input")
-        buffer = get_counted_variable("joined_buffer")
+        inp = get_counted_variable("joined_input")
+        buf = get_counted_variable("joined_buffer")
 
         # Collect the large matrices!
         large_a_matrices = []
@@ -88,7 +88,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
             large_a_matrices.append(large)
 
         for sumf in stage_sumfacts:
-            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buffer, input, position_mapping[sumf], frozenset(available))
+            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buf, inp, position_mapping[sumf], frozenset(available))
     else:
         # Disable vectorization strategy
         no_vectorization(stage_sumfacts)
diff --git a/test/poisson/CMakeLists.txt b/test/poisson/CMakeLists.txt
index 932fdafb5a939c10b4b1825bfcf9055b456c2f96..844fdce8d338ebfcb59b916d21d97d87829f0d60 100644
--- a/test/poisson/CMakeLists.txt
+++ b/test/poisson/CMakeLists.txt
@@ -54,7 +54,7 @@ dune_add_formcompiler_system_test(UFLFILE opcount_poisson_dg.ufl
                                   INIFILE opcount_poisson_dg_symdiff.mini
                                   )
 
-# 3. Poisson Test Case: DG, f + pure dirichlet
+# 9. Poisson Test Case: DG quadrilaterals
 dune_add_formcompiler_system_test(UFLFILE poisson_dg_quadrilateral.ufl
                                   BASENAME poisson_dg_quadrilateral
                                   INIFILE poisson_dg_quadrilateral.mini
diff --git a/test/poisson/poisson_dg_quadrilateral.mini b/test/poisson/poisson_dg_quadrilateral.mini
index 2cb4d1d2a909290f725f026fbf5e8d062470057e..97c59253e3d1fe2282011779bc4f52637b2edc03 100644
--- a/test/poisson/poisson_dg_quadrilateral.mini
+++ b/test/poisson/poisson_dg_quadrilateral.mini
@@ -1,5 +1,6 @@
 __name = poisson_dg_quadrilateral_{__exec_suffix}
-__exec_suffix = numdiff, symdiff | expand num
+__exec_suffix = {__exec_symdiff}
+__exec_symdiff = numdiff, symdiff |expand num
 
 extension = 1.0 1.0
 cells = 32 32
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 9e7539d2056411c6d1852ce52cef9e7f2b37b0ec..1dc2483f40d0e3a4af5542c2dc2a12dabbce53f4 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -1,17 +1,16 @@
-# 1. Poisson Test Case with order 1
+# Poisson CG 2D
 dune_add_formcompiler_system_test(UFLFILE poisson_2d_order1.ufl
                                   BASENAME sumfact_poisson_2d_order1
                                   INIFILE poisson_2d_order1.mini
                                   )
 
-
-# 1. Poisson Test Case with order 2
 dune_add_formcompiler_system_test(UFLFILE poisson_2d_order2.ufl
                                   BASENAME sumfact_poisson_2d_order2
                                   INIFILE poisson_2d_order2.mini
                                   )
 
-# 3D Test cases
+
+# Poisson CG 3D
 dune_add_formcompiler_system_test(UFLFILE poisson_3d_order1.ufl
                                   BASENAME sumfact_poisson_3d_order1
                                   INIFILE poisson_3d_order1.mini
@@ -28,8 +27,16 @@ dune_add_formcompiler_system_test(UFLFILE opcount_poisson_2d_order2.ufl
                                   INIFILE opcount_poisson_2d_order2.mini
                                   )
 
-# 2. Poisson Test Case: DG, f + pure dirichlet
+
+# Poisson DG
 dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
                                   BASENAME sumfact_poisson_dg
                                   INIFILE poisson_dg.mini
                                   )
+
+
+# Poisson DG using FastDGGridOperator
+dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl
+                                  BASENAME sumfact_poisson_fastdg
+                                  INIFILE poisson_fastdg.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_fastdg.mini b/test/sumfact/poisson/poisson_fastdg.mini
new file mode 100644
index 0000000000000000000000000000000000000000..87e642e977d462e8f5b1577c12e7512187b1e351
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg.mini
@@ -0,0 +1,21 @@
+__name = sumfact_poisson_fastdg_{__exec_suffix}
+__exec_suffix = {quadvec_suffix}_{gradvec_suffix}
+
+quadvec_suffix = quadvec, nonquadvec | expand quadvec
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+numerical_jacobian = 0
+sumfact = 1
+exact_solution_expression = g
+compare_l2errorsquared = 1e-4
+vectorize_quad = 1, 0 | expand quadvec
+vectorize_grads = 1, 0 | expand gradvec
+fastdg = 1