diff --git a/python/dune/perftool/blockstructured/__init__.py b/python/dune/perftool/blockstructured/__init__.py
index 9657e81c3336681285d65f0bd22c18c36155ff84..8849491e8e6c31f33f1a1c5d9a1e247122158137 100644
--- a/python/dune/perftool/blockstructured/__init__.py
+++ b/python/dune/perftool/blockstructured/__init__.py
@@ -3,9 +3,9 @@ import dune.perftool.blockstructured.argument
 import dune.perftool.blockstructured.geometry
 import dune.perftool.blockstructured.spaces
 import dune.perftool.blockstructured.basis
-
+import dune.perftool.blockstructured.transformations
+from dune.perftool.options import get_form_option
 from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position
-# from dune.perftool.pdelab.geometry import to_global
 from dune.perftool.blockstructured.spaces import lfs_inames
 from dune.perftool.blockstructured.basis import (pymbolic_reference_gradient,
                                                  pymbolic_basis)
@@ -14,7 +14,6 @@ from dune.perftool.blockstructured.geometry import (pymbolic_jacobian_inverse_tr
                                                     pymbolic_facet_jacobian_determinant,
                                                     to_global)
 from dune.perftool.blockstructured.tools import sub_element_inames
-
 from dune.perftool.pdelab import PDELabInterface
 
 
@@ -22,13 +21,19 @@ class BlockStructuredInterface(PDELabInterface):
     def __init__(self):
         PDELabInterface.__init__(self)
 
+    def generate_accumulation_instruction(self, expr, visitor):
+        if get_form_option('vectorization_blockstructured'):
+            from dune.perftool.blockstructured.accumulation import generate_accumulation_instruction
+            return generate_accumulation_instruction(expr, visitor)
+        else:
+            return PDELabInterface.generate_accumulation_instruction(self, expr, visitor)
     #
     # Local function space related generator functions
     #
-
     # TODO current way to squeeze subelem iname in, not really ideal
+
     def lfs_inames(self, element, restriction, number=None, context=''):
-        return lfs_inames(element, restriction, number, context) + sub_element_inames()
+        return sub_element_inames() + lfs_inames(element, restriction, number, context)
 
     #
     # Test and trial function related generator functions
diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee52acbb157721717157d1f17a45f0595b783e25
--- /dev/null
+++ b/python/dune/perftool/blockstructured/accumulation.py
@@ -0,0 +1,68 @@
+from dune.perftool.generation import instruction
+from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.options import get_form_option
+from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.pdelab.localoperator import determine_accumulation_space
+from dune.perftool.pdelab.argument import name_accumulation_variable
+from dune.perftool.pdelab.localoperator import boundary_predicates
+from dune.perftool.generation.loopy import function_mangler, globalarg
+import loopy as lp
+import pymbolic.primitives as prim
+
+
+def name_accumulation_alias(container, accumspace):
+    name = container + "_" + accumspace.lfs.name + "_alias"
+    name_tail = container + "_" + accumspace.lfs.name + "_alias_tail"
+    k = get_form_option("number_of_blocks")
+    p = accumspace.element.degree()
+
+    def _add_alias_insn(name):
+        dim = world_dimension()
+        element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
+        index_stride = tuple((p * k + 1)**i for i in range(0, dim))
+        globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
+        code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name)
+        instruction(within_inames=frozenset(),
+                    code=code,
+                    read_variables=frozenset({container}),
+                    assignees=frozenset({name}))
+
+    _add_alias_insn(name)
+    _add_alias_insn(name_tail)
+    return name
+
+
+@function_mangler
+def residual_weight_mangler(knl, func, arg_dtypes):
+    if isinstance(func, str) and func.endswith('.weight'):
+        return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype_floatingpoint()),), ())
+
+
+def generate_accumulation_instruction(expr, visitor):
+    # Collect the lfs and lfs indices for the accumulate call
+    test_lfs = determine_accumulation_space(visitor.test_info, 0)
+    # In the jacobian case, also determine the space for the ansatz space
+    ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1)
+
+    # Collect the lfs and lfs indices for the accumulate call
+    accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+    accumvar_alias = name_accumulation_alias(accumvar, test_lfs)
+
+    predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id)
+
+    quad_inames = visitor.interface.quadrature_inames()
+    lfs_inames = visitor.test_info.inames
+    if visitor.trial_info:
+        lfs_inames = lfs_inames + visitor.trial_info.inames
+
+    assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames))
+
+    expr_with_weight = prim.Product((expr, prim.Call(prim.Variable(accumvar + '.weight'), ())))
+
+    instruction(assignee=assignee,
+                expression=prim.Sum((expr_with_weight, assignee)),
+                forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
+                forced_iname_deps_is_final=True,
+                predicates=predicates,
+                tags=frozenset({'accum'})
+                )
diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py
index 7ca3a94c60989f3b4ffe4d1247c3e7fd45a6f575..ff08627822fbad61bea36d6873377324deb34050 100644
--- a/python/dune/perftool/blockstructured/argument.py
+++ b/python/dune/perftool/blockstructured/argument.py
@@ -1,12 +1,30 @@
 from dune.perftool.generation import (backend,
                                       kernel_cached,
-                                      valuearg)
+                                      valuearg, instruction, globalarg)
+from dune.perftool.options import get_form_option
 from dune.perftool.pdelab.argument import CoefficientAccess
-from dune.perftool.blockstructured.tools import micro_index_to_macro_index
+from dune.perftool.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
+from dune.perftool.pdelab.geometry import world_dimension
 from loopy.types import NumpyType
 import pymbolic.primitives as prim
 
 
+def name_alias(container, lfs, element):
+    name = container + "_" + lfs.name + "_alias"
+    k = get_form_option("number_of_blocks")
+    p = element.degree()
+    dim = world_dimension()
+    element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim))
+    index_stride = tuple((p * k + 1)**i for i in range(0, dim))
+    globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True)
+    code = "const auto {} = &{}({},0);".format(name, container, lfs.name)
+    instruction(within_inames=frozenset(),
+                code=code,
+                read_variables=frozenset({container}),
+                assignees=frozenset({name}))
+    return name
+
+
 # TODO remove the need for element
 @kernel_cached
 @backend(interface="pymbolic_coefficient", name="blockstructured")
@@ -20,4 +38,9 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
 
     # use higher order FEM index instead of Q1 index
-    return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    if get_form_option("vectorization_blockstructured"):
+        subelem_inames = sub_element_inames()
+        coeff_alias = name_alias(container, lfs, element)
+        return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index))
+    else:
+        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
diff --git a/python/dune/perftool/blockstructured/basis.py b/python/dune/perftool/blockstructured/basis.py
index 2748a7228903e85f71980db1488bc68cf597bcc6..98ab1bbd66f99af2451a7de1e74e5fe24719c3be 100644
--- a/python/dune/perftool/blockstructured/basis.py
+++ b/python/dune/perftool/blockstructured/basis.py
@@ -21,6 +21,7 @@ from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position_in_cell
 from dune.perftool.pdelab.spaces import type_leaf_gfs
 from dune.perftool.pdelab.restriction import restricted_name
 from dune.perftool.blockstructured.spaces import lfs_inames
+from dune.perftool.blockstructured.tools import tensor_index_to_sequential_index
 
 from ufl import MixedElement
 
@@ -70,7 +71,8 @@ def name_localbasis(leaf_element):
 
 @kernel_cached
 def evaluate_basis(leaf_element, name, restriction):
-    temporary_variable(name, shape=(leaf_element.degree(), 1), decl_method=declare_cache_temporary(leaf_element, restriction, 'Function'))
+    temporary_variable(name, shape=((leaf_element.degree() + 1)**world_dimension(), 1),
+                       decl_method=declare_cache_temporary(leaf_element, restriction, 'Function'))
     cache = name_localbasis_cache(leaf_element)
     qp = pymbolic_quadrature_position_in_cell(restriction)
     localbasis = name_localbasis(leaf_element)
@@ -86,14 +88,15 @@ def pymbolic_basis(leaf_element, restriction, number, context=''):
     name = "phi_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_basis(leaf_element, name, restriction)
-    iname = lfs_inames(leaf_element, restriction, number, context=context)[0]
+    inames = lfs_inames(leaf_element, restriction, number, context=context)
 
-    return prim.Subscript(prim.Variable(name), (prim.Variable(iname), 0))
+    return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, leaf_element.degree() + 1), 0))
 
 
 @kernel_cached
 def evaluate_reference_gradient(leaf_element, name, restriction):
-    temporary_variable(name, shape=(leaf_element.degree(), 1, world_dimension()), decl_method=declare_cache_temporary(leaf_element, restriction, 'Jacobian'))
+    temporary_variable(name, shape=((leaf_element.degree() + 1)**world_dimension(), 1, world_dimension()),
+                       decl_method=declare_cache_temporary(leaf_element, restriction, 'Jacobian'))
     cache = name_localbasis_cache(leaf_element)
     qp = pymbolic_quadrature_position_in_cell(restriction)
     localbasis = name_localbasis(leaf_element)
@@ -109,6 +112,6 @@ def pymbolic_reference_gradient(leaf_element, restriction, number, context=''):
     name = "js_{}".format(FEM_name_mangling(leaf_element))
     name = restricted_name(name, restriction)
     evaluate_reference_gradient(leaf_element, name, restriction)
-    iname = lfs_inames(leaf_element, restriction, number, context=context)[0]
+    inames = lfs_inames(leaf_element, restriction, number, context=context)
 
-    return prim.Subscript(prim.Variable(name), (prim.Variable(iname), 0))
+    return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, leaf_element.degree() + 1), 0))
diff --git a/python/dune/perftool/blockstructured/spaces.py b/python/dune/perftool/blockstructured/spaces.py
index 3991050bad79d787d21c52a35c83e6f550faf939..1f39a9d2ee51c228f9dc4acb96c7530ad9dea3ce 100644
--- a/python/dune/perftool/blockstructured/spaces.py
+++ b/python/dune/perftool/blockstructured/spaces.py
@@ -14,7 +14,16 @@ def lfs_inames(element, restriction, count=None, context=''):
 
     lfs = name_leaf_lfs(element, restriction)
 
-    name = "micro_{}_{}_index".format(lfs, context)
-    domain(name, pow(element.degree() + 1, world_dimension()))
+    # register transformation
+    # warning: this will register the transformation a couple of times
+    from dune.perftool.generation import transform
+    from dune.perftool. blockstructured.transformations import blockstructured_iname_duplication
+    transform(blockstructured_iname_duplication)
 
-    return (name, )
+    dim_names = ["x", "y", "z"] + [str(i) for i in range(4, world_dimension() + 1)]
+    name = "micro_{}_{}_index_".format(lfs, context)
+    inames = tuple()
+    for i in range(world_dimension()):
+        inames = inames + (name + dim_names[i],)
+        domain(name + dim_names[i], element.degree() + 1)
+    return inames
diff --git a/python/dune/perftool/blockstructured/tools.py b/python/dune/perftool/blockstructured/tools.py
index 029b0f752b6c5f37854550588070f6072a7de977..a9bf01f26f5a52daa91f6cbbcf01bd31dffabb36 100644
--- a/python/dune/perftool/blockstructured/tools.py
+++ b/python/dune/perftool/blockstructured/tools.py
@@ -21,11 +21,11 @@ import pymbolic.primitives as prim
 def sub_element_inames():
     name = "subel"
     dim = local_dimension()
+    dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)]
     inames = tuple()
     for i in range(dim):
-        name_counted = get_counted_variable(name)
-        inames = inames + (name_counted,)
-        domain(name_counted, get_form_option("number_of_blocks"))
+        inames = inames + ("subel_" + dim_names[i],)
+        domain("subel_" + dim_names[i], get_form_option("number_of_blocks"))
     return inames
 
 
@@ -97,16 +97,16 @@ def sub_facet_inames():
 
 # compute sequential index for given tensor index, the same as index in base-k to base-10
 def tensor_index_to_sequential_index(indices, k):
-    return prim.Sum(tuple(index * k ** i for i, index in enumerate(indices)))
+    return prim.Sum(tuple(prim.Variable(index) * k ** i for i, index in enumerate(indices)))
 
 
 # compute tensor index for given sequential index, the same as index in base-10 to base-k
-def to_tensor_index(iname, k):
+def sequential_index_to_tensor_index(iname, k):
     return tuple(prim.Remainder(prim.Variable(iname) / (k**i), k) for i in range(world_dimension()))
 
 
 # compute index for higher order FEM for a given Q1 index
-def micro_index_to_macro_index(element, iname):
+def micro_index_to_macro_index(element, inames):
     it = get_global_context_value("integral_type")
     if it == "cell":
         subelem_inames = sub_element_inames()
@@ -115,7 +115,5 @@ def micro_index_to_macro_index(element, iname):
 
     k = get_form_option("number_of_blocks")
     p = element.degree()
-    modified_index = prim.Sum((tensor_index_to_sequential_index(to_tensor_index(iname, p + 1), p * k + 1),
-                               tensor_index_to_sequential_index(tuple(prim.Variable(iname) * p for iname in subelem_inames), p * k + 1)))
-
-    return modified_index
+    return prim.Sum(tuple((p * prim.Variable(si) + prim.Variable(bi)) * (p * k + 1) ** i
+                          for i, (si, bi) in enumerate(zip(subelem_inames, inames))))
diff --git a/python/dune/perftool/blockstructured/transformations.py b/python/dune/perftool/blockstructured/transformations.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad010e4579e310e03e4a8d8f9a592d380735256c
--- /dev/null
+++ b/python/dune/perftool/blockstructured/transformations.py
@@ -0,0 +1,26 @@
+from loopy import (has_schedulable_iname_nesting,
+                   get_iname_duplication_options,
+                   duplicate_inames,
+                   )
+
+
+def blockstructured_iname_duplication(kernel):
+    # If the given kernel is schedulable, nothing needs to be done.
+    if has_schedulable_iname_nesting(kernel):
+        return kernel
+
+    # group inames with the same base name for duplication
+    duplication_options = dict(get_iname_duplication_options(kernel))
+    suffixes = ['x', 'y', 'z']
+    for iname in duplication_options:
+        if iname[-1] in suffixes:
+            dup_kernel = kernel
+            base = iname[:-1]
+            for s in suffixes:
+                if base + s in duplication_options:
+                    dup_kernel = duplicate_inames(dup_kernel, base + s, duplication_options[base + s])
+                    del duplication_options[base + s]
+            if has_schedulable_iname_nesting(dup_kernel):
+                return dup_kernel
+
+    raise NotImplementedError("Your kernel needs multiple iname duplications! No generic algorithm implemented for that yet! (#39)")
diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
new file mode 100644
index 0000000000000000000000000000000000000000..d711719b5f516b1b35379ee05ad31bc659afa55e
--- /dev/null
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -0,0 +1,320 @@
+import loopy as lp
+import numpy as np
+import pymbolic.primitives as prim
+from dune.perftool.loopy.target import dtype_floatingpoint
+from dune.perftool.loopy.temporary import DuneTemporaryVariable
+from dune.perftool.loopy.symbolic import substitute
+from dune.perftool.loopy.vcl import get_vcl_type_size, VCLPermute, VCLLoad, VCLStore
+from dune.perftool.options import get_form_option
+from dune.perftool.pdelab.argument import PDELabAccumulationFunction
+from dune.perftool.pdelab.geometry import world_dimension
+from dune.perftool.tools import get_pymbolic_indices
+
+
+def add_vcl_temporaries(knl):
+    vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
+
+    # add new temporaries for vectors
+    # hope one read insn doesn't have two different reads from the same temporary
+    new_vec_temporaries = dict()
+    new_insns = []
+    vec_iname = 'write_vec_once'
+    from islpy import BasicSet
+    domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(vec_iname, get_vcl_type_size(dtype_floatingpoint())))
+    for alias in vector_alias:
+        vector_name = alias.replace('alias', 'vec')
+        new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64,
+                                                                 shape=(get_vcl_type_size(np.float64),), managed=True,
+                                                                 scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
+        # write once to the vector such that loopy won't complain
+        new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(vec_iname)),
+                                       expression=0, within_inames=frozenset({vec_iname}),
+                                       id='write_' + vector_name))
+
+    from loopy.kernel.data import VectorizeTag
+    return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [domain],
+                    temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries),
+                    iname_to_tag=dict(**knl.iname_to_tag, **{vec_iname: VectorizeTag()}))
+
+
+def add_vcl_accum_insns(knl, iname_inner, iname_outer):
+    nptype = dtype_floatingpoint()
+    vcl_size = get_vcl_type_size(np.float64)
+
+    from loopy.match import Tagged
+    accum_insns = set(lp.find_instructions(knl, Tagged('accum')))
+
+    new_insns = []
+    vng = knl.get_var_name_generator()
+    idg = knl.get_instruction_id_generator()
+    new_vec_temporaries = dict()
+    for insn in knl.instructions:
+        # somehow CInstructions are not hashable....
+        if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns:
+            # write accum expr as "r = expr + r"
+            expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee))
+
+            inames_micro = set((i for i in insn.within_inames if i.startswith('micro')))
+            iname_ix = next((i for i in inames_micro if i.endswith("_x")))
+
+            # need inames for head and tail handling a priori
+            from loopy.match import Not, All
+            replace_head_inames = dict()
+            replace_tail_inames = dict()
+            for iname in inames_micro - frozenset({iname_ix}):
+                head = iname + '_head'
+                tail = iname + '_tail'
+                replace_head_inames[iname] = prim.Variable(head)
+                replace_tail_inames[iname] = prim.Variable(tail)
+                knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_tail')
+                knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_head')
+            inames_head = frozenset((var.name for var in replace_head_inames.values()))
+            inames_tail = frozenset((var.name for var in replace_tail_inames.values()))
+
+            # erstelle a[iy] und b
+            identifier_left = vng('left_node')
+            identifier_right = vng('right_node')
+            new_vec_temporaries[identifier_left] = DuneTemporaryVariable(identifier_left, dtype=np.float64,
+                                                                         shape=(2,) * (world_dimension() - 1) +
+                                                                               (vcl_size,),
+                                                                         managed=True, scope=lp.temp_var_scope.PRIVATE,
+                                                                         dim_tags=('f',) * (world_dimension() - 1) +
+                                                                                  ('vec',))
+            new_vec_temporaries[identifier_right] = DuneTemporaryVariable(identifier_right, dtype=np.float64,
+                                                                          shape=(vcl_size,), managed=True,
+                                                                          scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
+
+            var_left = prim.Subscript(prim.Variable(identifier_left),
+                                      tuple(prim.Variable(i) for i in sorted(inames_micro - frozenset({iname_ix}))) +
+                                      (prim.Variable(iname_inner),))
+            var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(iname_inner),))
+
+            # init a
+            id_init_a = idg('insn_init_' + identifier_left)
+            new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames),
+                                           expression=0,
+                                           id=id_init_a,
+                                           within_inames=(insn.within_inames - frozenset({iname_outer}) -
+                                                          inames_micro) | inames_head,
+                                           tags=frozenset({'head'})))
+
+            # setze werte für a und b
+            expr_right = substitute(expr_without_r, {iname_ix: 1})
+            expr_left = prim.Sum((substitute(expr_without_r, {iname_ix: 0}), var_left))
+
+            id_set_left = idg('insn_' + identifier_left)
+            id_set_right = idg('insn_' + identifier_right)
+            new_insns.append(lp.Assignment(assignee=var_right,
+                                           expression=expr_right,
+                                           id=id_set_right,
+                                           depends_on=insn.depends_on,
+                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+            new_insns.append(lp.Assignment(assignee=var_left,
+                                           expression=expr_left,
+                                           id=id_set_left,
+                                           depends_on=insn.depends_on | frozenset({id_init_a}),
+                                           within_inames=insn.within_inames - frozenset({iname_ix})))
+
+            # r+=a[iy]
+            id_accum = idg('insn_mod_accum')
+            expr_accum = prim.Sum((var_left,
+                                   prim.Call(VCLPermute(nptype, vcl_size, (-1,) + tuple(range(vcl_size - 1))),
+                                             (var_right,)),
+                                   substitute(insn.assignee, {iname_ix: 0})))
+            new_insns.append(lp.Assignment(assignee=substitute(insn.assignee, {iname_ix: 0}),
+                                           expression=expr_accum,
+                                           id=id_accum,
+                                           depends_on=insn.depends_on | frozenset({id_set_left,
+                                                                                   id_init_a, id_set_right}),
+                                           within_inames=insn.within_inames - frozenset({iname_ix}),
+                                           tags=frozenset({'accum'})))
+            # a[iy] = permute
+            id_permute = idg('insn_permute')
+            expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)),
+                                     (var_right,))
+            new_insns.append(lp.Assignment(assignee=var_left,
+                                           expression=expr_permute,
+                                           id=id_permute,
+                                           depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right,
+                                                                                   id_accum}),
+                                           within_inames=insn.within_inames - frozenset({iname_ix})
+                                           ))
+
+            # tail handling, uses tail alias
+            id_accum_tail = idg('insn_accum_tail')
+            subst_map = {iname_inner: vcl_size - 1, iname_outer: get_form_option("number_of_blocks") // vcl_size - 1,
+                         iname_ix: 1, insn.assignee_name: prim.Variable(insn.assignee_name + '_tail'),
+                         **replace_tail_inames}
+            assignee_tail = substitute(insn.assignee, subst_map)
+            expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail))
+
+            new_insns.append(lp.Assignment(assignee=assignee_tail,
+                                           expression=expr_tail,
+                                           id=id_accum_tail,
+                                           depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}),
+                                           within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) -
+                                                          inames_micro) | inames_tail,
+                                           tags=frozenset({'tail'})))
+        else:
+            new_insns.append(insn)
+
+    return knl.copy(instructions=new_insns,
+                    temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
+
+
+def add_vcl_access(knl, iname_inner):
+    from loopy.match import Reads, Tagged
+    accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum'))))
+    read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias'))))
+
+    from loopy.symbolic import CombineMapper
+    from loopy.symbolic import IdentityMapper
+
+    class AliasIndexCollector(CombineMapper, IdentityMapper):
+        def combine(self, values):
+            return sum(values, tuple())
+
+        def map_constant(self, expr):
+            return tuple()
+
+        map_variable = map_constant
+        map_function_symbol = map_constant
+        map_loopy_function_identifier = map_constant
+
+        def map_subscript(self, expr):
+            if expr.aggregate.name.endswith('alias'):
+                return expr.aggregate, expr.index_tuple
+            else:
+                return tuple()
+
+    # add load instructions if the vector is read, based on the read instruction
+    # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird
+    # muss dafür eigentlich nur den namen des vectors anpassen
+    idg = knl.get_instruction_id_generator()
+    aic = AliasIndexCollector()
+    load_insns = []
+    read_dependencies = dict()
+    for id in read_insns:
+        insn = knl.id_to_insn[id]
+
+        alias, index = aic(insn.expression)
+        name_alias = alias.name
+        name_vec = name_alias.replace('alias', 'vec')
+
+        # compute index without vec iname
+        strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
+        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                               if i != 0 and i.name != iname_inner))
+
+        # add load instruction
+        load_id = idg('insn_' + name_vec + '_load')
+        call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
+                                             id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),))
+        read_dependencies.setdefault(id, set())
+        read_dependencies[id].add(load_id)
+
+    # add store instructions if the vector is written, based on the write instruction
+    store_insns = []
+    for id in accum_insns:
+        insn = knl.id_to_insn[id]
+
+        alias, index = aic(insn.expression)
+        name_alias = alias.name
+        name_vec = name_alias.replace('alias', 'vec')
+
+        # flat index without vec iname
+        strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
+        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                               if i != 0 and i.name != iname_inner))
+
+        # add store instruction
+        store_id = idg('insn_' + name_vec + '_store')
+        call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
+                                              id=store_id, within_inames=insn.within_inames,
+                                              depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id]))
+
+    # replace alias with vcl vector, except for accumulation assignee
+    vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
+    dim = world_dimension()
+    dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)]
+    # remove CInstructions since loopy extract expects to get only assignments
+    knl_without_cinsn = knl.copy(instructions=[insn for insn in knl.instructions
+                                               if not isinstance(insn, lp.CInstruction)])
+    for alias in vector_alias:
+        # Rename lhs which would match the substitution rule since loopy doesn't want substitutions as lhs
+        new_insns = []
+        for insn in knl_without_cinsn.instructions:
+            if isinstance(insn, lp.Assignment) and isinstance(insn.assignee, prim.Subscript):
+                if insn.assignee.aggregate.name == alias:
+                    new_insns.append(insn.copy(assignee=prim.Subscript(prim.Variable('dummy_' + alias),
+                                                                       insn.assignee.index_tuple)))
+                    pass
+                else:
+                    new_insns.append(insn)
+            else:
+                new_insns.append(insn)
+        knl_without_cinsn = knl_without_cinsn.copy(instructions=new_insns)
+
+        # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner]
+        parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \
+                     ',ix,' + ','.join(['i' + d for d in dim_names[1:dim]])
+        knl_without_cinsn = lp.extract_subst(knl_without_cinsn, alias + '_subst', '{}[{}]'.format(alias, parameters),
+                                             parameters=parameters)
+        new_subst = knl_without_cinsn.substitutions.copy()
+        rule = new_subst[alias + '_subst']
+        rule.expression = prim.Subscript(prim.Variable(alias.replace('alias', 'vec')), (prim.Variable('ex_i'),))
+        knl_without_cinsn = knl_without_cinsn.copy(substitutions=new_subst)
+
+    from loopy.match import All
+    knl_without_cinsn = lp.expand_subst(knl_without_cinsn, All())
+    knl = knl_without_cinsn.copy(instructions=knl_without_cinsn.instructions + [insn for insn in knl.instructions
+                                                                                if isinstance(insn, lp.CInstruction)])
+
+    # add store and load dependencies and set right accumulation assignee
+    new_insns = []
+    for insn in knl.instructions:
+        if insn.id not in read_insns | accum_insns:
+            new_insns.append(insn)
+        else:
+            if insn.id in accum_insns:
+                assignee_alias = insn.assignee
+                try:
+                    assignee_vec = next((expr for expr in insn.expression.children
+                                         if isinstance(expr, prim.Subscript) and
+                                         expr.aggregate.name.replace('vec', 'alias') ==
+                                         assignee_alias.aggregate.name.replace('dummy_', '')))
+                except StopIteration:
+                    from dune.perftool.error import PerftoolVectorizationError
+                    raise PerftoolVectorizationError
+                new_insns.append(insn.copy(assignee=assignee_vec,
+                                           depends_on=insn.depends_on | read_dependencies[insn.id]))
+            else:
+                new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id]))
+
+    return knl.copy(instructions=new_insns + load_insns + store_insns)
+
+
+def find_accumulation_inames(knl):
+    inames = set()
+    for insn in knl.instructions:
+        if any((n.startswith('r_') and n.endswith('alias') for n in insn.write_dependency_names())):
+            inames |= insn.within_inames
+
+    inames = set((i for i in inames if i.startswith('micro') and not i.endswith('_x')))
+
+    return inames
+
+
+def vectorize_micro_elements(knl):
+    if "subel_x" in knl.all_inames():
+        vcl_size = get_vcl_type_size(np.float64)
+        knl = lp.split_iname(knl, "subel_x", vcl_size, inner_tag='vec')
+        array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
+        knl = lp.split_array_axis(knl, array_alias, 0, vcl_size)
+
+        knl = add_vcl_temporaries(knl)
+        knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer')
+        knl = add_vcl_access(knl, 'subel_x_inner')
+    return knl
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 22a0c317295ae93976c364bd394022447d7e6d43..bb9bf793b3adf64f3c29fa5883a97762d0438ee2 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -8,7 +8,7 @@ from dune.perftool.loopy.vcl import VCLTypeRegistry
 from dune.perftool.generation import (include_file,
                                       retrieve_cache_functions,
                                       )
-from dune.perftool.options import get_option
+from dune.perftool.options import get_form_option, get_option
 from dune.perftool.tools import round_to_multiple
 
 from loopy.symbolic import Literal
@@ -209,6 +209,25 @@ class DuneASTBuilder(CASTBuilder):
             return ret
 
 
+class BlockstructuredDuneExpressionToCExpressionMapper(DuneExpressionToCExpressionMapper):
+    def map_variable(self, expr, type_context):
+        arg = self.kernel.arg_dict.get(expr.name)
+        # allow the use of pointers
+        if isinstance(arg, DuneGlobalArg) and expr.name.endswith('alias'):
+            return prim.Variable(expr.name)
+        else:
+            return ExpressionToCExpressionMapper.map_variable(self, expr, type_context)
+
+
+class BlockstructuredDuneASTBuilder(DuneASTBuilder):
+    def get_expression_to_c_expression_mapper(self, codegen_state):
+        return BlockstructuredDuneExpressionToCExpressionMapper(codegen_state)
+
+    def add_vector_access(self, access_expr, index):
+        # In the vectorized blockstructured case I need read access to an element of an vector
+        return prim.Subscript(access_expr, index)
+
+
 class DuneTarget(TargetBase):
     def __init__(self, declare_temporaries=True):
         # Set fortran_abi to allow reusing CASTBuilder for the moment
@@ -222,7 +241,10 @@ class DuneTarget(TargetBase):
         return DummyHostASTBuilder(self)
 
     def get_device_ast_builder(self):
-        return DuneASTBuilder(self)
+        if get_form_option("blockstructured"):
+            return BlockstructuredDuneASTBuilder(self)
+        else:
+            return DuneASTBuilder(self)
 
     def dtype_to_typename(self, dtype):
         if dtype.dtype.kind == "V":
diff --git a/python/dune/perftool/loopy/transformations/duplicate.py b/python/dune/perftool/loopy/transformations/duplicate.py
index eecdd9a8a16523a8b9e0422784734c7a58ab2f24..7d5f3d96e8fc6a46ab0a55e3c54ce85e72795bf5 100644
--- a/python/dune/perftool/loopy/transformations/duplicate.py
+++ b/python/dune/perftool/loopy/transformations/duplicate.py
@@ -1,5 +1,4 @@
 """ Iname duplication strategies to make kernels schedulable """
-
 from loopy import (has_schedulable_iname_nesting,
                    get_iname_duplication_options,
                    duplicate_inames,
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index f76f7881cf76e4360d2b6b8c0e691cfcad1488c1..345dec931596c07c8641a03f5c0c5035f86d06b6 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -91,6 +91,21 @@ def vcl_cast_mangler(knl, func, arg_dtypes):
         return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],))
 
 
+class VCLPermute(lp.symbolic.FunctionIdentifier):
+    def __init__(self, nptype, vector_width, permutation):
+        self.nptype = nptype
+        self.vector_width = vector_width
+        self.permutation = permutation
+
+    def __getinitargs__(self):
+        return (self.nptype, self.vector_width, self.permutation)
+
+    @property
+    def name(self):
+        return "permute{}<{}>".format(get_vcl_typename(self.nptype, vector_width=self.vector_width)[-2:],
+                                      ','.join(map(str, self.permutation)))
+
+
 @function_mangler
 def vcl_function_mangler(knl, func, arg_dtypes):
     if func == "mul_add":
@@ -108,3 +123,41 @@ def vcl_function_mangler(knl, func, arg_dtypes):
         vcl = lp.types.NumpyType(get_vcl_type(dtype))
         include_file("dune/perftool/sumfact/horizontaladd.hh", filetag="operatorfile")
         return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,))
+
+    if isinstance(func, VCLPermute):
+        dtype = arg_dtypes[0]
+        vcl = lp.types.NumpyType(get_vcl_type(dtype))
+        return lp.CallMangleInfo(func.name, (vcl,), (vcl,))
+
+
+class VCLLoad(lp.symbolic.FunctionIdentifier):
+    def __init__(self, vec):
+        self.vec = vec
+
+    def __getinitargs__(self):
+        return (self.vec,)
+
+    @property
+    def name(self):
+        return "{}.load".format(self.vec)
+
+
+class VCLStore(lp.symbolic.FunctionIdentifier):
+    def __init__(self, vec):
+        self.vec = vec
+
+    def __getinitargs__(self):
+        return (self.vec,)
+
+    @property
+    def name(self):
+        return "{}.store".format(self.vec)
+
+
+@function_mangler
+def vcl_store_and_load_mangler(knl, func, arg_dtypes):
+    if isinstance(func, VCLLoad):
+        return lp.CallMangleInfo(func.name, (), (lp.types.NumpyType(np.int32),))
+
+    if isinstance(func, VCLStore):
+        return lp.CallMangleInfo(func.name, (), (lp.types.NumpyType(np.int32),))
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 229b989002dd417ddb67d451f6c0de11f2bfc124..1ecc85c5903b69f83953502e8afa485f5a962702 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -92,6 +92,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
     precompute_quadrature_info = PerftoolOption(default=True, helpstr="compute quadrature points and weights in the constructor of the local operator")
     blockstructured = PerftoolOption(default=False, helpstr="Use block structure")
     number_of_blocks = PerftoolOption(default=1, helpstr="Number of sub blocks in one direction")
+    vectorization_blockstructured = PerftoolOption(default=False, helpstr="Vectorize block structuring")
     adjoint = PerftoolOption(default=False, helpstr="Generate adjoint operator")
     control = PerftoolOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable")
     objective_function = PerftoolOption(default=None, helpstr="Name of form representing the objective function in UFL file")
diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py
index 2c724f8e4a716c46475434d4f1949de6ba512632..55c9a7ef7676c7647d485486d73440aed9d22aa2 100644
--- a/python/dune/perftool/pdelab/basis.py
+++ b/python/dune/perftool/pdelab/basis.py
@@ -168,18 +168,17 @@ def evaluate_coefficient(visitor, element, name, container, restriction, index):
     from ufl import FiniteElement
     assert isinstance(sub_element, FiniteElement)
 
-    temporary_variable(name, shape=(),)
+    temporary_variable(name, shape=(), managed=True)
 
     lfs = name_lfs(element, restriction, index)
     basis = visitor.interface.pymbolic_basis(sub_element, restriction, 0, context='trial')
-    basisindex, _ = get_pymbolic_indices(basis)
-
+    basisindex = get_pymbolic_indices(basis)[:-1]
     if get_form_option("blockstructured"):
         from dune.perftool.blockstructured.argument import pymbolic_coefficient
         coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex)
     else:
         from dune.perftool.pdelab.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, basisindex)
+        coeff = pymbolic_coefficient(container, lfs, basisindex[0])
 
     assignee = Variable(name)
 
@@ -200,16 +199,13 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction
     from ufl import FiniteElement
     assert isinstance(sub_element, FiniteElement)
 
-    temporary_variable(name,
-                       shape=(element.cell().geometric_dimension(),),
-                       shape_impl=("arr",),
-                       )
+    temporary_variable(name, shape=(element.cell().geometric_dimension(),), managed=True)
 
     dimindex = component_iname(count=0)
 
     lfs = name_lfs(element, restriction, index)
     basis = visitor.interface.pymbolic_reference_gradient(sub_element, restriction, 0, context='trialgrad')
-    basisindex, _ = get_pymbolic_indices(basis)
+    basisindex = get_pymbolic_indices(basis)[:-1]
     from dune.perftool.tools import maybe_wrap_subscript
     basis = maybe_wrap_subscript(basis, Variable(dimindex))
 
@@ -218,7 +214,7 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction
         coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex)
     else:
         from dune.perftool.pdelab.argument import pymbolic_coefficient
-        coeff = pymbolic_coefficient(container, lfs, basisindex)
+        coeff = pymbolic_coefficient(container, lfs, basisindex[0])
 
     assignee = Subscript(Variable(name), (Variable(dimindex),))
 
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index c89a23cf1f8970213825806804015fc861797ea1..a4bd0201c10f681499b6dbd450b585306ca4ec94 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -262,7 +262,7 @@ def determine_accumulation_space(info, number):
     if get_form_option("blockstructured"):
         from dune.perftool.blockstructured.tools import micro_index_to_macro_index
         from dune.perftool.blockstructured.spaces import lfs_inames
-        lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, info.restriction, count=number)[0])
+        lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, info.restriction, count=number))
     else:
         from dune.perftool.pdelab.spaces import lfs_inames
         lfsi = Variable(lfs_iname(subel, info.restriction, count=number))
@@ -275,6 +275,7 @@ def determine_accumulation_space(info, number):
     return AccumulationSpace(lfs=lfs,
                              index=lfsi,
                              restriction=info.restriction,
+                             element=element
                              )
 
 
@@ -529,7 +530,6 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
                          name=name,
                          lang_version=(2017, 2, 1),
                          )
-
     from loopy import make_reduction_inames_unique
     kernel = make_reduction_inames_unique(kernel)
 
@@ -547,6 +547,9 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
             kernel = vectorize_quadrature_loop(kernel)
         else:
             raise NotImplementedError("Only vectorizing sumfactorized code right now!")
+    if get_form_option("vectorization_blockstructured"):
+        from dune.perftool.blockstructured.vectorization import vectorize_micro_elements
+        kernel = vectorize_micro_elements(kernel)
 
     # Now add the preambles to the kernel
     preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))]
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index 4a95dbced5e24d7efd0ddadc422511a1fc1362cc..d5c0a18ebe8f7e32316aa459c959ab1eb9ba75f8 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -24,13 +24,15 @@ def get_pymbolic_indices(expr):
 
     def ind(i):
         if isinstance(i, int):
-            return i
-        return get_pymbolic_basename(i)
+            return (i,)
+        elif isinstance(i, prim.Sum) or isinstance(i, prim.Product):
+            return sum((ind(c) for c in i.children if isinstance(c, prim.Expression)), ())
+        return (get_pymbolic_basename(i),)
 
     if not isinstance(expr.index, tuple):
         return (ind(expr.index),)
 
-    return tuple(ind(i) for i in expr.index)
+    return sum((ind(i) for i in expr.index), ())
 
 
 def maybe_wrap_subscript(expr, indices):
diff --git a/test/blockstructured/poisson/CMakeLists.txt b/test/blockstructured/poisson/CMakeLists.txt
index 9355f4364cd703d6ed808a25075657aeb254f3cf..721cadc2ef9672d4ae6f8af6ef5479478c76df50 100644
--- a/test/blockstructured/poisson/CMakeLists.txt
+++ b/test/blockstructured/poisson/CMakeLists.txt
@@ -18,4 +18,9 @@ dune_add_formcompiler_system_test(UFLFILE poisson_neumann.ufl
 dune_add_formcompiler_system_test(UFLFILE poisson.ufl
         BASENAME blockstructured_poisson_matrix_free
         INIFILE poisson_matrix_free.mini
+        )
+
+dune_add_formcompiler_system_test(UFLFILE poisson_vec.ufl
+        BASENAME blockstructured_poisson_vec
+        INIFILE poisson_vec.mini
         )
\ No newline at end of file
diff --git a/test/blockstructured/poisson/poisson_vec.mini b/test/blockstructured/poisson/poisson_vec.mini
new file mode 100644
index 0000000000000000000000000000000000000000..8d7e2d9774274243f8c7b976355a34f8487ad581
--- /dev/null
+++ b/test/blockstructured/poisson/poisson_vec.mini
@@ -0,0 +1,18 @@
+__name = blockstructured_poisson_vec
+
+cells = 5 5
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+reference = poisson_ref
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-7
+
+[formcompiler.r]
+numerical_jacobian = 1
+blockstructured = 1
+number_of_blocks = 16
+vectorization_blockstructured = 1
\ No newline at end of file
diff --git a/test/blockstructured/poisson/poisson_vec.ufl b/test/blockstructured/poisson/poisson_vec.ufl
new file mode 100644
index 0000000000000000000000000000000000000000..a0ed7f33138b8689a78cd193659439a0691a3d8c
--- /dev/null
+++ b/test/blockstructured/poisson/poisson_vec.ufl
@@ -0,0 +1,14 @@
+cell = quadrilateral
+x = SpatialCoordinate(cell)
+
+g = x[0]**2 + x[1]**2
+f = -4
+
+V = FiniteElement("CG", cell, 1)
+u = TrialFunction(V)
+v = TestFunction(V)
+
+r =(inner(grad(u), grad(v)) - f*v)*dx
+interpolate_expression = g
+exact_solution = g
+is_dirichlet = 1