diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py
new file mode 100644
index 0000000000000000000000000000000000000000..05b2c8b61e56b0edc96a14640ad2c7e8512c7066
--- /dev/null
+++ b/python/dune/perftool/blockstructured/accumulation.py
@@ -0,0 +1,47 @@
+from dune.perftool.generation import temporary_variable, instruction
+from dune.perftool.options import get_option
+from dune.perftool.pdelab.localoperator import determine_accumulation_space
+from dune.perftool.pdelab.argument import name_accumulation_variable
+from dune.perftool.pdelab.localoperator import boundary_predicates
+import pymbolic.primitives as prim
+
+
+def name_accumulation_alias(container, accumspace):
+    name = container+"_"+accumspace.lfs.name+"_alias"
+    k = get_option("number_of_blocks")
+    p = accumspace.element.degree()
+    temporary_variable(name, shape=(k + 1, k + 1, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1),
+                       base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
+    code = "{} = &{}.container()({},0);".format(container + "_"+accumspace.lfs.name+"_alias", container, accumspace.lfs.name)
+    instruction(within_inames=frozenset(),
+                code=code,
+                read_variables=frozenset({container}),
+                assignees=frozenset({name}))
+    return name
+
+
+def generate_accumulation_instruction(expr, visitor):
+    # Collect the lfs and lfs indices for the accumulate call
+    test_lfs = determine_accumulation_space(visitor.test_info, 0)
+    # In the jacobian case, also determine the space for the ansatz space
+    ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1)
+
+    # Collect the lfs and lfs indices for the accumulate call
+    accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
+    accumvar_alias = name_accumulation_alias(accumvar, test_lfs)
+
+    predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id)
+
+    quad_inames = visitor.interface.quadrature_inames()
+    lfs_inames = visitor.test_info.inames
+    if visitor.trial_info:
+        lfs_inames = lfs_inames + visitor.trial_info.inames
+
+    assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames))
+
+    instruction(assignee=assignee,
+                expression=prim.Sum((expr, assignee)),
+                forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
+                forced_iname_deps_is_final=True,
+                predicates=predicates
+                )
diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py
index 1d7312b3748cb8d8963b3fe057e50e763e52c687..1d4fe319408230aa317ed0c23e9e3073e8870f4d 100644
--- a/python/dune/perftool/blockstructured/argument.py
+++ b/python/dune/perftool/blockstructured/argument.py
@@ -9,6 +9,20 @@ import pymbolic.primitives as prim
 import loopy as lp
 
 
+def name_alias(container, lfs, element):
+    name = container+"_"+lfs.name+"_alias"
+    k = get_option("number_of_blocks")
+    p = element.degree()
+    temporary_variable(name, shape=(k + 1, k + 1, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1),
+                       base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
+    code = "{} = &{}({},0);".format(name,container, lfs.name)
+    instruction(within_inames=frozenset(),
+                code=code,
+                read_variables=frozenset({container}),
+                assignees=frozenset({name}))
+    return name
+
+
 # TODO remove the need for element
 @kernel_cached
 @backend(interface="pymbolic_coefficient", name="blockstructured")
@@ -22,4 +36,9 @@ def pymbolic_coefficient(container, lfs, element, index):
         lfs = prim.Variable(lfs)
 
     # use higher order FEM index instead of Q1 index
-    return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
+    if get_option("vectorization_blockstructured"):
+        subelem_inames = sub_element_inames()
+        coeff_alias = name_alias(container, lfs, element)
+        return prim.Subscript(lp.TaggedVariable(coeff_alias, 'coeff_alias'), tuple(prim.Variable(i) for i in subelem_inames+index))
+    else:
+        return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
index cef42b962b8a1c7876ccc7ae2914796f0acdbc28..0e15869eb3572b2a65aa746c38eb2a92fbc4d27c 100644
--- a/python/dune/perftool/blockstructured/vectorization.py
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -17,32 +17,28 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 
     from loopy.symbolic import WalkMapper
     class FindAlias(WalkMapper):
-        def __init__(self, id, expr):
+        def __init__(self, id, expr, vecs, insn_to_vec):
             self.id = id
+            self.vecs = vecs
+            self.insn_to_vec = insn_to_vec
             self.rec(expr)
 
         def post_visit(self,expr):
-            if isinstance(expr, prim.Call):
-                func = expr.function
-                param = expr.parameters
-                if (isinstance(func, CoefficientAccess) or isinstance(func, PDELabAccumulationFunction)) \
-                        and iname_inner in get_pymbolic_indices(prim.Subscript((), (param[1],))):
-                    lfs = param[0].name
-                    vec_name = (func.container if isinstance(func, CoefficientAccess) else func.accumobj) + "_" + lfs + "_vec"
-                    read_vecs.add(vec_name)
-                    read_insn_to_vec_instance.setdefault(self.id, set())
-                    read_insn_to_vec_instance[self.id].add((vec_name, expr))
-
-                    if isinstance(func, PDELabAccumulationFunction):
-                        write_vecs.add(vec_name)
-                        write_insn_to_vec_instance[self.id] = (vec_name, expr)
-
+            if isinstance(expr, prim.Subscript) \
+                    and isinstance(expr.aggregate, prim.Variable) \
+                    and expr.aggregate.name.endswith('alias'):
+                vec_name = expr.aggregate.name.replace('alias', 'vec')
+                self.vecs.add(vec_name)
+                self.insn_to_vec.setdefault(self.id, set())
+                self.insn_to_vec[self.id].add((vec_name,expr))
             return
     # find all instances where an alias is read from or written to
     # save the name of the corresponding vector and the alias subscript expression
     for insn in knl.instructions:
         if isinstance(insn, lp.MultiAssignmentBase):
-            FindAlias(insn.id, insn.expression)
+            FindAlias(insn.id, insn.expression, read_vecs, read_insn_to_vec_instance)
+            if isinstance(insn, lp.Assignment):
+                FindAlias(insn.id, insn.assignee, write_vecs, write_insn_to_vec_instance)
 
     read_insns = [knl.id_to_insn[id] for id in read_insn_to_vec_instance]
     write_insns = [knl.id_to_insn[id] for id in write_insn_to_vec_instance]
@@ -53,16 +49,16 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
         new_vec_temporaries[vec] = DuneTemporaryVariable(vec, dtype=np.float64, shape=(4,), managed=True,
                                                          scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
 
-    # write accum expr as "expr + r"
     modified_accum_insn = []
     replace_accum_insn = dict()
     vng = knl.get_var_name_generator()
     idg = knl.get_instruction_id_generator()
     for insn in write_insns:
-        if isinstance(insn, lp.CallInstruction) and isinstance(insn.expression.function, PDELabAccumulationFunction):
-            vec_name, expr = write_insn_to_vec_instance[insn.id]
-            expr_accum = insn.expression.parameters[-1]
-
+        if isinstance(insn, lp.Assignment):
+            # write accum expr as "r = expr + r"
+            expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee))
+            if expr_without_r == insn.expression:
+                continue
             # finde micro inames
             iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x")))
             iname_iy = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_y")))
@@ -80,7 +76,7 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 
             # init a
             id_init_a = idg('insn_init_'+identifier_a)
-            modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a, {iname_iy: prim.Variable(iname_iy+'_head')}),
+            modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_iy+'_head')}),
                                                      expression=0,
                                                      id=id_init_a,
                                                      within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy,
@@ -90,8 +86,8 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
                                        )
 
             # setze werte für a und b
-            expr_b = substitute(expr_accum, {iname_ix: 1})
-            expr_a = prim.Sum((substitute(expr_accum, {iname_ix: 0}), var_a))
+            expr_b = substitute(expr_without_r, {iname_ix:1})
+            expr_a = prim.Sum((substitute(expr_without_r, {iname_ix:0}), var_a))
 
             id_set_a = idg('insn_'+identifier_a)
             id_set_b = idg('insn_'+identifier_b)
@@ -112,13 +108,12 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 
             # r+=a[iy]
             id_accum = idg('insn_mod_accum')
-            r_vec = prim.Subscript(prim.Variable(vec_name),(prim.Variable(iname_inner),))
-            expr_accum_mod = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)), r_vec))
-            replace_accum_insn[insn.id] = lp.Assignment(assignee=r_vec,
-                                                        expression=expr_accum_mod,
+            expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)),
+                                   substitute(insn.assignee, {iname_ix:0})))
+            replace_accum_insn[insn.id] = lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}),
+                                                        expression=expr_accum,
                                                         id=id_accum,
-                                                        depends_on=insn.depends_on|frozenset({id_set_a, id_set_b,
-                                                                                              id_init_a}),
+                                                        depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}),
                                                         within_inames=insn.within_inames-frozenset({iname_ix})
                                                         )
             # a[iy] = permute
@@ -133,21 +128,35 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
                                                      )
                                        )
 
-            # tail handling
+            # tail handling               | frozenset({iname_iy+'_tail'})))
             id_accum_tail = idg('insn_accum_tail')
             subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")/4, iname_iy: prim.Variable(iname_iy+'_tail'),
                          iname_ix: 0}
-            expr_tail = prim.Call(expr.function, tuple(substitute(p, subst_map) for p in expr.parameters[:-1])
-                                  + (prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)),))
-
-            modified_accum_insn.append(lp.CallInstruction(assignees=(),
-                                                          expression=expr_tail,
-                                                          id=id_accum_tail,
-                                                          depends_on=frozenset({replace_accum_insn[insn.id].id,
-                                                                                id_permute, id_set_a, id_init_a}),
-                                                          within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer,
-                                                                                                         iname_ix, iname_iy}))
-                                                                        | frozenset({iname_iy+'_tail'})))
+            assignee_tail = substitute(insn.assignee, subst_map)
+            expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)),
+                                  assignee_tail))
+            modified_accum_insn.append(lp.Assignment(assignee=assignee_tail,
+                                                     expression=expr_tail,
+                                                     id=id_accum_tail,
+                                                     depends_on=frozenset({replace_accum_insn[insn.id].id,
+                                                                           id_permute, id_set_a, id_init_a}),
+                                                     within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer,
+                                                                                                    iname_ix, iname_iy}))
+                                                                   | frozenset({iname_iy+'_tail'})))
+
+    # update found instances with respect to modified accumulation instruction
+    def update_insn_to_vec_instance(insns, expr, insn_to_vec_instance):
+        new_insns = []
+        for insn in insns:
+            if insn.id in replace_accum_insn:
+                new_insns.append(replace_accum_insn[insn.id])
+                insn_to_vec_instance.pop(insn.id)
+                FindAlias(new_insns[-1].id, expr(new_insns[-1]), set(), insn_to_vec_instance)
+            else:
+                new_insns.append(insn)
+        return new_insns
+    read_insns = update_insn_to_vec_instance(read_insns, lambda i: i.expression, read_insn_to_vec_instance)
+    write_insns = update_insn_to_vec_instance(write_insns, lambda i: i.assignee, write_insn_to_vec_instance)
 
     # add load instructions if the vector is read, based on the read instruction
     # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird
@@ -156,79 +165,59 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
     read_dependencies = dict()
     for insn in read_insns:
         for vec, expr in read_insn_to_vec_instance[insn.id]:
-            # different code for accumulation variable
-            if isinstance(expr.function, PDELabAccumulationFunction):
-                iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x")))
-                index = substitute(expr.parameters[1], {iname_inner:0, iname_ix: 0})
-                code = "{}.load(&{}.container()({}, {}));".format(vec, expr.function.accumobj, expr.parameters[0], index)
-                within_inames = insn.within_inames-frozenset({iname_ix})
-            else:
-                # flat index without vec iname
-                # add load instruction
-                index = substitute(expr.parameters[1], {iname_inner:0})
-                code = "{}.load(&{}({}, {}));".format(vec, expr.function.name, expr.parameters[0], index)
-                within_inames = insn.within_inames|insn.reduction_inames()
+            alias = vec.replace('vec', 'alias')
+
+            # flat index without vec iname
+            strides = tuple(tag.stride for tag in knl.temporary_variables[alias].dim_tags)
+            index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner:0}).index_tuple, strides)))
 
-            # load_id = knl.make_unique_instruction_id(insns=knl.instructions+load_insns, based_on='insn_'+vec+'_load')
+            # add load instruction
             load_id = idg('insn_'+vec+'_load')
-            load_insns.append(lp.CInstruction(iname_exprs=[], code=code,
-                                              within_inames=within_inames,
-                                              #assignees=(lp.Variable(vec), ), # sonst denkt looy das müsste ein array von Vec4d sein...
-                                              id=load_id))
+            call_load = prim.Call(prim.Variable(vec+'.load'), (prim.Sum((prim.Variable(alias), index)),))
+            load_insns.append(lp.CallInstruction(assignees=(),expression=call_load,
+                                                 id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),))
             read_dependencies.setdefault(insn.id, set())
             read_dependencies[insn.id].add(load_id)
 
     # add store instructions if the vector is written, based on the write instruction
     store_insns = []
     for insn in write_insns:
-        vec, expr = write_insn_to_vec_instance[insn.id]
-
-        # flat index without vec iname
-        iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x")))
-        index = substitute(expr.parameters[1], {iname_inner:0, iname_ix: 0})
-
-        # add store instruction
-        code = "{}.store(&{}.container()({}, {}));".format(vec, expr.function.accumobj, expr.parameters[0], index)
-        # store_id = knl.make_unique_instruction_id(insns=knl.instructions+load_insns, based_on='insn_'+vec+'_store')
-        store_id = idg('insn_'+vec+'_store')
-        store_insns.append(lp.CInstruction(iname_exprs=[], code=code,
-                                           within_inames=insn.within_inames-frozenset({iname_ix}),
-                                           depends_on=insn.depends_on
-                                                      | frozenset({replace_accum_insn[insn.id].id})
-                                                      | read_dependencies[insn.id],
-                                           id=store_id))
-
+        for vec, expr in write_insn_to_vec_instance[insn.id]:
+            alias = vec.replace('vec', 'alias')
+
+            # flat index without vec iname
+            strides = tuple(tag.stride for tag in knl.temporary_variables[alias].dim_tags)
+            index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner: 0, iname_ix: 0}).index_tuple, strides)))
+
+            # add store instruction
+            store_id = idg('insn_'+vec+'_store')
+            call_store = prim.Call(prim.Variable(vec+'.store'), (prim.Sum((prim.Variable(alias), index)),))
+            store_insns.append(lp.CallInstruction(assignees=(),expression=call_store,
+                                                  id=store_id,within_inames=insn.within_inames,
+                                                  depends_on=insn.depends_on
+                                                             | frozenset({insn.id})
+                                                             | read_dependencies[insn.id],
+                                                  ))
+
+    # exchange alias for vector
     new_insns = []
     for insn in knl.instructions:
+        insn = replace_accum_insn.get(insn.id, insn)
         if insn.id not in read_insn_to_vec_instance.keys() | write_insn_to_vec_instance.keys():
             new_insns.append(insn)
         else:
-            if insn.id in replace_accum_insn:
-                new_insn = replace_accum_insn[insn.id].copy(depends_on=replace_accum_insn[insn.id].depends_on
-                                                                       | read_dependencies[insn.id])
-            else:
-                subst_map = dict()
-                for vec, expr in read_insn_to_vec_instance[insn.id]:
-                    subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),))
-
-                class NodeSubstitutor(SubstitutionMapper):
-                    def __init__(self):
-                        from pymbolic.mapper.substitutor import make_subst_func
-                        self.subst_func = make_subst_func(subst_map)
-
-                    def map_call(self, expr, *args, **kwargs):
-                        result = self.subst_func(expr)
-                        if result is not None:
-                            return result
-                        else:
-                            return SubstitutionMapper.map_call(self, expr, args, kwargs)
-
-                new_insn = insn
-                if insn in read_insns:
-                    new_insn = new_insn.copy(expression=NodeSubstitutor()(new_insn.expression),
-                                             depends_on=new_insn.depends_on | read_dependencies[insn.id])
-                if insn in write_insns:
-                    new_insn = new_insn.copy(assignee=NodeSubstitutor()(new_insn.assignee))
+            subst_map = dict()
+            for vec, expr in read_insn_to_vec_instance[insn.id]:
+                subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),))
+
+            new_insn = insn
+
+            if insn in read_insns:
+                new_insn = new_insn.copy(expression=substitute(new_insn.expression, subst_map),
+                                         depends_on=new_insn.depends_on | read_dependencies[insn.id])
+            if insn in write_insns:
+                new_insn = new_insn.copy(assignee=substitute(new_insn.assignee, subst_map))
+
             new_insns.append(new_insn)
 
     from loopy.kernel.creation import resolve_dependencies
@@ -239,7 +228,7 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 def find_accumulation_inames(knl):
     inames = set()
     for insn in knl.instructions:
-        if isinstance(insn, lp.CallInstruction) and isinstance(insn.expression.function, PDELabAccumulationFunction):
+        if any((n.startswith('r_') and n.endswith('alias') for n in insn.write_dependency_names())):
             inames |= insn.within_inames
 
     inames = set((i for i in inames if i.startswith('micro') and not i.endswith('_x')))
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index c681ec6fdb5538c0f480b1d30058c4900532254f..ec2937196b8c0cddac70d98b3ec4f99f3aab67c9 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -63,6 +63,14 @@ def type_floatingpoint():
 
 
 class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper):
+    def map_variable(self, expr, type_context):
+        tv = self.kernel.temporary_variables.get(expr.name)
+        if isinstance(tv, DuneTemporaryVariable) and tv.base_storage:
+            return prim.Variable(expr.name)
+        else:
+            return ExpressionToCExpressionMapper.map_variable(self, expr, type_context)
+
+
     def map_subscript(self, expr, type_context):
         arr = self.find_array(expr)
         if isinstance(arr, (DuneTemporaryVariable, DuneGlobalArg)) and not arr.managed: