diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py
index b4b57f97b5a379585a00eecb421c8d84a0828a7f..ff5a4ae36f7bf279ac56d582d6408163e30c35a1 100644
--- a/python/dune/perftool/blockstructured/accumulation.py
+++ b/python/dune/perftool/blockstructured/accumulation.py
@@ -11,14 +11,20 @@ import pymbolic.primitives as prim
 
 def name_accumulation_alias(container, accumspace):
     name = container+"_"+accumspace.lfs.name+"_alias"
+    name_tail = container+"_"+accumspace.lfs.name+"_alias_tail"
     k = get_option("number_of_blocks")
     p = accumspace.element.degree()
-    globalarg(name, shape=(k, k, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), managed=True)
-    code = "auto {} = &{}.container()({},0);".format(container + "_"+accumspace.lfs.name+"_alias", container, accumspace.lfs.name)
-    instruction(within_inames=frozenset(),
-                code=code,
-                read_variables=frozenset({container}),
-                assignees=frozenset({name}))
+
+    def _add_alias_insn(name):
+        globalarg(name, shape=(k, k, p + 1, p + 1), strides=(p, p * k + 1, 1, k + 1), managed=True)
+        code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name)
+        instruction(within_inames=frozenset(),
+                    code=code,
+                    read_variables=frozenset({container}),
+                    assignees=frozenset({name}))
+
+    _add_alias_insn(name)
+    _add_alias_insn(name_tail)
     return name
 
 
@@ -53,5 +59,6 @@ def generate_accumulation_instruction(expr, visitor):
                 expression=prim.Sum((expr_with_weight, assignee)),
                 forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
                 forced_iname_deps_is_final=True,
-                predicates=predicates
+                predicates=predicates,
+                tags=frozenset({'accum'})
                 )
diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
index af64fab9caa8e6e1038f4f9ecc852ecb4e3b2ca2..00c3b68486c19f37cd80b50927e741e71751f4fd 100644
--- a/python/dune/perftool/blockstructured/vectorization.py
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -4,60 +4,38 @@ import pymbolic.primitives as prim
 from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.symbolic import substitute
 from dune.perftool.options import get_option
+from dune.perftool.pdelab.argument import PDELabAccumulationFunction
+from dune.perftool.tools import get_pymbolic_indices
 
 
-def add_vcl_vector(knl, iname_inner, iname_outer):
-    read_insn_to_vec_instance = dict()
-    write_insn_to_vec_instance = dict()
-    read_vecs = set()
-    write_vecs = set()
-
-    from loopy.symbolic import WalkMapper
-    class FindAlias(WalkMapper):
-        def __init__(self, id, expr, vecs, insn_to_vec):
-            self.id = id
-            self.vecs = vecs
-            self.insn_to_vec = insn_to_vec
-            self.rec(expr)
-
-        def visit(self, expr, **kwargs):
-            if isinstance(expr, prim.Subscript) \
-                    and expr.aggregate.name.endswith('alias'):
-                vec_name = expr.aggregate.name.replace('alias', 'vec')
-                self.vecs.add(vec_name)
-                self.insn_to_vec.setdefault(self.id, set())
-                self.insn_to_vec[self.id].add((vec_name,expr))
-                return False
-            else:
-                return True
-    # find all instances where an alias is read from or written to
-    # save the name of the corresponding vector and the alias subscript expression
-    for insn in knl.instructions:
-        if isinstance(insn, lp.MultiAssignmentBase):
-            FindAlias(insn.id, insn.expression, read_vecs, read_insn_to_vec_instance)
-            if isinstance(insn, lp.Assignment):
-                FindAlias(insn.id, insn.assignee, write_vecs, write_insn_to_vec_instance)
-
-    read_insns = [knl.id_to_insn[id] for id in read_insn_to_vec_instance]
-    write_insns = [knl.id_to_insn[id] for id in write_insn_to_vec_instance]
+def add_vcl_temporaries(knl):
+    vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
 
     # add new temporaries for vectors
+    # hope one read insn doesn't have two different reads from the same temporary
     new_vec_temporaries = dict()
-    for vec in read_vecs|write_vecs:
-        new_vec_temporaries[vec] = DuneTemporaryVariable(vec, dtype=np.float64, shape=(4,), managed=True,
-                                                         scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
+    for alias in vector_alias:
+        vector_name = alias.replace('alias','vec')
+        new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(4,), managed=True,
+                                                                 scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
+
+    return knl.copy(temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
 
-    modified_accum_insn = []
-    replace_accum_insn = dict()
+
+def add_vcl_accum_insns(knl, iname_inner, iname_outer, iname_head, iname_tail):
+    from loopy.match import Tagged
+    accum_insns = set(lp.find_instructions(knl, Tagged('accum')))
+
+    new_insns = []
     vng = knl.get_var_name_generator()
     idg = knl.get_instruction_id_generator()
-    for insn in write_insns:
-        if isinstance(insn, lp.Assignment):
+    new_vec_temporaries = dict()
+    for insn in knl.instructions:
+        # somehow CInstructions are not hashable....
+        if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns:
             # write accum expr as "r = expr + r"
             expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee))
-            if expr_without_r == insn.expression:
-                continue
-            # finde micro inames
+
             iname_ix = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_x")))
             iname_iy = next((i for i in insn.within_inames if i.startswith('micro') and i.endswith("_y")))
 
@@ -74,14 +52,15 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 
             # init a
             id_init_a = idg('insn_init_'+identifier_a)
-            modified_accum_insn.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_iy+'_head')}),
-                                                     expression=0,
-                                                     id=id_init_a,
-                                                     within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy,
-                                                                                                  iname_outer}))
-                                                                   |frozenset({iname_iy+'_head'}),
-                                                     )
-                                       )
+            new_insns.append(lp.Assignment(assignee=substitute(var_a,{iname_iy:prim.Variable(iname_head)}),
+                                           expression=0,
+                                           id=id_init_a,
+                                           within_inames=(insn.within_inames-frozenset({iname_ix, iname_iy,
+                                                                                        iname_outer}))
+                                                         | frozenset({iname_head}),
+                                           tags=frozenset({'head'})
+                                           )
+                             )
 
             # setze werte für a und b
             expr_b = substitute(expr_without_r, {iname_ix:1})
@@ -89,138 +68,170 @@ def add_vcl_vector(knl, iname_inner, iname_outer):
 
             id_set_a = idg('insn_'+identifier_a)
             id_set_b = idg('insn_'+identifier_b)
-            modified_accum_insn.append(lp.Assignment(assignee=var_b,
-                                                     expression=expr_b,
-                                                     id=id_set_b,
-                                                     depends_on=insn.depends_on,
-                                                     within_inames=insn.within_inames-frozenset({iname_ix}),
-                                                     )
-                                       )
-            modified_accum_insn.append(lp.Assignment(assignee=var_a,
-                                                     expression=expr_a,
-                                                     id=id_set_a,
-                                                     depends_on=insn.depends_on|frozenset({id_init_a}),
-                                                     within_inames=insn.within_inames-frozenset({iname_ix}),
-                                                     )
-                                       )
+            new_insns.append(lp.Assignment(assignee=var_b,
+                                           expression=expr_b,
+                                           id=id_set_b,
+                                           depends_on=insn.depends_on,
+                                           within_inames=insn.within_inames-frozenset({iname_ix}),
+                                           )
+                             )
+            new_insns.append(lp.Assignment(assignee=var_a,
+                                           expression=expr_a,
+                                           id=id_set_a,
+                                           depends_on=insn.depends_on|frozenset({id_init_a}),
+                                           within_inames=insn.within_inames-frozenset({iname_ix}),
+                                           )
+                             )
 
             # r+=a[iy]
             id_accum = idg('insn_mod_accum')
             expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)),
                                    substitute(insn.assignee, {iname_ix:0})))
-            replace_accum_insn[insn.id] = lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}),
-                                                        expression=expr_accum,
-                                                        id=id_accum,
-                                                        depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}),
-                                                        within_inames=insn.within_inames-frozenset({iname_ix})
-                                                        )
+            new_insns.append(lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}),
+                                           expression=expr_accum,
+                                           id=id_accum,
+                                           depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b}),
+                                           within_inames=insn.within_inames-frozenset({iname_ix}),
+                                           tags=frozenset({'accum'})
+                                           )
+                             )
             # a[iy] = permute
             id_permute = idg('insn_permute')
             expr_permute = prim.Call(prim.Variable('permute4d<3,-1,-1,-1>'), (var_b,))
-            modified_accum_insn.append(lp.Assignment(assignee=var_a,
-                                                     expression=expr_permute,
-                                                     id=id_permute,
-                                                     depends_on=replace_accum_insn[insn.id].depends_on
-                                                                |frozenset({replace_accum_insn[insn.id].id}),
-                                                     within_inames=insn.within_inames-frozenset({iname_ix})
-                                                     )
-                                       )
-
-            # tail handling               | frozenset({iname_iy+'_tail'})))
+            new_insns.append(lp.Assignment(assignee=var_a,
+                                           expression=expr_permute,
+                                           id=id_permute,
+                                           depends_on=insn.depends_on|frozenset({id_set_a,id_init_a,id_set_b,
+                                                                                 id_accum}),
+                                           within_inames=insn.within_inames-frozenset({iname_ix})
+                                           )
+                             )
+
+            # tail handling, uses tail alias
             id_accum_tail = idg('insn_accum_tail')
-            subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")/4, iname_iy: prim.Variable(iname_iy+'_tail'),
-                         iname_ix: 0}
+            subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//4, iname_iy: prim.Variable(iname_tail),
+                         iname_ix: 0, insn.assignee_name: prim.Variable(insn.assignee_name+'_tail')}
             assignee_tail = substitute(insn.assignee, subst_map)
-            expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_iy+'_tail'), 0)),
+            expr_tail = prim.Sum((prim.Subscript(prim.Variable(identifier_a), (prim.Variable(iname_tail), 0)),
                                   assignee_tail))
-            modified_accum_insn.append(lp.Assignment(assignee=assignee_tail,
-                                                     expression=expr_tail,
-                                                     id=id_accum_tail,
-                                                     depends_on=frozenset({replace_accum_insn[insn.id].id,
-                                                                           id_permute, id_set_a, id_init_a}),
-                                                     within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer,
-                                                                                                    iname_ix, iname_iy}))
-                                                                   | frozenset({iname_iy+'_tail'})))
-
-    # update found instances with respect to modified accumulation instruction
-    def update_insn_to_vec_instance(insns, expr, insn_to_vec_instance):
-        new_insns = []
-        for insn in insns:
-            if insn.id in replace_accum_insn:
-                new_insns.append(replace_accum_insn[insn.id])
-                insn_to_vec_instance.pop(insn.id)
-                FindAlias(new_insns[-1].id, expr(new_insns[-1]), set(), insn_to_vec_instance)
+
+            new_insns.append(lp.Assignment(assignee=assignee_tail,
+                                           expression=expr_tail,
+                                           id=id_accum_tail,
+                                           depends_on=frozenset({id_accum,id_permute, id_set_a, id_init_a}),
+                                           within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer,
+                                                                                          iname_ix, iname_iy}))
+                                                         | frozenset({iname_tail}),
+                                           tags=frozenset({'tail'})))
+        else:
+            new_insns.append(insn)
+
+    return knl.copy(instructions=new_insns,
+                    temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
+
+
+def add_vcl_access(knl, iname_inner):
+    from loopy.match import Reads, Tagged
+    accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum'))))
+    read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias'))))
+
+    from loopy.symbolic import CombineMapper
+    from loopy.symbolic import IdentityMapper
+    class AliasIndexCollector(CombineMapper, IdentityMapper):
+        def combine(self, values):
+            return sum(values, tuple())
+
+        def map_constant(self, expr):
+            return tuple()
+
+        map_variable = map_constant
+        map_function_symbol = map_constant
+
+        def map_subscript(self, expr):
+            if expr.aggregate.name.endswith('alias'):
+                return expr.aggregate, expr.index_tuple
             else:
-                new_insns.append(insn)
-        return new_insns
-    read_insns = update_insn_to_vec_instance(read_insns, lambda i: i.expression, read_insn_to_vec_instance)
-    write_insns = update_insn_to_vec_instance(write_insns, lambda i: i.assignee, write_insn_to_vec_instance)
+                return tuple()
 
     # add load instructions if the vector is read, based on the read instruction
     # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird
     # muss dafür eigentlich nur den namen des vectors anpassen
+    idg = knl.get_instruction_id_generator()
+    aic = AliasIndexCollector()
     load_insns = []
     read_dependencies = dict()
-    for insn in read_insns:
-        for vec, expr in read_insn_to_vec_instance[insn.id]:
-            alias = vec.replace('vec', 'alias')
-
-            # flat index without vec iname
-            strides = tuple(tag.stride for tag in knl.arg_dict[alias].dim_tags)
-            index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner:0}).index_tuple, strides)))
-
-            # add load instruction
-            load_id = idg('insn_'+vec+'_load')
-            call_load = prim.Call(prim.Variable(vec+'.load'), (prim.Sum((prim.Variable(alias), index)),))
-            load_insns.append(lp.CallInstruction(assignees=(),expression=call_load,
-                                                 id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),))
-            read_dependencies.setdefault(insn.id, set())
-            read_dependencies[insn.id].add(load_id)
+    for id in read_insns:
+        insn = knl.id_to_insn[id]
+
+        alias, index = aic(insn.expression)
+        name_alias = alias.name
+        name_vec = name_alias.replace('alias', 'vec')
+
+        # compute index without vec iname
+        strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
+        index = prim.Sum(tuple(prim.Product((i,s)) for i,s in zip(index, strides) if i!=0 and i.name != iname_inner))
+
+        # add load instruction
+        load_id = idg('insn_'+name_vec+'_load')
+        call_load = prim.Call(prim.Variable(name_vec+'.load'), (prim.Sum((prim.Variable(name_alias), index)),))
+        load_insns.append(lp.CallInstruction(assignees=(),expression=call_load,
+                                             id=load_id,within_inames=insn.within_inames|insn.reduction_inames(),))
+        read_dependencies.setdefault(id, set())
+        read_dependencies[id].add(load_id)
 
     # add store instructions if the vector is written, based on the write instruction
     store_insns = []
-    for insn in write_insns:
-        for vec, expr in write_insn_to_vec_instance[insn.id]:
-            alias = vec.replace('vec', 'alias')
-
-            # flat index without vec iname
-            strides = tuple(tag.stride for tag in knl.arg_dict[alias].dim_tags)
-            index = prim.Sum(tuple(prim.Product(z) for z in zip(substitute(expr, {iname_inner: 0, iname_ix: 0}).index_tuple, strides)))
-
-            # add store instruction
-            store_id = idg('insn_'+vec+'_store')
-            call_store = prim.Call(prim.Variable(vec+'.store'), (prim.Sum((prim.Variable(alias), index)),))
-            store_insns.append(lp.CallInstruction(assignees=(),expression=call_store,
-                                                  id=store_id,within_inames=insn.within_inames,
-                                                  depends_on=insn.depends_on
-                                                             | frozenset({insn.id})
-                                                             | read_dependencies[insn.id],
-                                                  ))
-
-    # exchange alias for vector
+    for id in accum_insns:
+        insn = knl.id_to_insn[id]
+
+        alias, index = aic(insn.expression)
+        name_alias = alias.name
+        name_vec = name_alias.replace('alias', 'vec')
+
+        # flat index without vec iname
+        strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
+        index = prim.Sum(tuple(prim.Product((i,s)) for i,s in zip(index, strides) if i!=0 and i.name != iname_inner))
+
+        # add store instruction
+        store_id = idg('insn_'+name_vec+'_store')
+        call_store = prim.Call(prim.Variable(name_vec+'.store'), (prim.Sum((prim.Variable(name_alias), index)),))
+        store_insns.append(lp.CallInstruction(assignees=(),expression=call_store,
+                                              id=store_id,within_inames=insn.within_inames,
+                                              depends_on=insn.depends_on
+                                                         | frozenset({id})
+                                                         | read_dependencies[id],
+                                              ))
+
+    # replace alias with vcl vector, except for accumulation assignee
+    vector_alias = [a for a in knl.arg_dict if a.endswith('alias')]
+    for alias in vector_alias:
+        knl = lp.extract_subst(knl, alias+'_subst', '{}[ex_o, ex_i,ey,ix,iy]'.format(alias),
+                               parameters='ex_o,ex_i,ey,ix,iy')
+        new_subst = knl.substitutions.copy()
+        rule = new_subst[alias+'_subst']
+        rule.expression = prim.Subscript(prim.Variable(alias.replace('alias','vec')), (prim.Variable('ex_i'),))
+        knl = knl.copy(substitutions=new_subst)
+
+    from loopy.match import All
+    knl = lp.expand_subst(knl, All())
+
+    # add store and load dependencies and set right accumulation assignee
     new_insns = []
     for insn in knl.instructions:
-        insn = replace_accum_insn.get(insn.id, insn)
-        if insn.id not in read_insn_to_vec_instance.keys() | write_insn_to_vec_instance.keys():
+        if insn.id not in read_insns | accum_insns:
             new_insns.append(insn)
         else:
-            subst_map = dict()
-            for vec, expr in read_insn_to_vec_instance[insn.id]:
-                subst_map[expr] = prim.Subscript(prim.Variable(vec), (prim.Variable(iname_inner),))
-
-            new_insn = insn
-
-            if insn in read_insns:
-                new_insn = new_insn.copy(expression=substitute(new_insn.expression, subst_map),
-                                         depends_on=new_insn.depends_on | read_dependencies[insn.id])
-            if insn in write_insns:
-                new_insn = new_insn.copy(assignee=substitute(new_insn.assignee, subst_map))
-
-            new_insns.append(new_insn)
+            if insn.id in accum_insns:
+                assignee_alias = insn.assignee
+                assignee_vec = next((expr for expr in insn.expression.children
+                                     if isinstance(expr, prim.Subscript)
+                                     and expr.aggregate.name.replace('vec','alias') == assignee_alias.aggregate.name))
+                new_insns.append(insn.copy(assignee=assignee_vec,
+                                           depends_on=insn.depends_on | read_dependencies[insn.id]))
+            else:
+                new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id]))
 
-    from loopy.kernel.creation import resolve_dependencies
-    return resolve_dependencies(knl.copy(instructions=new_insns+load_insns+store_insns+modified_accum_insn,
-                                         temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)))
+    return knl.copy(instructions=new_insns+load_insns+store_insns)
 
 
 def find_accumulation_inames(knl):
@@ -240,8 +251,16 @@ def vectorize_micro_elements(knl):
         from loopy.match import Not, All
         accum_inames = find_accumulation_inames(knl)
         for iname in accum_inames:
+            iname_head = iname+'_head'
+            iname_tail = iname+'_tail'
             knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_tail')
             knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_head')
+
         knl = lp.split_iname(knl,"subel_x",4, inner_tag='vec')
-        knl = add_vcl_vector(knl,'subel_x_inner', 'subel_x_outer')
+        array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
+        knl = lp.split_array_axis(knl, array_alias, 0, 4)
+
+        knl = add_vcl_temporaries(knl)
+        knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer', iname_head, iname_tail)
+        knl = add_vcl_access(knl, 'subel_x_inner')
     return knl