diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index b45d92f83d0afbe330e5feb2870b687c27e364ee..600ca11e29fc2306816fef8f7b5be3b07373d623 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -173,8 +173,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size):
                     temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
 
 
-def add_vcl_access(knl, iname_inner, vcl_size):
-    from loopy.match import Reads, Tagged
+def add_vcl_access(knl, iname_inner, vcl_size, levels=0):
     accum_insns = set((insn.id for insn in lp.find_instructions(knl, And((Tagged('accum_vec{}'.format(vcl_size)),
                                                                           Iname(iname_inner))))))
     read_insns = set((insn.id for insn in lp.find_instructions(knl, And((Reads('*alias'), Iname(iname_inner))))))
@@ -221,15 +220,15 @@ def add_vcl_access(knl, iname_inner, vcl_size):
 
         # compute index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
-        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
-                               if i != 0 and i.name != iname_inner))
+        flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                                    if not (isinstance(i, prim.Variable) and i.name == iname_inner)))
 
         # find write insns
         write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
 
         # add load instruction
         load_id = idg('insn_' + name_vec + '_load')
-        call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
         load_insns.append(lp.CallInstruction(assignees=(), expression=call_load,
                                              id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),
                                              depends_on=insn.depends_on | write_ids,))
@@ -248,15 +247,15 @@ def add_vcl_access(knl, iname_inner, vcl_size):
 
         # flat index without vec iname
         strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags)
-        index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
-                               if i != 0 and i.name != iname_inner))
+        flat_index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides)
+                                    if not (isinstance(i, prim.Variable) and i.name == iname_inner)))
 
         # find write insns
         write_ids = frozenset(i.id for i in lp.find_instructions(knl, Or((Writes(name_vec), Writes(name_vec)))))
 
         # add store instruction
         store_id = idg('insn_' + name_vec + '_store')
-        call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),))
+        call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), flat_index)),))
         store_insns.append(lp.CallInstruction(assignees=(), expression=call_store,
                                               id=store_id, within_inames=insn.within_inames,
                                               depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] |
@@ -284,14 +283,17 @@ def add_vcl_access(knl, iname_inner, vcl_size):
         knl_with_subst_insns = knl_with_subst_insns.copy(instructions=new_insns)
 
         # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner]
-        parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \
-                     ',ix,' + ','.join(['i' + d for d in dim_names[1:dim]])
+        parameters = ','.join(['ex_o{}'.format(l) for l in range(levels + 1)]) + \
+                     ',v_i,' + \
+                     ','.join(['e' + d for d in dim_names[1:dim]]) + \
+                     ',ix,' + \
+                     ','.join(['i' + d for d in dim_names[1:dim]])
         knl_with_subst_insns = lp.extract_subst(knl_with_subst_insns, alias + '_subst', '{}[{}]'.format(alias, parameters),
                                              parameters=parameters)
         new_subst = knl_with_subst_insns.substitutions.copy()
         rule = new_subst[alias + '_subst']
         rule.expression = prim.Subscript(prim.Variable(alias.replace(alias_suffix, vector_sufix)),
-                                         (prim.Variable('ex_i'),))
+                                         (prim.Variable('v_i'),))
         knl_with_subst_insns = knl_with_subst_insns.copy(substitutions=new_subst)
 
     knl_with_subst_insns = lp.expand_subst(knl_with_subst_insns, Iname(iname_inner))
@@ -462,10 +464,14 @@ def realize_tail(knl, iname_inner, iname_outer, vcl_size):
 
 def vectorize_micro_elements(knl):
     vec_iname = "subel_x"
+    orig_iname = vec_iname
     if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell':
         vcl_size = get_vcl_type_size(np.float64)
 
         has_tail = get_form_option('number_of_blocks') % vcl_size > 0
+        vectorize_tail = True
+
+        knl = add_iname_array(knl, vec_iname)
 
         # manually add tail, since split_iname with slabs tries to vectorize the tail
         if has_tail:
@@ -477,19 +483,34 @@ def vectorize_micro_elements(knl):
 
             knl = lp.split_iname(knl, vec_iname, vcl_size)
             knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
+
+            tail_iname = vec_iname + '_inner' + '_tail'
         else:
             knl = lp.split_iname(knl, vec_iname, vcl_size)
 
         knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
 
-        knl = add_iname_array(knl, vec_iname)
-
         array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-        iname_vector = [a for a in knl.temporary_variables.keys() if a.endswith('vec')]
-        knl = lp.split_array_axis(knl, array_alias + iname_vector, 0, vcl_size)
-        knl = lp.tag_array_axes(knl, iname_vector, ('c', 'vec'))
+        knl = lp.split_array_axis(knl, array_alias, 0, vcl_size)
+
+        def _do_vectorization(knl, iname_inner, iname_outer, vcl_size, levels=0):
+            knl = add_vcl_iname_array(knl, orig_iname, iname_inner, vcl_size)
+            knl = add_vcl_temporaries(knl, vcl_size)
+            knl = add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size)
+            knl = add_vcl_access(knl, iname_inner, vcl_size, levels)
+            return knl
+
+        knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
+        if has_tail and vectorize_tail:
+            vcl_size = vcl_size // 2
+            vec_iname = tail_iname
+
+            knl = lp.split_iname(knl, vec_iname, vcl_size)
+            knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
+
+            array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
+            knl = lp.split_array_axis(knl, array_alias, 1, vcl_size)
+
+            knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size, levels=1)
 
-        knl = add_vcl_temporaries(knl, vcl_size)
-        knl = add_vcl_accum_insns(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
-        knl = add_vcl_access(knl, vec_iname + '_inner', vcl_size)
     return knl