diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py
index 03ce2cf678f57d10c6d76a81aaf89fa3fe79af98..c03ebc1f12a73243bd0ba204c81802633cfec631 100644
--- a/python/dune/codegen/blockstructured/vectorization.py
+++ b/python/dune/codegen/blockstructured/vectorization.py
@@ -186,11 +186,17 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
     from loopy.symbolic import IdentityMapper
 
     class AliasIndexCollector(CombineMapper, IdentityMapper):
+        def __init__(self):
+            self.found_alias = False
+
         def combine(self, values):
             return sum(values, tuple())
 
         def map_constant(self, expr):
-            return tuple()
+            if self.found_alias:
+                return (expr,)
+            else:
+                return tuple()
 
         map_variable = map_constant
         map_function_symbol = map_constant
@@ -198,7 +204,10 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0):
 
         def map_subscript(self, expr):
             if expr.aggregate.name.endswith(alias_suffix):
-                return expr.aggregate, expr.index_tuple
+                self.found_alias = True
+                indices = self.combine((self.rec(index) for index in expr.index_tuple))
+                self.found_alias = False
+                return expr.aggregate, indices
             else:
                 return tuple()
 
@@ -467,50 +476,48 @@ def vectorize_micro_elements(knl):
     orig_iname = vec_iname
     if vec_iname in knl.all_inames() and get_global_context_value('integral_type') == 'cell':
         vcl_size = get_vcl_type_size(np.float64)
+        knl = add_iname_array(knl, vec_iname)
 
-        has_tail = get_form_option('number_of_blocks') % vcl_size > 0
-        vectorize_tail = True
+        def _do_vectorization(knl, vec_iname, vcl_size, level=0):
+            inner_iname = vec_iname + '_inner'
+            outer_iname = vec_iname + '_outer'
 
-        knl = add_iname_array(knl, vec_iname)
+            has_tail = get_form_option('number_of_blocks') % vcl_size > 0
+            vectorize_tail = (vcl_size // 2) > 1 and \
+                (get_form_option('number_of_blocks') % vcl_size) % (vcl_size // 2) == 0
 
-        # manually add tail, since split_iname with slabs tries to vectorize the tail
-        if has_tail:
-            vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size
-            from loopy.kernel.tools import DomainChanger
-            domch = DomainChanger(knl, (vec_iname,))
-            knl = knl.copy(domains=domch.get_domains_with(
-                BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound))))
+            # manually add tail, since split_iname with slabs tries to vectorize the tail
+            if has_tail:
+                # fake suitable loop bound
+                vectorizable_bound = (get_form_option('number_of_blocks') // vcl_size) * vcl_size
+                from loopy.kernel.tools import DomainChanger
+                domch = DomainChanger(knl, (vec_iname,))
+                knl = knl.copy(domains=domch.get_domains_with(
+                    BasicSet('{{ [{0}]: 0<={0}<{1} }}'.format(vec_iname, vectorizable_bound))))
 
-            knl = lp.split_iname(knl, vec_iname, vcl_size)
-            knl = realize_tail(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
+                knl = lp.split_iname(knl, vec_iname, vcl_size, outer_iname=outer_iname, inner_iname=inner_iname)
 
-            tail_iname = vec_iname + '_inner' + '_tail'
-        else:
-            knl = lp.split_iname(knl, vec_iname, vcl_size)
+                tail_iname = vec_iname + '_inner' + '_tail'
+                knl = realize_tail(knl, inner_iname, outer_iname, tail_iname, vcl_size)
+            else:
+                knl = lp.split_iname(knl, vec_iname, vcl_size)
 
-        knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
+            knl = lp.tag_inames(knl, [(inner_iname, 'vec')])
 
-        array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-        knl = lp.split_array_axis(knl, array_alias, 0, vcl_size)
+            array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
+            knl = lp.split_array_axis(knl, array_alias, level, vcl_size)
 
-        def _do_vectorization(knl, iname_inner, iname_outer, vcl_size, levels=0):
-            knl = add_vcl_iname_array(knl, orig_iname, iname_inner, vcl_size)
+            knl = add_vcl_iname_array(knl, orig_iname, inner_iname, vcl_size)
             knl = add_vcl_temporaries(knl, vcl_size)
-            knl = add_vcl_accum_insns(knl, iname_inner, iname_outer, vcl_size)
-            knl = add_vcl_access(knl, iname_inner, vcl_size, levels)
-            return knl
+            knl = add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size)
+            knl = add_vcl_access(knl, inner_iname, vcl_size, level)
 
-        knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size)
-        if has_tail and vectorize_tail:
-            vcl_size = vcl_size // 2
-            vec_iname = tail_iname
+            if has_tail and vectorize_tail:
+                from pudb import set_trace; set_trace()
+                knl = _do_vectorization(knl, tail_iname, vcl_size // 2, level + 1)
 
-            knl = lp.split_iname(knl, vec_iname, vcl_size)
-            knl = lp.tag_inames(knl, [(vec_iname + '_inner', 'vec')])
-
-            array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-            knl = lp.split_array_axis(knl, array_alias, 1, vcl_size)
+            return knl
 
-            knl = _do_vectorization(knl, vec_iname + '_inner', vec_iname + '_outer', vcl_size, levels=1)
+        knl = _do_vectorization(knl, orig_iname, vcl_size)
 
     return knl