diff --git a/python/dune/perftool/loopy/collectvector.py b/python/dune/perftool/loopy/collectvector.py
index b5023e0e8fce7371c8a79d2c586cc5d64bc4bb4e..3d69d623f28851466fc44690ef876366f4f7aeb3 100644
--- a/python/dune/perftool/loopy/collectvector.py
+++ b/python/dune/perftool/loopy/collectvector.py
@@ -1,8 +1,12 @@
 """ A kernel transformation that collects data until the vector size is reached """
 
-from dune.perftool.loopy.vcl import VCLLoad, VCLStore
+from dune.perftool.loopy.vcl import get_vcl_type_size
+from dune.perftool.loopy.vectorview import (add_vector_view,
+                                            get_vector_view_name,
+                                            )
 from dune.perftool.tools import get_pymbolic_basename
 
+from loopy.kernel.creation import parse_domains
 from loopy.symbolic import pw_aff_to_expr
 
 from pymbolic.mapper.dependency import DependencyMapper
@@ -13,53 +17,17 @@ import loopy as lp
 import numpy as np
 
 
-def load_instruction(vec, basepointer, offset=0, **kwargs):
-    # TODO: What I really want to pass in here is a pointer type
-    pointer = basepointer
-    if offset:
-        pointer = prim.Sum((pointer, offset))
-    return lp.CallInstruction((),
-                              prim.Call(VCLLoad(vec), (pointer,)),
-                              tags=kwargs.get('tags', frozenset()).union(frozenset(["vcl_load"])),
-                              **kwargs
-                              )
-
-
-def store_instruction(vec, basepointer, offset=0, **kwargs):
-    # TODO: What I really want to pass in here is a pointer type
-    pointer = basepointer
-    if offset:
-        pointer = prim.Sum((pointer, offset))
-    return lp.CallInstruction((),
-                              prim.Call(VCLStore(vec), (pointer,)),
-                              tags=kwargs.get('tags', frozenset()).union(frozenset(["vcl_store"])),
-                              **kwargs
-                              )
-
-
-def collect_vector_data(knl, insns, inames, vec_size=4):
-    # TODO: In theory this vec_size should be deduced from the types
-    # that we find in the actual instructions
+def collect_vector_data(knl, insns, inames):
+    #
+    # Process/Assert/Standardize the input
+    #
 
-    # Process inames input -> frozenset
+    # inames input -> tuple
     if isinstance(inames, str):
         inames = inames.split(",")
-    inames = frozenset(inames)
-
-    # Filter instructions into the groups that we need for this transformation:
-    # 1. Those that should be vectorized by this transformation -> insns
-    # 2. Writers of quantities that depend on the inames and that 1) need -> write_insns
-    # 3. "Outside" instructions, that the instructions from 1)+2) depend on -> dep_insns
-    # 4. All other instructions (needed to correctly reconstruct) -> other_insns
-    #
-    # All these should be lists of instruction instances.
+    inames = tuple(i.strip() for i in inames)
 
-    # 1. Those that should be vectorized by this transformation -> insns
-    #
-    # Instructions may be passed in as either:
-    # * A match expression object
-    # * A comma separated string of IDs
-    # * An iterable of IDs
+    # insns -> list of Instruction instances
     if isinstance(insns, lp.match.MatchExpressionBase):
         insns = lp.find_instructions(knl, insns)
     else:
@@ -67,205 +35,80 @@ def collect_vector_data(knl, insns, inames, vec_size=4):
             insns = [i.strip() for i in insns.split(",")]
         insns = [knl.id_to_insn[i] for i in insns]
 
-    # 2. Writers of quantities that depend on the inames and that 1) need -> write_insns
     #
-    # To determine these, we need to
-    # * Find all written quantities in the instructions from 1)
-    # * Find the instructions that write to these quantities
-    # * Filter only those that depend on the given inames
-    quantities = {}
-    depmapper = DependencyMapper()
+    # Inspect the given instructions for dependent quantities
+    # and precompute them
+    #
+
+    quantities = []
     for insn in insns:
-        for expr in depmapper(insn.expression):
-            basename = get_pymbolic_basename(expr)
-            quantities.setdefault(basename, frozenset())
-            quantities[basename] = quantities[basename].union(frozenset([expr]))
+        for expr in DependencyMapper()(insn.expression):
+            quantities.append(get_pymbolic_basename(expr))
+    quantities = set(quantities)
+
+    for quantity in quantities:
+        # Introduce a substitution rule and find save name
+        subst_old = knl.substitutions.keys()
+        knl = lp.assignment_to_subst(knl, quantity, extra_arguments=inames)
+        subst_new = knl.substitutions.keys()
+        subst_name, = set(subst_new) - set(subst_old)
 
-    write_match = lp.match.Or(tuple(lp.match.Writes(q) for q in quantities))
-    iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames))
-    match = lp.match.And((write_match, iname_match))
-    write_insns = lp.find_instructions(knl, match)
+        # Do precomputation of the quantity
+        prec_quantity = "{}_precomputed".format(quantity)
+        compute_id = "{}_compute_id".format(quantity)
+        compute_ids.append(compute_id)
 
-    # 3. "Outside" instructions, that the instructions from 1) depend on -> dep_insns
-    deps = frozenset([]).union(*(insn.depends_on for insn in insns + write_insns))
-    deps = deps - frozenset([i.id for i in insns + write_insns])
-    dep_insns = lp.find_instructions(knl, lp.match.Or(tuple(lp.match.Id(i) for i in deps)))
+        knl = lp.precompute(knl, subst_name, inames,
+                            temporary_name=prec_quantity,
+                            compute_insn_id=compute_id)
 
-    # 4. All other instructions (needed to correctly reconstruct) -> other_insns
-    used_ids = frozenset([i.id for i in insns + write_insns + dep_insns])
-    other_insns = lp.find_instructions(knl, lp.match.Not(lp.match.Or(tuple(lp.match.Id(i) for i in used_ids))))
+        # Introduce a vector view of the precomputation result
+        knl = add_vector_view(knl, prec_quantity)
 
     #
-    # Assert some assumptions on the instructions
+    # Construct a flat loop for the given instructions
     #
 
-    # An instruction occurs in but one of these groups:
-    assert len(set(insns + write_insns + dep_insns + other_insns)) == len(insns + write_insns + dep_insns + other_insns)
-
-    # Analyse the inames of the given instructions and identify inames
-    # that they all have in common. Those inames will also be iname dependencies
-    # of inserted instruction.
-    common_inames = frozenset([]).union(*(insn.within_inames for insn in insns)) - inames
-
-    # Have a list gathering all *new* instructions
     new_insns = []
-    temporaries = knl.temporary_variables
-    keep_rules = []
-
-    # Insert a flat consecutive counter 'total_index'
-    temporaries['total_index'] = lp.TemporaryVariable('total_index',  # name
-                                                      dtype=np.int32,
-                                                      scope=lp.temp_var_scope.LOCAL,
-                                                      )
-    new_insns.append(lp.Assignment(prim.Variable("total_index"),  # assignee
-                                   0,  # expression
-                                   within_inames=common_inames,
-                                   within_inames_is_final=True,
-                                   id="assign_total_index",
-                                   ))
-    new_insns.append(lp.Assignment(prim.Variable("total_index"),  # assignee
-                                   prim.Sum((prim.Variable("total_index"), 1)),  # expression
-                                   within_inames=common_inames.union(inames),
-                                   within_inames_is_final=True,
-                                   depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_total_index"})),
-                                   depends_on_is_final=True,
-                                   id="update_total_index",
-                                   ))
+    other_insns = [i for i in knl.instructions if i.id not in [j.id for j in insns]]
 
-    # Insert a rotating index, that counts 0 , .. , vecsize - 1
-    temporaries['rotate_index'] = lp.TemporaryVariable('rotate_index',  # name
-                                                       dtype=np.int32,
-                                                       scope=lp.temp_var_scope.LOCAL,
-                                                       )
-    new_insns.append(lp.Assignment(prim.Variable("rotate_index"),  # assignee
-                                   0,  # expression
-                                   within_inames=common_inames,
-                                   within_inames_is_final=True,
-                                   id="assign_rotate_index",
-                                   ))
-    new_insns.append(lp.Assignment(prim.Variable("rotate_index"),  # assignee
-                                   prim.Remainder(prim.Sum((prim.Variable("rotate_index"), 1)), vec_size),  # expression
-                                   within_inames=common_inames.union(inames),
-                                   within_inames_is_final=True,
-                                   depends_on=frozenset([i.id for i in write_insns]).union(frozenset({"assign_rotate_index"})),
-                                   depends_on_is_final=True,
-                                   id="update_rotate_index",
-                                   ))
+    size = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames))
+    vec_size = get_vcl_type_size(np.float64)
+    size = prim.FloorDiv(size, vec_size)
 
-    # Pre-evaluate all the needed quantities
-    replacemap_arr = {}
-    replacemap_poi = {}
-    replacemap_vec = {}
-    for quantity, quantity_exprs in quantities.items():
-        # TODO for now I only consider the case where an array occurs but once!
-        assert len(quantity_exprs) == 1
-        quantity_expr, = quantity_exprs
-
-        # Determine the shape of the
-
-        arrname = quantity + '_buffered_arr'
-        temporaries[arrname] = lp.TemporaryVariable(arrname,
-                                                    dtype=np.float64,
-                                                    shape=(vec_size,),
-                                                    dim_tags="c",
-                                                    base_storage=quantity + '_base_storage',
-                                                    scope=lp.temp_var_scope.LOCAL,
-                                                    )
-
-        vecname = quantity + '_buffered_vec'
-        temporaries[vecname] = lp.TemporaryVariable(vecname,
-                                                    dtype=np.float64,
-                                                    shape=(vec_size,),
-                                                    dim_tags="vec",
-                                                    base_storage=quantity + '_base_storage',
-                                                    scope=lp.temp_var_scope.LOCAL,
-                                                    )
-
-        replacemap_arr[quantity] = prim.Subscript(prim.Variable(arrname), (prim.Variable('rotate_index'),))
-        replacemap_poi[quantity] = prim.Variable(arrname)
-        replacemap_vec[quantity_expr] = prim.Variable(vecname)
-
-    for insn in write_insns:
-        if isinstance(insn, lp.Assignment):
-            new_insns.append(insn.copy(assignee=replacemap_arr[get_pymbolic_basename(insn.assignee)],
-                                       )
-                             )
-        if isinstance(insn, lp.CInstruction):
-            # Rip apart the code and change the assignee
-            assignee, expression = insn.code.split("=")
-            assignee = assignee.strip()
-            assert assignee in replacemap_arr
-
-            code = "{} ={}".format(str(replacemap_arr[assignee]), expression)
-            new_insns.append(insn.copy(code=code))
+    temporaries = knl.temporary_variables
+    temporaries["flatsize"] = lp.TemporaryVariable("flatsize",
+                                                   dtype=np.int32,
+                                                   shape=(),
+                                                   )
+    new_insns.append(lp.Assignment(prim.Variable("flatsize"),
+                                   size,
+                                   )
+                     )
 
-    # Determine the condition for the continue statement
-    upper_bound = prim.Product(tuple(pw_aff_to_expr(knl.get_iname_bounds(i).size) for i in inames))
-    total_check = prim.Comparison(prim.Variable("total_index"), "<", upper_bound)
-    rotate_check = prim.Comparison(prim.Variable("rotate_index"), "!=", 0)
-    check = prim.LogicalAnd((rotate_check, total_check))
+    # Add an additional domain to the kernel
+    new_iname = "flat_{}".format("_".join(inames))
+    domain = "{{ [{0}] : 0<={0}<flatsize }}".format(new_iname, str(size))
+    domain = parse_domains(domain, {})
+    knl = knl.copy(domains=knl.domains + domain,
+                   temporary_variables=temporaries)
 
-    # Insert the 'continue' statement
-    new_insns.append(lp.CInstruction((),  # iname exprs that the code needs access to
-                                     "continue;",  # the code
-                                     predicates=frozenset({check}),
-                                     depends_on=frozenset({"update_rotate_index", "update_total_index"}).union(frozenset([i.id for i in write_insns])),
-                                     depends_on_is_final=True,
-                                     within_inames=common_inames.union(inames),
-                                     within_inames_is_final=True,
-                                     id="continue_stmt",
-                                     ))
+    # Split and tag the flat iname
+    knl = lp.split_iname(knl, new_iname, vec_size, inner_tag="vec")
+    new_inames = ("{}_outer".format(new_iname), "{}_inner".format(new_iname))
+    knl = lp.assume(knl, "flatsize mod {} = 0".format(vec_size))
 
     for insn in insns:
-        # Turn the assignee array into a base storage array
-        basename = get_pymbolic_basename(insn.assignee)
-        assert basename in temporaries
-        if temporaries[basename].base_storage is None:
-            temporaries[basename] = temporaries[basename].copy(base_storage="{}_base".format(basename))
-
-        # Load the data for this instruction
-        load_instructions = []
-        for expr in depmapper(insn.expression):
-            name = get_pymbolic_basename(expr)
-            new_insns.append(load_instruction(replacemap_vec[expr],
-                                              replacemap_poi[name],
-                                              depends_on=frozenset({"continue_stmt"}),
-                                              within_inames=inames,
-                                              within_inames_is_final=True,
-                                              id="{}_load_instr".format(name)
-                                              )
-                             )
-            load_instructions.append("{}_load_instr".format(name))
-
-        # TODO do something about the assignee!
-        name = "{}_result_buffer".format(insn.id)
-        temporaries[name] = lp.TemporaryVariable(name,
-                                                 dtype=np.float64,
-                                                 shape=(vec_size,),
-                                                 dim_tags="vec",
-                                                 base_storage="{}_base".format(basename),
-                                                 scope=lp.temp_var_scope.LOCAL,
-                                                 )
-        new_insns.append(insn.copy(assignee=prim.Variable(name),
-                                   expression=substitute(insn.expression, variable_assignments=replacemap_vec),
-                                   depends_on=insn.depends_on.union(frozenset(["continue_stmt"])).union(frozenset(load_instructions))
-                                   )
-                         )
-
-        new_insns.append(store_instruction(prim.Variable(name),
-                                           prim.Variable(basename),
-                                           offset=prim.Sum((prim.Variable("total_index") // vec_size, -1)),
-                                           depends_on=frozenset({insn.id}),
-                                           within_inames=inames,
-                                           within_inames_is_final=True
-                                           )
-                         )
-
-        keep_rules.append('temp_to_write({})'.format(basename))
-        keep_rules.append('read_no_write({})'.format(basename))
+        # Get a vector view of the lhs expression
+        lhsname = get_pymbolic_basename(insn.assignee)
+        knl = add_vector_view(knl, lhsname)
+        lhsname = get_vector_view_name(lhsname)
+
+        new_insns.append(lp.Assignment(prim.Subscript(prim.Variable(lhsname), tuple(prim.Variable(i) for i in new_inames)),
+                                       prim.Subscript(prim.Variable(get_vector_view_name("wk_precomputed")), tuple(prim.Variable(i) for i in new_inames)),
+                                       within_inames=frozenset(new_inames),
+                                       within_inames_is_final=True,
+                                       )
+                          )
 
-    # Return a kernel
-    return knl.copy(instructions=dep_insns + other_insns + new_insns,
-                    temporary_variables=temporaries,
-                    silenced_warnings=knl.silenced_warnings + keep_rules
-                    )
+    return knl.copy(instructions=new_insns + other_insns)
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index e19f3e1a4afdb0f3688af5bf2c3a5a42773a67b4..295a6029b3ba6891b84a85132fac2d640df271e6 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -2,7 +2,9 @@ from dune.perftool.generation import post_include
 
 from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.vcl import VCLTypeRegistry
-from dune.perftool.generation import retrieve_cache_functions
+from dune.perftool.generation import (include_file,
+                                      retrieve_cache_functions,
+                                      )
 
 from loopy.target import (TargetBase,
                           ASTBuilderBase,
@@ -109,6 +111,7 @@ class DuneTarget(TargetBase):
 
     def dtype_to_typename(self, dtype):
         if dtype.dtype.kind == "V":
+            include_file("dune/perftool/vectorclass/vectorclass.h", filetag="operatorfile")
             return VCLTypeRegistry.names[dtype.dtype]
         else:
             return _registry[dtype.dtype.name]
diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py
index 272e0ec7097a2ac07e0d19f1e1b439ed1a208da8..78dea3cfdf05c34d24b9ba872334a61d9b16d5be 100644
--- a/python/dune/perftool/loopy/vcl.py
+++ b/python/dune/perftool/loopy/vcl.py
@@ -47,6 +47,10 @@ def _populate_vcl_type_registry():
 _populate_vcl_type_registry()
 
 
+def get_vcl_type_size(nptype, register_size=256):
+    return register_size // (np.dtype(nptype).itemsize * 8)
+
+
 def get_vcl_type(nptype, register_size=None, vec_size=None):
     assert register_size or vec_size
     assert not(register_size and vec_size)
@@ -55,34 +59,3 @@ def get_vcl_type(nptype, register_size=None, vec_size=None):
         vec_size = register_size // (np.dtype(nptype).itemsize * 8)
 
     return VCLTypeRegistry.types[np.dtype(nptype), vec_size]
-
-
-class VCLLoad(FunctionIdentifier):
-    def __init__(self, vec):
-        self.vec = vec
-
-    def __getinitargs__(self):
-        return (self.vec,)
-
-    @property
-    def name(self):
-        return "{}->load".format(self.vec)
-
-
-class VCLStore(FunctionIdentifier):
-    def __init__(self, vec):
-        self.vec = vec
-
-    def __getinitargs__(self):
-        return (self.vec,)
-
-    @property
-    def name(self):
-        return "{}->store".format(self.vec)
-
-
-@function_mangler
-def vcl_mangler(target, func, dtypes):
-    if isinstance(func, (VCLLoad, VCLStore)):
-        include_file("dune/perftool/vectorclass/vectorclass.h", filetag="operatorfile")
-        return CallMangleInfo(func.name, (), (NumpyType(np.int32),))
diff --git a/python/dune/perftool/loopy/vectorview.py b/python/dune/perftool/loopy/vectorview.py
new file mode 100644
index 0000000000000000000000000000000000000000..86aefa81dabc0035b21ca9a846ed0d7111a7bb1d
--- /dev/null
+++ b/python/dune/perftool/loopy/vectorview.py
@@ -0,0 +1,48 @@
+"""
+Implement tools around the idea of having two 'views' on memory:
+One being an ordinary array with proper shape and so on, and one
+being a an array of SIMD vectors
+"""
+
+from dune.perftool.loopy.vcl import get_vcl_type_size
+
+import loopy as lp
+import numpy as np
+import pymbolic.primitives as prim
+
+def get_vector_view_name(tmpname):
+    return tmpname + "_vec"
+
+
+def add_vector_view(knl, tmpname):
+    """
+    Kernel transformation to add a vector view temporary
+    that interprets the same memory as another temporary
+    """
+    temporaries = knl.temporary_variables
+    assert tmpname in temporaries
+    temp = temporaries[tmpname]
+    vecname = get_vector_view_name(tmpname)
+
+    if vecname in knl.temporary_variables:
+        return knl
+
+    # Add base storage to the original temporary!
+    if not temp.base_storage:
+        temp = temp.copy(base_storage=tmpname + "_base")
+        temporaries[tmpname] = temp
+
+    # Determine the shape by dividing total size by vector size
+    vecsize = get_vcl_type_size(temp.dtype)
+    size = prim.FloorDiv(prim.Product(temp.shape), vecsize)
+
+    # Now add a vector view temporary
+    vecname = tmpname + "_vec"
+    temporaries[vecname] = lp.TemporaryVariable(vecname,
+                                                dim_tags="c,vec",
+                                                shape=(size, vecsize),
+                                                base_storage=tmpname + "_base",
+                                                dtype=np.float64,
+                                                )
+
+    return knl.copy(temporary_variables=temporaries)