diff --git a/dune/perftool/sumfact/alignedmatvec.hh b/dune/perftool/sumfact/alignedmatvec.hh
deleted file mode 100644
index 199f414129551e21990911b1f864c8bfe9250e96..0000000000000000000000000000000000000000
--- a/dune/perftool/sumfact/alignedmatvec.hh
+++ /dev/null
@@ -1,313 +0,0 @@
-#ifndef __ALIGNEDMATVEC__
-#define __ALIGNEDMATVEC__
-
-#include <iostream>
-#include <iomanip>
-
-/****************************************************************************/
-/****************************************************************************/
-// dynamic allocation of aligned memory blocks
-/****************************************************************************/
-/****************************************************************************/
-
-/** get a block of aligned memory
-
-    This function is based on malloc and allocates sufficient
-    extra memory to provide an aligned pointer;
-    use the correspondig free function to release the memory.
-*/
-void* aligned_malloc (size_t size, size_t alignment=16)
-{
-  // alignment must be reasonable
-  assert(alignment>=sizeof(void*));
-  assert(alignment%sizeof(void*)==0);
-
-  // get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic
-  void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size
-  if (p==0) return p; // not enough memory
-
-  // align the pointer
-  void **aligned_p = p;
-  while ( ((size_t)aligned_p)%alignment!=0)
-    {
-      aligned_p++;
-    }
-
-  // now put the original pointer just before that address
-  if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) )
-    {
-      void **pp = aligned_p;
-      pp--;
-      *pp = p;
-      return (void *) aligned_p;
-    }
-  else
-    {
-      aligned_p = aligned_p + (alignment/sizeof(void*));
-      void **pp = aligned_p;
-      pp--;
-      *pp = p;
-      return (void *) aligned_p;
-    }
-}
-
-/** free a block of memory allocated through aligned_malloc
- */
-void aligned_free (void* p)
-{
-  void **pp = (void**) p;
-  pp--;
-  free(*pp);
-}
-
-/****************************************************************************/
-/****************************************************************************/
-/* matrix class with
-   - aligned memory allocation
-   - possible padding of rows and columns
-   - row and column-wise access
-   - export of underlying pointer for BLAS-like operations
-*/
-/****************************************************************************/
-/****************************************************************************/
-
-/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout
-
-    - alignment is in bytes
-    - padding means allocated rows and columns is a multiple of padding
-    - padding=1 means that exactly m rows and m columns are allocated
-    - matrix allows row and columnwise access
-*/
-template<typename T=double, size_t alignment=16, size_t padding=1>
-class AlignedMatrix
-{
-  typedef T member_type;
-  typedef size_t size_type;
-
-public:
-  //! make a new matrix allocated on the heap
-  AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n)
-  {
-    // make padded_m, padded_n multiples of padding
-    if (alignment%sizeof(T)!=0) throw std::bad_alloc();
-    padded_m = m;
-    if (m%padding!=0) padded_m = (m/padding+1)*padding;
-    padded_n = n;
-    if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-    // allocate aligned array
-    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    if (verbose) {
-      std::cout << "allocating aligned matrix" << std::endl;
-      std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl;
-      std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl;
-      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
-    }
-  }
-
-  //! copy constructor
-  AlignedMatrix (const AlignedMatrix& mat)
-  {
-    m = mat.m;
-    n = mat.n;
-    padded_m = m;
-    if (m%padding!=0) padded_m = (m/padding+1)*padding;
-    padded_n = n;
-    if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-    // allocate aligned array
-    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! destroy the matrix
-  ~AlignedMatrix ()
-  {
-    aligned_free(p);
-  }
-
-  //! taking address returns the internal pointer
-  T* data () { return p; }
-
-  //! taking address returns the internal pointer
-  const T* data () const { return p; }
-
-  //! set new size
-  void resize (size_t newm, size_t newn)
-  {
-    if (newm*newn>padded_m*padded_n)
-      {
-        m = newm;
-        n = newn;
-      }
-    else
-      {
-        // throw away old matrix
-        aligned_free(p);
-
-        // set new size and reallocate
-        m = newm;
-        n = newn;
-
-        // make padded_m, padded_n multiples of padding
-        if (alignment%sizeof(T)!=0) throw std::bad_alloc();
-        padded_m = m;
-        if (m%padding!=0) padded_m = (m/padding+1)*padding;
-        padded_n = n;
-        if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-        // allocate aligned array
-        p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-        if (p==0) throw std::bad_alloc();
-      }
-  }
-
-  //! get number of rows in use
-  size_t rows () const { return m;}
-
-  //! get number of columns in use
-  size_t cols () const { return n;}
-
-  //! get number of rows available
-  size_t padded_rows () const { return padded_m;}
-
-  //! get number of columns available
-  size_t padded_cols () const { return padded_n;}
-
-  //! access matrix elements as one big vector
-  const T& operator[] (size_t i) const {return p[i];}
-
-  //! access matrix elements as one big vector
-  T& operator[] (size_t i) {return p[i];}
-
-  //! access with j fastest; uses padding
-  const T& rowmajoraccess (size_t i, size_t j) const
-  {
-    return p[i*padded_n+j];
-  }
-
-  //! access with j fastest; uses padding
-  T& rowmajoraccess (size_t i, size_t j)
-  {
-    return p[i*padded_n+j];
-  }
-
-  //! access with i fastest; uses padding
-  const T& colmajoraccess (size_t i, size_t j) const
-  {
-    return p[j*padded_m+i];
-  }
-
-  //! access with i fastest; uses padding
-  T& colmajoraccess (size_t i, size_t j)
-  {
-    return p[j*padded_m+i];
-  }
-
-private:
-  size_t m,n;
-  size_t padded_m,padded_n;
-  T* p;
-};
-
-/****************************************************************************/
-/****************************************************************************/
-/* vector class with
-   - aligned memory allocation
-   - export of underlying pointer for BLAS-like operations
-*/
-/****************************************************************************/
-/****************************************************************************/
-
-/** Provide a memory aligned vector
-
-    - alignment is in bytes
-*/
-template<typename T=double, size_t alignment=16>
-class AlignedVector
-{
-  typedef T member_type;
-  typedef size_t size_type;
-
-public:
-  //! make a new vector
-  AlignedVector (size_t _n, bool verbose=false) : n(_n)
-  {
-    // allocate aligned array
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    if (verbose) {
-      std::cout << "allocating aligned vector" << std::endl;
-      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
-    }
-  }
-
-  AlignedVector()
-    : n(0)
-    , p(nullptr)
-  {}
-
-  //! copy constructor
-  AlignedVector (const AlignedVector& vec)
-  {
-    n = vec.n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! assignment operator
-  AlignedVector& operator=(const AlignedVector& vec)
-  {
-    if (p)
-      aligned_free(p);
-    n = vec.n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    std::copy(vec.p,vec.p+n,p);
-    return *this;
-  }
-
-  //! free vector
-  ~AlignedVector ()
-  {
-    if (p)
-      aligned_free(p);
-  }
-
-  //! resize vector
-  void resize (int _n)
-  {
-    if (_n<=n)
-      {
-        n = _n;
-        return;
-      }
-    if (p)
-      aligned_free(p);
-    n = _n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! taking address returns internal pointer to data
-  T* data () { return p; }
-
-  //! taking address returns internal pointer to data
-  const T* data () const { return p; }
-
-  //! get number of elements in vector
-  size_t size () const { return n;}
-
-  //! element access
-  const T& operator[] (size_t i) const {return p[i];}
-
-  //! element access
-  T& operator[] (size_t i) {return p[i];}
-
-private:
-  size_t n;
-  T* p;
-};
-
-#endif
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index 16c9d4f54a4900164d4e3389cb78eeeadfe0a70a..5f20ec162948328dafa3d261b7d299c835af2c98 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -177,4 +177,4 @@ def loopy_class_member(name, classtag=None, **kwargs):
 
     kwargs.pop("decl_method", None)
     # TODO I guess some filtering has to be applied here.
-    globalarg(name, **kwargs)
\ No newline at end of file
+    globalarg(name, **kwargs)
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index e919951b775ad640281a682216e709ef0de4b621..0296a06ae69fab232b64fa4c71e93d94d44c6ac5 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -30,6 +30,7 @@ from loopy.types import NumpyType
 
 from pytools import Record, product
 
+import pymbolic.primitives as prim
 import numpy
 
 
@@ -42,36 +43,6 @@ class AMatrix(Record):
                         )
 
 
-class ColMajorAccess(FunctionIdentifier):
-    def __init__(self, amatrix):
-        self.amatrix = amatrix
-
-    def __getinitargs__(self):
-        return (self.amatrix,)
-
-    @property
-    def name(self):
-        return '{}.colmajoraccess'.format(self.amatrix)
-
-
-@function_mangler
-def colmajoraccess_mangler(target, func, dtypes):
-    if isinstance(func, ColMajorAccess):
-        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32)))
-
-
-@class_member(classtag="operator")
-def define_alignment(name):
-    alignment = get_option("sumfact_alignment")
-    return "enum {{ {} = {} }};".format(name, str(alignment))
-
-
-def name_alignment():
-    name = "alignment"
-    define_alignment(name)
-    return name
-
-
 def quadrature_points_per_direction():
     # TODO use quadrature order from dune.perftool.pdelab.quadrature
     # as soon as we have a generic implementation
@@ -177,50 +148,52 @@ def theta_iname(name, bound):
     return name
 
 
-def construct_theta(name, transpose, derivative):
-    # Make sure that the quadrature points are sorted
-    sort_quadrature_points_weights()
+class PolynomialLookup(FunctionIdentifier):
+    def __init__(self, pol, derivative):
+        self.pol = pol
+        self.derivative = derivative
 
-    if transpose:
-        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
-    else:
-        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
-    polynomials = name_polynomials()
-    qp = name_oned_quadrature_points()
+    def __getinitargs__(self):
+        return (self.pol, self.derivative)
 
-    i = theta_iname("i", shape[0])
-    j = theta_iname("j", shape[1])
+    @property
+    def name(self):
+        return "{}.{}".format(self.pol, "dp" if self.derivative else "p")
 
-    # access = "j,i" if transpose else "i,j"
-    basispol = "dp" if derivative else "p"
-    polynomial_access = "{},{}[{}]".format(i, qp, j) if transpose else "{},{}[{}]".format(j, qp, i)
-    return instruction(code="{}.colmajoraccess({},{}) = {}.{}({});".format(name, i, j, polynomials, basispol, polynomial_access),
-                       kernel="operator",
-                       within_inames=frozenset({i, j}),
-                       within_inames_is_final=True,
-                       )
 
+@function_mangler
+def polynomial_lookup_mangler(target, func, dtypes):
+    if isinstance(func, PolynomialLookup):
+        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
-@class_member(classtag="operator")
-def typedef_theta(name):
-    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
-    alignment = name_alignment()
-    range_field = lop_template_range_field()
-    return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
 
+def define_theta(name, shape, transpose, derivative):
+    sort_quadrature_points_weights()
+    polynomials = name_polynomials()
+    qp = name_oned_quadrature_points()
 
-def type_theta():
-    name = "AMatrix"
-    typedef_theta(name)
-    return name
+    loopy_class_member(name,
+                       dtype=numpy.float64,
+                       shape=shape,
+                       classtag="operator",
+                       dim_tags="f,f",
+                       managed=True,
+                       )
 
+    # TODO Enforce the alignment here!
 
-@class_member(classtag="operator")
-def define_theta(name, shape, transpose, derivative):
-    theta_type = type_theta()
-    initializer_list(name, [str(axis) for axis in shape], classtag="operator")
-    construct_theta(name, transpose, derivative)
-    return "{} {};".format(theta_type, name)
+    i = theta_iname("i", shape[0])
+    j = theta_iname("j", shape[1])
+
+    if transpose:
+        args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),)))
+    else:
+        args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))
+
+    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))),
+                expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
+                kernel="operator",
+                )
 
 
 def name_theta():
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 0af2d38c0c20ddaaf11cb5569a3a24355df1e8d3..7333d968250702d641ca2404b0e20a53967e0e14 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -14,7 +14,6 @@ from dune.perftool.generation import (backend,
                                       )
 from dune.perftool.sumfact.amatrix import (AMatrix,
                                            basis_functions_per_direction,
-                                           ColMajorAccess,
                                            name_dtheta,
                                            name_theta,
                                            quadrature_points_per_direction,
@@ -159,9 +158,9 @@ def evaluate_basis(element, name, restriction):
     inames = lfs_inames(element, restriction)
     assert(len(quad_inames) == len(inames))
 
-    instruction(expression=prim.Product(tuple(prim.Call(ColMajorAccess(theta),
-                                                        (prim.Variable(i), prim.Variable(j))
-                                                        )
+    instruction(expression=prim.Product(tuple(prim.Subscript(prim.Variable(theta),
+                                                             (prim.Variable(i), prim.Variable(j))
+                                                             )
                                               for (i, j) in zip(quad_inames, inames)
                                               )
                                         ),
@@ -201,9 +200,9 @@ def evaluate_reference_gradient(element, name, restriction):
     dim = formdata.geometric_dimension
 
     for i in range(dim):
-        calls = [prim.Call(ColMajorAccess(theta), (prim.Variable(m), prim.Variable(n)))
+        calls = [prim.Subscript(prim.Variable(theta), (prim.Variable(m), prim.Variable(n)))
                  for (m, n) in zip(quad_inames, inames)]
-        calls[i] = prim.Call(ColMajorAccess(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
+        calls[i] = prim.Subscript(prim.Variable(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
         calls = tuple(calls)
 
         # assignee = prim.Subscript(prim.Variable(name), tuple(prim.Variable(0)))
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 3624022806170316035c10b4fa4e98263d7d52ef..95ae0b11a15ab15e8ceef0d447f991b23145fbe8 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -259,8 +259,7 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
         k = sumfact_iname(a_matrix.cols, "red")
 
         # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        from dune.perftool.sumfact.amatrix import ColMajorAccess
-        prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))),
+        prod = Product((Subscript(Variable(a_matrix.a_matrix), (Variable(i), Variable(k))),
                         Subscript(Variable(inp), (Variable(k), Variable(j)))
                         ))