diff --git a/dune/perftool/sumfact/alignedmatvec.hh b/dune/perftool/sumfact/alignedmatvec.hh
deleted file mode 100644
index 199f414129551e21990911b1f864c8bfe9250e96..0000000000000000000000000000000000000000
--- a/dune/perftool/sumfact/alignedmatvec.hh
+++ /dev/null
@@ -1,313 +0,0 @@
-#ifndef __ALIGNEDMATVEC__
-#define __ALIGNEDMATVEC__
-
-#include <iostream>
-#include <iomanip>
-
-/****************************************************************************/
-/****************************************************************************/
-// dynamic allocation of aligned memory blocks
-/****************************************************************************/
-/****************************************************************************/
-
-/** get a block of aligned memory
-
-    This function is based on malloc and allocates sufficient
-    extra memory to provide an aligned pointer;
-    use the correspondig free function to release the memory.
-*/
-void* aligned_malloc (size_t size, size_t alignment=16)
-{
-  // alignment must be reasonable
-  assert(alignment>=sizeof(void*));
-  assert(alignment%sizeof(void*)==0);
-
-  // get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic
-  void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size
-  if (p==0) return p; // not enough memory
-
-  // align the pointer
-  void **aligned_p = p;
-  while ( ((size_t)aligned_p)%alignment!=0)
-    {
-      aligned_p++;
-    }
-
-  // now put the original pointer just before that address
-  if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) )
-    {
-      void **pp = aligned_p;
-      pp--;
-      *pp = p;
-      return (void *) aligned_p;
-    }
-  else
-    {
-      aligned_p = aligned_p + (alignment/sizeof(void*));
-      void **pp = aligned_p;
-      pp--;
-      *pp = p;
-      return (void *) aligned_p;
-    }
-}
-
-/** free a block of memory allocated through aligned_malloc
- */
-void aligned_free (void* p)
-{
-  void **pp = (void**) p;
-  pp--;
-  free(*pp);
-}
-
-/****************************************************************************/
-/****************************************************************************/
-/* matrix class with
-   - aligned memory allocation
-   - possible padding of rows and columns
-   - row and column-wise access
-   - export of underlying pointer for BLAS-like operations
-*/
-/****************************************************************************/
-/****************************************************************************/
-
-/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout
-
-    - alignment is in bytes
-    - padding means allocated rows and columns is a multiple of padding
-    - padding=1 means that exactly m rows and m columns are allocated
-    - matrix allows row and columnwise access
-*/
-template<typename T=double, size_t alignment=16, size_t padding=1>
-class AlignedMatrix
-{
-  typedef T member_type;
-  typedef size_t size_type;
-
-public:
-  //! make a new matrix allocated on the heap
-  AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n)
-  {
-    // make padded_m, padded_n multiples of padding
-    if (alignment%sizeof(T)!=0) throw std::bad_alloc();
-    padded_m = m;
-    if (m%padding!=0) padded_m = (m/padding+1)*padding;
-    padded_n = n;
-    if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-    // allocate aligned array
-    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    if (verbose) {
-      std::cout << "allocating aligned matrix" << std::endl;
-      std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl;
-      std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl;
-      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
-    }
-  }
-
-  //! copy constructor
-  AlignedMatrix (const AlignedMatrix& mat)
-  {
-    m = mat.m;
-    n = mat.n;
-    padded_m = m;
-    if (m%padding!=0) padded_m = (m/padding+1)*padding;
-    padded_n = n;
-    if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-    // allocate aligned array
-    p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! destroy the matrix
-  ~AlignedMatrix ()
-  {
-    aligned_free(p);
-  }
-
-  //! taking address returns the internal pointer
-  T* data () { return p; }
-
-  //! taking address returns the internal pointer
-  const T* data () const { return p; }
-
-  //! set new size
-  void resize (size_t newm, size_t newn)
-  {
-    if (newm*newn>padded_m*padded_n)
-      {
-        m = newm;
-        n = newn;
-      }
-    else
-      {
-        // throw away old matrix
-        aligned_free(p);
-
-        // set new size and reallocate
-        m = newm;
-        n = newn;
-
-        // make padded_m, padded_n multiples of padding
-        if (alignment%sizeof(T)!=0) throw std::bad_alloc();
-        padded_m = m;
-        if (m%padding!=0) padded_m = (m/padding+1)*padding;
-        padded_n = n;
-        if (n%padding!=0) padded_n = (n/padding+1)*padding;
-
-        // allocate aligned array
-        p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
-        if (p==0) throw std::bad_alloc();
-      }
-  }
-
-  //! get number of rows in use
-  size_t rows () const { return m;}
-
-  //! get number of columns in use
-  size_t cols () const { return n;}
-
-  //! get number of rows available
-  size_t padded_rows () const { return padded_m;}
-
-  //! get number of columns available
-  size_t padded_cols () const { return padded_n;}
-
-  //! access matrix elements as one big vector
-  const T& operator[] (size_t i) const {return p[i];}
-
-  //! access matrix elements as one big vector
-  T& operator[] (size_t i) {return p[i];}
-
-  //! access with j fastest; uses padding
-  const T& rowmajoraccess (size_t i, size_t j) const
-  {
-    return p[i*padded_n+j];
-  }
-
-  //! access with j fastest; uses padding
-  T& rowmajoraccess (size_t i, size_t j)
-  {
-    return p[i*padded_n+j];
-  }
-
-  //! access with i fastest; uses padding
-  const T& colmajoraccess (size_t i, size_t j) const
-  {
-    return p[j*padded_m+i];
-  }
-
-  //! access with i fastest; uses padding
-  T& colmajoraccess (size_t i, size_t j)
-  {
-    return p[j*padded_m+i];
-  }
-
-private:
-  size_t m,n;
-  size_t padded_m,padded_n;
-  T* p;
-};
-
-/****************************************************************************/
-/****************************************************************************/
-/* vector class with
-   - aligned memory allocation
-   - export of underlying pointer for BLAS-like operations
-*/
-/****************************************************************************/
-/****************************************************************************/
-
-/** Provide a memory aligned vector
-
-    - alignment is in bytes
-*/
-template<typename T=double, size_t alignment=16>
-class AlignedVector
-{
-  typedef T member_type;
-  typedef size_t size_type;
-
-public:
-  //! make a new vector
-  AlignedVector (size_t _n, bool verbose=false) : n(_n)
-  {
-    // allocate aligned array
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    if (verbose) {
-      std::cout << "allocating aligned vector" << std::endl;
-      std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
-    }
-  }
-
-  AlignedVector()
-    : n(0)
-    , p(nullptr)
-  {}
-
-  //! copy constructor
-  AlignedVector (const AlignedVector& vec)
-  {
-    n = vec.n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! assignment operator
-  AlignedVector& operator=(const AlignedVector& vec)
-  {
-    if (p)
-      aligned_free(p);
-    n = vec.n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-    std::copy(vec.p,vec.p+n,p);
-    return *this;
-  }
-
-  //! free vector
-  ~AlignedVector ()
-  {
-    if (p)
-      aligned_free(p);
-  }
-
-  //! resize vector
-  void resize (int _n)
-  {
-    if (_n<=n)
-      {
-        n = _n;
-        return;
-      }
-    if (p)
-      aligned_free(p);
-    n = _n;
-    p = (T*) aligned_malloc(n*sizeof(T),alignment);
-    if (p==0) throw std::bad_alloc();
-  }
-
-  //! taking address returns internal pointer to data
-  T* data () { return p; }
-
-  //! taking address returns internal pointer to data
-  const T* data () const { return p; }
-
-  //! get number of elements in vector
-  size_t size () const { return n;}
-
-  //! element access
-  const T& operator[] (size_t i) const {return p[i];}
-
-  //! element access
-  T& operator[] (size_t i) {return p[i];}
-
-private:
-  size_t n;
-  T* p;
-};
-
-#endif
diff --git a/python/dune/perftool/cgen/clazz.py b/python/dune/perftool/cgen/clazz.py
index f265638ec68ba122cd209909b7c2704cd6992aaf..dca212eca2aecee421bee757c900f2f1778949d8 100644
--- a/python/dune/perftool/cgen/clazz.py
+++ b/python/dune/perftool/cgen/clazz.py
@@ -72,7 +72,7 @@ class Class(Generable):
         for bc in base_classes:
             assert isinstance(bc, BaseClass)
         for mem in members:
-            assert isinstance(mem, ClassMember)
+            assert isinstance(mem, Generable)
 
     def generate(self):
         # define the class header
diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py
index 72fad9024cc46d3084b8318a2ff7db4c0c91cc52..89a98fd7d9ead131f628d92edc0e47d97d521660 100644
--- a/python/dune/perftool/generation/__init__.py
+++ b/python/dune/perftool/generation/__init__.py
@@ -36,6 +36,7 @@ from dune.perftool.generation.loopy import (barrier,
                                             globalarg,
                                             iname,
                                             instruction,
+                                            loopy_class_member,
                                             kernel_cached,
                                             noop_instruction,
                                             silenced_warning,
diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py
index c61fec6a21dbe47e9a4147008ae616471af1a5bc..5f20ec162948328dafa3d261b7d299c835af2c98 100644
--- a/python/dune/perftool/generation/loopy.py
+++ b/python/dune/perftool/generation/loopy.py
@@ -164,3 +164,17 @@ def barrier(**kwargs):
     name = 'barrier_{}'.format(get_counter('barrier'))
     _barrier(id=name, **kwargs)
     return name
+
+
+def loopy_class_member(name, classtag=None, **kwargs):
+    """ A class member is based on loopy! It is an
+    * temporary variable of the constructor kernel
+    * A globalarg of the requesting kernel (to make things pass)
+    """
+    assert classtag
+    temporary_variable(name, kernel=classtag, **kwargs)
+    silenced_warning("read_no_write({})".format(name), kernel=classtag)
+
+    kwargs.pop("decl_method", None)
+    # TODO I guess some filtering has to be applied here.
+    globalarg(name, **kwargs)
diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py
index 57ff1d237c48c0c9ed27ea36698410381d1aa718..f3b7d3e39d2721de7c78f13581f2a2999d646a4b 100644
--- a/python/dune/perftool/loopy/target.py
+++ b/python/dune/perftool/loopy/target.py
@@ -115,11 +115,18 @@ class DuneASTBuilder(CASTBuilder):
         post_include("#define BARRIER asm volatile(\"\": : :\"memory\")", filetag="operatorfile")
         return cgen.Line("BARRIER;")
 
+    def get_temporary_decls(self, codegen_state, schedule_index):
+        if self.target.declare_temporaries:
+            return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
+        else:
+            return []
+
 
 class DuneTarget(TargetBase):
-    def __init__(self):
+    def __init__(self, declare_temporaries=True):
         # Set fortran_abi to allow reusing CASTBuilder for the moment
         self.fortran_abi = False
+        self.declare_temporaries = declare_temporaries
 
     def split_kernel_at_global_barriers(self):
         return False
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 75d879949e6930313b69faec4022bd70a8287bee..16d45c59cb02ffb9aaa4a9f83bc02e364e2e78ab 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -43,7 +43,6 @@ def get_form_compiler_arguments():
     parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
     # TODO at some point this help description should be updated
     parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
-    parser.add_argument("--sumfact-alignment", type=int, help="Alignment used in sumfactorization", default=64)
     parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization")
 
     # Modify the positional argument to not be a list
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 0234b4221c30190e983a2859ff857e4642273fdf..30454153b1e659531ee388cc9df49767229311e9 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -644,11 +644,29 @@ def cgen_class_from_cache(tag, members=[]):
 
     # Construct the constructor
     constructor_knl = extract_kernel_from_cache(tag)
+    from dune.perftool.loopy.target import DuneTarget
+    constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
     signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params))
     constructor = LoopyKernelMethod([signature], constructor_knl, add_timings=False, initializer_list=il)
 
+    # Take any temporary declarations from the kernel and make them class members
+    target = DuneTarget()
+    from loopy.codegen import CodeGenerationState
+    codegen_state = CodeGenerationState(kernel=constructor_knl,
+                                        implemented_data_info=None,
+                                        implemented_domain=None,
+                                        implemented_predicates=frozenset(),
+                                        seen_dtypes=frozenset(),
+                                        seen_functions=frozenset(),
+                                        seen_atomic_dtypes=frozenset(),
+                                        var_subst_map={},
+                                        allow_complex=False,
+                                        is_generating_device_code=True,
+                                        )
+    decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)
+
     from dune.perftool.cgen import Class
-    return Class(basename, base_classes=base_classes, members=[constructor] + members + pm, tparam_decls=tparams)
+    return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams)
 
 
 def generate_localoperator_kernels(formdata, data):
diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py
index 387fb400fd186e593fb36954c1582e0ae781d65d..0296a06ae69fab232b64fa4c71e93d94d44c6ac5 100644
--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -13,6 +13,7 @@ from dune.perftool.generation import (class_member,
                                       include_file,
                                       initializer_list,
                                       instruction,
+                                      loopy_class_member,
                                       preamble,
                                       silenced_warning,
                                       temporary_variable,
@@ -29,6 +30,7 @@ from loopy.types import NumpyType
 
 from pytools import Record, product
 
+import pymbolic.primitives as prim
 import numpy
 
 
@@ -41,36 +43,6 @@ class AMatrix(Record):
                         )
 
 
-class ColMajorAccess(FunctionIdentifier):
-    def __init__(self, amatrix):
-        self.amatrix = amatrix
-
-    def __getinitargs__(self):
-        return (self.amatrix,)
-
-    @property
-    def name(self):
-        return '{}.colmajoraccess'.format(self.amatrix)
-
-
-@function_mangler
-def colmajoraccess_mangler(target, func, dtypes):
-    if isinstance(func, ColMajorAccess):
-        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32)))
-
-
-@class_member(classtag="operator")
-def define_alignment(name):
-    alignment = get_option("sumfact_alignment")
-    return "enum {{ {} = {} }};".format(name, str(alignment))
-
-
-def name_alignment():
-    name = "alignment"
-    define_alignment(name)
-    return name
-
-
 def quadrature_points_per_direction():
     # TODO use quadrature order from dune.perftool.pdelab.quadrature
     # as soon as we have a generic implementation
@@ -93,30 +65,30 @@ def basis_functions_per_direction():
     return polynomial_degree() + 1
 
 
-@class_member(classtag="operator")
 def define_oned_quadrature_weights(name):
-    range_field = lop_template_range_field()
-    number_qp = quadrature_points_per_direction()
-    return "{} {}[{}];".format(range_field, name, number_qp)
+    loopy_class_member(name,
+                       dtype=numpy.float64,
+                       classtag="operator",
+                       shape=(quadrature_points_per_direction(),),
+                       )
 
 
 def name_oned_quadrature_weights():
     name = "qw"
-    globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64))
     define_oned_quadrature_weights(name)
     return name
 
 
-@class_member(classtag="operator")
 def define_oned_quadrature_points(name):
-    range_field = lop_template_range_field()
-    number_qp = quadrature_points_per_direction()
-    return "{} {}[{}];".format(range_field, name, number_qp)
+    loopy_class_member(name,
+                       dtype=numpy.float64,
+                       classtag="operator",
+                       shape=(quadrature_points_per_direction(),),
+                       )
 
 
 def name_oned_quadrature_points():
     name = "qp"
-    globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64))
     define_oned_quadrature_points(name)
     return name
 
@@ -176,50 +148,52 @@ def theta_iname(name, bound):
     return name
 
 
-def construct_theta(name, transpose, derivative):
-    # Make sure that the quadrature points are sorted
-    sort_quadrature_points_weights()
+class PolynomialLookup(FunctionIdentifier):
+    def __init__(self, pol, derivative):
+        self.pol = pol
+        self.derivative = derivative
 
-    if transpose:
-        shape = (basis_functions_per_direction(), quadrature_points_per_direction())
-    else:
-        shape = (quadrature_points_per_direction(), basis_functions_per_direction())
-    polynomials = name_polynomials()
-    qp = name_oned_quadrature_points()
+    def __getinitargs__(self):
+        return (self.pol, self.derivative)
 
-    i = theta_iname("i", shape[0])
-    j = theta_iname("j", shape[1])
+    @property
+    def name(self):
+        return "{}.{}".format(self.pol, "dp" if self.derivative else "p")
 
-    # access = "j,i" if transpose else "i,j"
-    basispol = "dp" if derivative else "p"
-    polynomial_access = "{},{}[{}]".format(i, qp, j) if transpose else "{},{}[{}]".format(j, qp, i)
-    return instruction(code="{}.colmajoraccess({},{}) = {}.{}({});".format(name, i, j, polynomials, basispol, polynomial_access),
-                       kernel="operator",
-                       within_inames=frozenset({i, j}),
-                       within_inames_is_final=True,
-                       )
+
+@function_mangler
+def polynomial_lookup_mangler(target, func, dtypes):
+    if isinstance(func, PolynomialLookup):
+        return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
 
 
-@class_member(classtag="operator")
-def typedef_theta(name):
-    include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
-    alignment = name_alignment()
-    range_field = lop_template_range_field()
-    return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
+def define_theta(name, shape, transpose, derivative):
+    sort_quadrature_points_weights()
+    polynomials = name_polynomials()
+    qp = name_oned_quadrature_points()
 
+    loopy_class_member(name,
+                       dtype=numpy.float64,
+                       shape=shape,
+                       classtag="operator",
+                       dim_tags="f,f",
+                       managed=True,
+                       )
 
-def type_theta():
-    name = "AMatrix"
-    typedef_theta(name)
-    return name
+    # TODO Enforce the alignment here!
 
+    i = theta_iname("i", shape[0])
+    j = theta_iname("j", shape[1])
 
-@class_member(classtag="operator")
-def define_theta(name, shape, transpose, derivative):
-    theta_type = type_theta()
-    initializer_list(name, [str(axis) for axis in shape], classtag="operator")
-    construct_theta(name, transpose, derivative)
-    return "{} {};".format(theta_type, name)
+    if transpose:
+        args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),)))
+    else:
+        args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))
+
+    instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))),
+                expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
+                kernel="operator",
+                )
 
 
 def name_theta():
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 0af2d38c0c20ddaaf11cb5569a3a24355df1e8d3..7333d968250702d641ca2404b0e20a53967e0e14 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -14,7 +14,6 @@ from dune.perftool.generation import (backend,
                                       )
 from dune.perftool.sumfact.amatrix import (AMatrix,
                                            basis_functions_per_direction,
-                                           ColMajorAccess,
                                            name_dtheta,
                                            name_theta,
                                            quadrature_points_per_direction,
@@ -159,9 +158,9 @@ def evaluate_basis(element, name, restriction):
     inames = lfs_inames(element, restriction)
     assert(len(quad_inames) == len(inames))
 
-    instruction(expression=prim.Product(tuple(prim.Call(ColMajorAccess(theta),
-                                                        (prim.Variable(i), prim.Variable(j))
-                                                        )
+    instruction(expression=prim.Product(tuple(prim.Subscript(prim.Variable(theta),
+                                                             (prim.Variable(i), prim.Variable(j))
+                                                             )
                                               for (i, j) in zip(quad_inames, inames)
                                               )
                                         ),
@@ -201,9 +200,9 @@ def evaluate_reference_gradient(element, name, restriction):
     dim = formdata.geometric_dimension
 
     for i in range(dim):
-        calls = [prim.Call(ColMajorAccess(theta), (prim.Variable(m), prim.Variable(n)))
+        calls = [prim.Subscript(prim.Variable(theta), (prim.Variable(m), prim.Variable(n)))
                  for (m, n) in zip(quad_inames, inames)]
-        calls[i] = prim.Call(ColMajorAccess(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
+        calls[i] = prim.Subscript(prim.Variable(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
         calls = tuple(calls)
 
         # assignee = prim.Subscript(prim.Variable(name), tuple(prim.Variable(0)))
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 3624022806170316035c10b4fa4e98263d7d52ef..95ae0b11a15ab15e8ceef0d447f991b23145fbe8 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -259,8 +259,7 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
         k = sumfact_iname(a_matrix.cols, "red")
 
         # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        from dune.perftool.sumfact.amatrix import ColMajorAccess
-        prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))),
+        prod = Product((Subscript(Variable(a_matrix.a_matrix), (Variable(i), Variable(k))),
                         Subscript(Variable(inp), (Variable(k), Variable(j)))
                         ))