diff --git a/dune/perftool/sumfact/alignedmatvec.hh b/dune/perftool/sumfact/alignedmatvec.hh deleted file mode 100644 index 199f414129551e21990911b1f864c8bfe9250e96..0000000000000000000000000000000000000000 --- a/dune/perftool/sumfact/alignedmatvec.hh +++ /dev/null @@ -1,313 +0,0 @@ -#ifndef __ALIGNEDMATVEC__ -#define __ALIGNEDMATVEC__ - -#include <iostream> -#include <iomanip> - -/****************************************************************************/ -/****************************************************************************/ -// dynamic allocation of aligned memory blocks -/****************************************************************************/ -/****************************************************************************/ - -/** get a block of aligned memory - - This function is based on malloc and allocates sufficient - extra memory to provide an aligned pointer; - use the correspondig free function to release the memory. -*/ -void* aligned_malloc (size_t size, size_t alignment=16) -{ - // alignment must be reasonable - assert(alignment>=sizeof(void*)); - assert(alignment%sizeof(void*)==0); - - // get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic - void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size - if (p==0) return p; // not enough memory - - // align the pointer - void **aligned_p = p; - while ( ((size_t)aligned_p)%alignment!=0) - { - aligned_p++; - } - - // now put the original pointer just before that address - if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) ) - { - void **pp = aligned_p; - pp--; - *pp = p; - return (void *) aligned_p; - } - else - { - aligned_p = aligned_p + (alignment/sizeof(void*)); - void **pp = aligned_p; - pp--; - *pp = p; - return (void *) aligned_p; - } -} - -/** free a block of memory allocated through aligned_malloc - */ -void aligned_free (void* p) -{ - void **pp = (void**) p; - pp--; - free(*pp); -} - -/****************************************************************************/ -/****************************************************************************/ -/* matrix class with - - aligned memory allocation - - possible padding of rows and columns - - row and column-wise access - - export of underlying pointer for BLAS-like operations -*/ -/****************************************************************************/ -/****************************************************************************/ - -/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout - - - alignment is in bytes - - padding means allocated rows and columns is a multiple of padding - - padding=1 means that exactly m rows and m columns are allocated - - matrix allows row and columnwise access -*/ -template<typename T=double, size_t alignment=16, size_t padding=1> -class AlignedMatrix -{ - typedef T member_type; - typedef size_t size_type; - -public: - //! make a new matrix allocated on the heap - AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n) - { - // make padded_m, padded_n multiples of padding - if (alignment%sizeof(T)!=0) throw std::bad_alloc(); - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - if (verbose) { - std::cout << "allocating aligned matrix" << std::endl; - std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl; - std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl; - std::cout << "p=" << std::hex << std::uppercase << p << std::endl; - } - } - - //! copy constructor - AlignedMatrix (const AlignedMatrix& mat) - { - m = mat.m; - n = mat.n; - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! destroy the matrix - ~AlignedMatrix () - { - aligned_free(p); - } - - //! taking address returns the internal pointer - T* data () { return p; } - - //! taking address returns the internal pointer - const T* data () const { return p; } - - //! set new size - void resize (size_t newm, size_t newn) - { - if (newm*newn>padded_m*padded_n) - { - m = newm; - n = newn; - } - else - { - // throw away old matrix - aligned_free(p); - - // set new size and reallocate - m = newm; - n = newn; - - // make padded_m, padded_n multiples of padding - if (alignment%sizeof(T)!=0) throw std::bad_alloc(); - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - } - - //! get number of rows in use - size_t rows () const { return m;} - - //! get number of columns in use - size_t cols () const { return n;} - - //! get number of rows available - size_t padded_rows () const { return padded_m;} - - //! get number of columns available - size_t padded_cols () const { return padded_n;} - - //! access matrix elements as one big vector - const T& operator[] (size_t i) const {return p[i];} - - //! access matrix elements as one big vector - T& operator[] (size_t i) {return p[i];} - - //! access with j fastest; uses padding - const T& rowmajoraccess (size_t i, size_t j) const - { - return p[i*padded_n+j]; - } - - //! access with j fastest; uses padding - T& rowmajoraccess (size_t i, size_t j) - { - return p[i*padded_n+j]; - } - - //! access with i fastest; uses padding - const T& colmajoraccess (size_t i, size_t j) const - { - return p[j*padded_m+i]; - } - - //! access with i fastest; uses padding - T& colmajoraccess (size_t i, size_t j) - { - return p[j*padded_m+i]; - } - -private: - size_t m,n; - size_t padded_m,padded_n; - T* p; -}; - -/****************************************************************************/ -/****************************************************************************/ -/* vector class with - - aligned memory allocation - - export of underlying pointer for BLAS-like operations -*/ -/****************************************************************************/ -/****************************************************************************/ - -/** Provide a memory aligned vector - - - alignment is in bytes -*/ -template<typename T=double, size_t alignment=16> -class AlignedVector -{ - typedef T member_type; - typedef size_t size_type; - -public: - //! make a new vector - AlignedVector (size_t _n, bool verbose=false) : n(_n) - { - // allocate aligned array - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - if (verbose) { - std::cout << "allocating aligned vector" << std::endl; - std::cout << "p=" << std::hex << std::uppercase << p << std::endl; - } - } - - AlignedVector() - : n(0) - , p(nullptr) - {} - - //! copy constructor - AlignedVector (const AlignedVector& vec) - { - n = vec.n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! assignment operator - AlignedVector& operator=(const AlignedVector& vec) - { - if (p) - aligned_free(p); - n = vec.n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - std::copy(vec.p,vec.p+n,p); - return *this; - } - - //! free vector - ~AlignedVector () - { - if (p) - aligned_free(p); - } - - //! resize vector - void resize (int _n) - { - if (_n<=n) - { - n = _n; - return; - } - if (p) - aligned_free(p); - n = _n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! taking address returns internal pointer to data - T* data () { return p; } - - //! taking address returns internal pointer to data - const T* data () const { return p; } - - //! get number of elements in vector - size_t size () const { return n;} - - //! element access - const T& operator[] (size_t i) const {return p[i];} - - //! element access - T& operator[] (size_t i) {return p[i];} - -private: - size_t n; - T* p; -}; - -#endif diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index 16c9d4f54a4900164d4e3389cb78eeeadfe0a70a..5f20ec162948328dafa3d261b7d299c835af2c98 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -177,4 +177,4 @@ def loopy_class_member(name, classtag=None, **kwargs): kwargs.pop("decl_method", None) # TODO I guess some filtering has to be applied here. - globalarg(name, **kwargs) \ No newline at end of file + globalarg(name, **kwargs) diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py index e919951b775ad640281a682216e709ef0de4b621..0296a06ae69fab232b64fa4c71e93d94d44c6ac5 100644 --- a/python/dune/perftool/sumfact/amatrix.py +++ b/python/dune/perftool/sumfact/amatrix.py @@ -30,6 +30,7 @@ from loopy.types import NumpyType from pytools import Record, product +import pymbolic.primitives as prim import numpy @@ -42,36 +43,6 @@ class AMatrix(Record): ) -class ColMajorAccess(FunctionIdentifier): - def __init__(self, amatrix): - self.amatrix = amatrix - - def __getinitargs__(self): - return (self.amatrix,) - - @property - def name(self): - return '{}.colmajoraccess'.format(self.amatrix) - - -@function_mangler -def colmajoraccess_mangler(target, func, dtypes): - if isinstance(func, ColMajorAccess): - return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32))) - - -@class_member(classtag="operator") -def define_alignment(name): - alignment = get_option("sumfact_alignment") - return "enum {{ {} = {} }};".format(name, str(alignment)) - - -def name_alignment(): - name = "alignment" - define_alignment(name) - return name - - def quadrature_points_per_direction(): # TODO use quadrature order from dune.perftool.pdelab.quadrature # as soon as we have a generic implementation @@ -177,50 +148,52 @@ def theta_iname(name, bound): return name -def construct_theta(name, transpose, derivative): - # Make sure that the quadrature points are sorted - sort_quadrature_points_weights() +class PolynomialLookup(FunctionIdentifier): + def __init__(self, pol, derivative): + self.pol = pol + self.derivative = derivative - if transpose: - shape = (basis_functions_per_direction(), quadrature_points_per_direction()) - else: - shape = (quadrature_points_per_direction(), basis_functions_per_direction()) - polynomials = name_polynomials() - qp = name_oned_quadrature_points() + def __getinitargs__(self): + return (self.pol, self.derivative) - i = theta_iname("i", shape[0]) - j = theta_iname("j", shape[1]) + @property + def name(self): + return "{}.{}".format(self.pol, "dp" if self.derivative else "p") - # access = "j,i" if transpose else "i,j" - basispol = "dp" if derivative else "p" - polynomial_access = "{},{}[{}]".format(i, qp, j) if transpose else "{},{}[{}]".format(j, qp, i) - return instruction(code="{}.colmajoraccess({},{}) = {}.{}({});".format(name, i, j, polynomials, basispol, polynomial_access), - kernel="operator", - within_inames=frozenset({i, j}), - within_inames_is_final=True, - ) +@function_mangler +def polynomial_lookup_mangler(target, func, dtypes): + if isinstance(func, PolynomialLookup): + return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64))) -@class_member(classtag="operator") -def typedef_theta(name): - include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile") - alignment = name_alignment() - range_field = lop_template_range_field() - return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment) +def define_theta(name, shape, transpose, derivative): + sort_quadrature_points_weights() + polynomials = name_polynomials() + qp = name_oned_quadrature_points() -def type_theta(): - name = "AMatrix" - typedef_theta(name) - return name + loopy_class_member(name, + dtype=numpy.float64, + shape=shape, + classtag="operator", + dim_tags="f,f", + managed=True, + ) + # TODO Enforce the alignment here! -@class_member(classtag="operator") -def define_theta(name, shape, transpose, derivative): - theta_type = type_theta() - initializer_list(name, [str(axis) for axis in shape], classtag="operator") - construct_theta(name, transpose, derivative) - return "{} {};".format(theta_type, name) + i = theta_iname("i", shape[0]) + j = theta_iname("j", shape[1]) + + if transpose: + args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),))) + else: + args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),))) + + instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))), + expression=prim.Call(PolynomialLookup(polynomials, derivative), args), + kernel="operator", + ) def name_theta(): diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 0af2d38c0c20ddaaf11cb5569a3a24355df1e8d3..7333d968250702d641ca2404b0e20a53967e0e14 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -14,7 +14,6 @@ from dune.perftool.generation import (backend, ) from dune.perftool.sumfact.amatrix import (AMatrix, basis_functions_per_direction, - ColMajorAccess, name_dtheta, name_theta, quadrature_points_per_direction, @@ -159,9 +158,9 @@ def evaluate_basis(element, name, restriction): inames = lfs_inames(element, restriction) assert(len(quad_inames) == len(inames)) - instruction(expression=prim.Product(tuple(prim.Call(ColMajorAccess(theta), - (prim.Variable(i), prim.Variable(j)) - ) + instruction(expression=prim.Product(tuple(prim.Subscript(prim.Variable(theta), + (prim.Variable(i), prim.Variable(j)) + ) for (i, j) in zip(quad_inames, inames) ) ), @@ -201,9 +200,9 @@ def evaluate_reference_gradient(element, name, restriction): dim = formdata.geometric_dimension for i in range(dim): - calls = [prim.Call(ColMajorAccess(theta), (prim.Variable(m), prim.Variable(n))) + calls = [prim.Subscript(prim.Variable(theta), (prim.Variable(m), prim.Variable(n))) for (m, n) in zip(quad_inames, inames)] - calls[i] = prim.Call(ColMajorAccess(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i]))) + calls[i] = prim.Subscript(prim.Variable(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i]))) calls = tuple(calls) # assignee = prim.Subscript(prim.Variable(name), tuple(prim.Variable(0))) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 3624022806170316035c10b4fa4e98263d7d52ef..95ae0b11a15ab15e8ceef0d447f991b23145fbe8 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -259,8 +259,7 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional k = sumfact_iname(a_matrix.cols, "red") # Construct the matrix-matrix-multiplication expression a_ik*in_kj - from dune.perftool.sumfact.amatrix import ColMajorAccess - prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))), + prod = Product((Subscript(Variable(a_matrix.a_matrix), (Variable(i), Variable(k))), Subscript(Variable(inp), (Variable(k), Variable(j))) ))