diff --git a/dune/perftool/sumfact/alignedmatvec.hh b/dune/perftool/sumfact/alignedmatvec.hh deleted file mode 100644 index 199f414129551e21990911b1f864c8bfe9250e96..0000000000000000000000000000000000000000 --- a/dune/perftool/sumfact/alignedmatvec.hh +++ /dev/null @@ -1,313 +0,0 @@ -#ifndef __ALIGNEDMATVEC__ -#define __ALIGNEDMATVEC__ - -#include <iostream> -#include <iomanip> - -/****************************************************************************/ -/****************************************************************************/ -// dynamic allocation of aligned memory blocks -/****************************************************************************/ -/****************************************************************************/ - -/** get a block of aligned memory - - This function is based on malloc and allocates sufficient - extra memory to provide an aligned pointer; - use the correspondig free function to release the memory. -*/ -void* aligned_malloc (size_t size, size_t alignment=16) -{ - // alignment must be reasonable - assert(alignment>=sizeof(void*)); - assert(alignment%sizeof(void*)==0); - - // get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic - void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size - if (p==0) return p; // not enough memory - - // align the pointer - void **aligned_p = p; - while ( ((size_t)aligned_p)%alignment!=0) - { - aligned_p++; - } - - // now put the original pointer just before that address - if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) ) - { - void **pp = aligned_p; - pp--; - *pp = p; - return (void *) aligned_p; - } - else - { - aligned_p = aligned_p + (alignment/sizeof(void*)); - void **pp = aligned_p; - pp--; - *pp = p; - return (void *) aligned_p; - } -} - -/** free a block of memory allocated through aligned_malloc - */ -void aligned_free (void* p) -{ - void **pp = (void**) p; - pp--; - free(*pp); -} - -/****************************************************************************/ -/****************************************************************************/ -/* matrix class with - - aligned memory allocation - - possible padding of rows and columns - - row and column-wise access - - export of underlying pointer for BLAS-like operations -*/ -/****************************************************************************/ -/****************************************************************************/ - -/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout - - - alignment is in bytes - - padding means allocated rows and columns is a multiple of padding - - padding=1 means that exactly m rows and m columns are allocated - - matrix allows row and columnwise access -*/ -template<typename T=double, size_t alignment=16, size_t padding=1> -class AlignedMatrix -{ - typedef T member_type; - typedef size_t size_type; - -public: - //! make a new matrix allocated on the heap - AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n) - { - // make padded_m, padded_n multiples of padding - if (alignment%sizeof(T)!=0) throw std::bad_alloc(); - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - if (verbose) { - std::cout << "allocating aligned matrix" << std::endl; - std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl; - std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl; - std::cout << "p=" << std::hex << std::uppercase << p << std::endl; - } - } - - //! copy constructor - AlignedMatrix (const AlignedMatrix& mat) - { - m = mat.m; - n = mat.n; - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! destroy the matrix - ~AlignedMatrix () - { - aligned_free(p); - } - - //! taking address returns the internal pointer - T* data () { return p; } - - //! taking address returns the internal pointer - const T* data () const { return p; } - - //! set new size - void resize (size_t newm, size_t newn) - { - if (newm*newn>padded_m*padded_n) - { - m = newm; - n = newn; - } - else - { - // throw away old matrix - aligned_free(p); - - // set new size and reallocate - m = newm; - n = newn; - - // make padded_m, padded_n multiples of padding - if (alignment%sizeof(T)!=0) throw std::bad_alloc(); - padded_m = m; - if (m%padding!=0) padded_m = (m/padding+1)*padding; - padded_n = n; - if (n%padding!=0) padded_n = (n/padding+1)*padding; - - // allocate aligned array - p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - } - - //! get number of rows in use - size_t rows () const { return m;} - - //! get number of columns in use - size_t cols () const { return n;} - - //! get number of rows available - size_t padded_rows () const { return padded_m;} - - //! get number of columns available - size_t padded_cols () const { return padded_n;} - - //! access matrix elements as one big vector - const T& operator[] (size_t i) const {return p[i];} - - //! access matrix elements as one big vector - T& operator[] (size_t i) {return p[i];} - - //! access with j fastest; uses padding - const T& rowmajoraccess (size_t i, size_t j) const - { - return p[i*padded_n+j]; - } - - //! access with j fastest; uses padding - T& rowmajoraccess (size_t i, size_t j) - { - return p[i*padded_n+j]; - } - - //! access with i fastest; uses padding - const T& colmajoraccess (size_t i, size_t j) const - { - return p[j*padded_m+i]; - } - - //! access with i fastest; uses padding - T& colmajoraccess (size_t i, size_t j) - { - return p[j*padded_m+i]; - } - -private: - size_t m,n; - size_t padded_m,padded_n; - T* p; -}; - -/****************************************************************************/ -/****************************************************************************/ -/* vector class with - - aligned memory allocation - - export of underlying pointer for BLAS-like operations -*/ -/****************************************************************************/ -/****************************************************************************/ - -/** Provide a memory aligned vector - - - alignment is in bytes -*/ -template<typename T=double, size_t alignment=16> -class AlignedVector -{ - typedef T member_type; - typedef size_t size_type; - -public: - //! make a new vector - AlignedVector (size_t _n, bool verbose=false) : n(_n) - { - // allocate aligned array - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - if (verbose) { - std::cout << "allocating aligned vector" << std::endl; - std::cout << "p=" << std::hex << std::uppercase << p << std::endl; - } - } - - AlignedVector() - : n(0) - , p(nullptr) - {} - - //! copy constructor - AlignedVector (const AlignedVector& vec) - { - n = vec.n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! assignment operator - AlignedVector& operator=(const AlignedVector& vec) - { - if (p) - aligned_free(p); - n = vec.n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - std::copy(vec.p,vec.p+n,p); - return *this; - } - - //! free vector - ~AlignedVector () - { - if (p) - aligned_free(p); - } - - //! resize vector - void resize (int _n) - { - if (_n<=n) - { - n = _n; - return; - } - if (p) - aligned_free(p); - n = _n; - p = (T*) aligned_malloc(n*sizeof(T),alignment); - if (p==0) throw std::bad_alloc(); - } - - //! taking address returns internal pointer to data - T* data () { return p; } - - //! taking address returns internal pointer to data - const T* data () const { return p; } - - //! get number of elements in vector - size_t size () const { return n;} - - //! element access - const T& operator[] (size_t i) const {return p[i];} - - //! element access - T& operator[] (size_t i) {return p[i];} - -private: - size_t n; - T* p; -}; - -#endif diff --git a/python/dune/perftool/cgen/clazz.py b/python/dune/perftool/cgen/clazz.py index f265638ec68ba122cd209909b7c2704cd6992aaf..dca212eca2aecee421bee757c900f2f1778949d8 100644 --- a/python/dune/perftool/cgen/clazz.py +++ b/python/dune/perftool/cgen/clazz.py @@ -72,7 +72,7 @@ class Class(Generable): for bc in base_classes: assert isinstance(bc, BaseClass) for mem in members: - assert isinstance(mem, ClassMember) + assert isinstance(mem, Generable) def generate(self): # define the class header diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py index 72fad9024cc46d3084b8318a2ff7db4c0c91cc52..89a98fd7d9ead131f628d92edc0e47d97d521660 100644 --- a/python/dune/perftool/generation/__init__.py +++ b/python/dune/perftool/generation/__init__.py @@ -36,6 +36,7 @@ from dune.perftool.generation.loopy import (barrier, globalarg, iname, instruction, + loopy_class_member, kernel_cached, noop_instruction, silenced_warning, diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index c61fec6a21dbe47e9a4147008ae616471af1a5bc..5f20ec162948328dafa3d261b7d299c835af2c98 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -164,3 +164,17 @@ def barrier(**kwargs): name = 'barrier_{}'.format(get_counter('barrier')) _barrier(id=name, **kwargs) return name + + +def loopy_class_member(name, classtag=None, **kwargs): + """ A class member is based on loopy! It is an + * temporary variable of the constructor kernel + * A globalarg of the requesting kernel (to make things pass) + """ + assert classtag + temporary_variable(name, kernel=classtag, **kwargs) + silenced_warning("read_no_write({})".format(name), kernel=classtag) + + kwargs.pop("decl_method", None) + # TODO I guess some filtering has to be applied here. + globalarg(name, **kwargs) diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 57ff1d237c48c0c9ed27ea36698410381d1aa718..f3b7d3e39d2721de7c78f13581f2a2999d646a4b 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -115,11 +115,18 @@ class DuneASTBuilder(CASTBuilder): post_include("#define BARRIER asm volatile(\"\": : :\"memory\")", filetag="operatorfile") return cgen.Line("BARRIER;") + def get_temporary_decls(self, codegen_state, schedule_index): + if self.target.declare_temporaries: + return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index) + else: + return [] + class DuneTarget(TargetBase): - def __init__(self): + def __init__(self, declare_temporaries=True): # Set fortran_abi to allow reusing CASTBuilder for the moment self.fortran_abi = False + self.declare_temporaries = declare_temporaries def split_kernel_at_global_barriers(self): return False diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 75d879949e6930313b69faec4022bd70a8287bee..16d45c59cb02ffb9aaa4a9f83bc02e364e2e78ab 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -43,7 +43,6 @@ def get_form_compiler_arguments(): parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project") # TODO at some point this help description should be updated parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization") - parser.add_argument("--sumfact-alignment", type=int, help="Alignment used in sumfactorization", default=64) parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization") # Modify the positional argument to not be a list diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 0234b4221c30190e983a2859ff857e4642273fdf..30454153b1e659531ee388cc9df49767229311e9 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -644,11 +644,29 @@ def cgen_class_from_cache(tag, members=[]): # Construct the constructor constructor_knl = extract_kernel_from_cache(tag) + from dune.perftool.loopy.target import DuneTarget + constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params)) constructor = LoopyKernelMethod([signature], constructor_knl, add_timings=False, initializer_list=il) + # Take any temporary declarations from the kernel and make them class members + target = DuneTarget() + from loopy.codegen import CodeGenerationState + codegen_state = CodeGenerationState(kernel=constructor_knl, + implemented_data_info=None, + implemented_domain=None, + implemented_predicates=frozenset(), + seen_dtypes=frozenset(), + seen_functions=frozenset(), + seen_atomic_dtypes=frozenset(), + var_subst_map={}, + allow_complex=False, + is_generating_device_code=True, + ) + decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0) + from dune.perftool.cgen import Class - return Class(basename, base_classes=base_classes, members=[constructor] + members + pm, tparam_decls=tparams) + return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams) def generate_localoperator_kernels(formdata, data): diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py index 387fb400fd186e593fb36954c1582e0ae781d65d..0296a06ae69fab232b64fa4c71e93d94d44c6ac5 100644 --- a/python/dune/perftool/sumfact/amatrix.py +++ b/python/dune/perftool/sumfact/amatrix.py @@ -13,6 +13,7 @@ from dune.perftool.generation import (class_member, include_file, initializer_list, instruction, + loopy_class_member, preamble, silenced_warning, temporary_variable, @@ -29,6 +30,7 @@ from loopy.types import NumpyType from pytools import Record, product +import pymbolic.primitives as prim import numpy @@ -41,36 +43,6 @@ class AMatrix(Record): ) -class ColMajorAccess(FunctionIdentifier): - def __init__(self, amatrix): - self.amatrix = amatrix - - def __getinitargs__(self): - return (self.amatrix,) - - @property - def name(self): - return '{}.colmajoraccess'.format(self.amatrix) - - -@function_mangler -def colmajoraccess_mangler(target, func, dtypes): - if isinstance(func, ColMajorAccess): - return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32))) - - -@class_member(classtag="operator") -def define_alignment(name): - alignment = get_option("sumfact_alignment") - return "enum {{ {} = {} }};".format(name, str(alignment)) - - -def name_alignment(): - name = "alignment" - define_alignment(name) - return name - - def quadrature_points_per_direction(): # TODO use quadrature order from dune.perftool.pdelab.quadrature # as soon as we have a generic implementation @@ -93,30 +65,30 @@ def basis_functions_per_direction(): return polynomial_degree() + 1 -@class_member(classtag="operator") def define_oned_quadrature_weights(name): - range_field = lop_template_range_field() - number_qp = quadrature_points_per_direction() - return "{} {}[{}];".format(range_field, name, number_qp) + loopy_class_member(name, + dtype=numpy.float64, + classtag="operator", + shape=(quadrature_points_per_direction(),), + ) def name_oned_quadrature_weights(): name = "qw" - globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64)) define_oned_quadrature_weights(name) return name -@class_member(classtag="operator") def define_oned_quadrature_points(name): - range_field = lop_template_range_field() - number_qp = quadrature_points_per_direction() - return "{} {}[{}];".format(range_field, name, number_qp) + loopy_class_member(name, + dtype=numpy.float64, + classtag="operator", + shape=(quadrature_points_per_direction(),), + ) def name_oned_quadrature_points(): name = "qp" - globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64)) define_oned_quadrature_points(name) return name @@ -176,50 +148,52 @@ def theta_iname(name, bound): return name -def construct_theta(name, transpose, derivative): - # Make sure that the quadrature points are sorted - sort_quadrature_points_weights() +class PolynomialLookup(FunctionIdentifier): + def __init__(self, pol, derivative): + self.pol = pol + self.derivative = derivative - if transpose: - shape = (basis_functions_per_direction(), quadrature_points_per_direction()) - else: - shape = (quadrature_points_per_direction(), basis_functions_per_direction()) - polynomials = name_polynomials() - qp = name_oned_quadrature_points() + def __getinitargs__(self): + return (self.pol, self.derivative) - i = theta_iname("i", shape[0]) - j = theta_iname("j", shape[1]) + @property + def name(self): + return "{}.{}".format(self.pol, "dp" if self.derivative else "p") - # access = "j,i" if transpose else "i,j" - basispol = "dp" if derivative else "p" - polynomial_access = "{},{}[{}]".format(i, qp, j) if transpose else "{},{}[{}]".format(j, qp, i) - return instruction(code="{}.colmajoraccess({},{}) = {}.{}({});".format(name, i, j, polynomials, basispol, polynomial_access), - kernel="operator", - within_inames=frozenset({i, j}), - within_inames_is_final=True, - ) + +@function_mangler +def polynomial_lookup_mangler(target, func, dtypes): + if isinstance(func, PolynomialLookup): + return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64))) -@class_member(classtag="operator") -def typedef_theta(name): - include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile") - alignment = name_alignment() - range_field = lop_template_range_field() - return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment) +def define_theta(name, shape, transpose, derivative): + sort_quadrature_points_weights() + polynomials = name_polynomials() + qp = name_oned_quadrature_points() + loopy_class_member(name, + dtype=numpy.float64, + shape=shape, + classtag="operator", + dim_tags="f,f", + managed=True, + ) -def type_theta(): - name = "AMatrix" - typedef_theta(name) - return name + # TODO Enforce the alignment here! + i = theta_iname("i", shape[0]) + j = theta_iname("j", shape[1]) -@class_member(classtag="operator") -def define_theta(name, shape, transpose, derivative): - theta_type = type_theta() - initializer_list(name, [str(axis) for axis in shape], classtag="operator") - construct_theta(name, transpose, derivative) - return "{} {};".format(theta_type, name) + if transpose: + args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),))) + else: + args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),))) + + instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))), + expression=prim.Call(PolynomialLookup(polynomials, derivative), args), + kernel="operator", + ) def name_theta(): diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 0af2d38c0c20ddaaf11cb5569a3a24355df1e8d3..7333d968250702d641ca2404b0e20a53967e0e14 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -14,7 +14,6 @@ from dune.perftool.generation import (backend, ) from dune.perftool.sumfact.amatrix import (AMatrix, basis_functions_per_direction, - ColMajorAccess, name_dtheta, name_theta, quadrature_points_per_direction, @@ -159,9 +158,9 @@ def evaluate_basis(element, name, restriction): inames = lfs_inames(element, restriction) assert(len(quad_inames) == len(inames)) - instruction(expression=prim.Product(tuple(prim.Call(ColMajorAccess(theta), - (prim.Variable(i), prim.Variable(j)) - ) + instruction(expression=prim.Product(tuple(prim.Subscript(prim.Variable(theta), + (prim.Variable(i), prim.Variable(j)) + ) for (i, j) in zip(quad_inames, inames) ) ), @@ -201,9 +200,9 @@ def evaluate_reference_gradient(element, name, restriction): dim = formdata.geometric_dimension for i in range(dim): - calls = [prim.Call(ColMajorAccess(theta), (prim.Variable(m), prim.Variable(n))) + calls = [prim.Subscript(prim.Variable(theta), (prim.Variable(m), prim.Variable(n))) for (m, n) in zip(quad_inames, inames)] - calls[i] = prim.Call(ColMajorAccess(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i]))) + calls[i] = prim.Subscript(prim.Variable(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i]))) calls = tuple(calls) # assignee = prim.Subscript(prim.Variable(name), tuple(prim.Variable(0))) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 3624022806170316035c10b4fa4e98263d7d52ef..95ae0b11a15ab15e8ceef0d447f991b23145fbe8 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -259,8 +259,7 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional k = sumfact_iname(a_matrix.cols, "red") # Construct the matrix-matrix-multiplication expression a_ik*in_kj - from dune.perftool.sumfact.amatrix import ColMajorAccess - prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))), + prod = Product((Subscript(Variable(a_matrix.a_matrix), (Variable(i), Variable(k))), Subscript(Variable(inp), (Variable(k), Variable(j))) ))