Skip to content
Snippets Groups Projects
Commit 065b6021 authored by René Heß's avatar René Heß
Browse files

Merge branch 'feature/loopyish-class-members' into 'master'

Feature/loopyish class members

See merge request !65
parents e07ed8f6 076f20c3
No related branches found
No related tags found
No related merge requests found
#ifndef __ALIGNEDMATVEC__
#define __ALIGNEDMATVEC__
#include <iostream>
#include <iomanip>
/****************************************************************************/
/****************************************************************************/
// dynamic allocation of aligned memory blocks
/****************************************************************************/
/****************************************************************************/
/** get a block of aligned memory
This function is based on malloc and allocates sufficient
extra memory to provide an aligned pointer;
use the correspondig free function to release the memory.
*/
void* aligned_malloc (size_t size, size_t alignment=16)
{
// alignment must be reasonable
assert(alignment>=sizeof(void*));
assert(alignment%sizeof(void*)==0);
// get a block of memory the pointer type is pointer to pointer to void to be able to do arithmetic
void **p = (void**) malloc(size+alignment+sizeof(void*)); // get block of sufficient size
if (p==0) return p; // not enough memory
// align the pointer
void **aligned_p = p;
while ( ((size_t)aligned_p)%alignment!=0)
{
aligned_p++;
}
// now put the original pointer just before that address
if ( ((size_t)aligned_p)-((size_t)p) >= sizeof(void*) )
{
void **pp = aligned_p;
pp--;
*pp = p;
return (void *) aligned_p;
}
else
{
aligned_p = aligned_p + (alignment/sizeof(void*));
void **pp = aligned_p;
pp--;
*pp = p;
return (void *) aligned_p;
}
}
/** free a block of memory allocated through aligned_malloc
*/
void aligned_free (void* p)
{
void **pp = (void**) p;
pp--;
free(*pp);
}
/****************************************************************************/
/****************************************************************************/
/* matrix class with
- aligned memory allocation
- possible padding of rows and columns
- row and column-wise access
- export of underlying pointer for BLAS-like operations
*/
/****************************************************************************/
/****************************************************************************/
/** Provide a memory aligned matrix with padded rows and colums allowing row-wise and column-wise layout
- alignment is in bytes
- padding means allocated rows and columns is a multiple of padding
- padding=1 means that exactly m rows and m columns are allocated
- matrix allows row and columnwise access
*/
template<typename T=double, size_t alignment=16, size_t padding=1>
class AlignedMatrix
{
typedef T member_type;
typedef size_t size_type;
public:
//! make a new matrix allocated on the heap
AlignedMatrix (size_t _m, size_t _n, bool verbose=false) : m(_m), n(_n)
{
// make padded_m, padded_n multiples of padding
if (alignment%sizeof(T)!=0) throw std::bad_alloc();
padded_m = m;
if (m%padding!=0) padded_m = (m/padding+1)*padding;
padded_n = n;
if (n%padding!=0) padded_n = (n/padding+1)*padding;
// allocate aligned array
p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
if (verbose) {
std::cout << "allocating aligned matrix" << std::endl;
std::cout << "m=" << std::dec << std::uppercase << m << " padded to " << padded_m << std::endl;
std::cout << "n=" << std::dec << std::uppercase << n << " padded to " << padded_n << std::endl;
std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
}
}
//! copy constructor
AlignedMatrix (const AlignedMatrix& mat)
{
m = mat.m;
n = mat.n;
padded_m = m;
if (m%padding!=0) padded_m = (m/padding+1)*padding;
padded_n = n;
if (n%padding!=0) padded_n = (n/padding+1)*padding;
// allocate aligned array
p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
}
//! destroy the matrix
~AlignedMatrix ()
{
aligned_free(p);
}
//! taking address returns the internal pointer
T* data () { return p; }
//! taking address returns the internal pointer
const T* data () const { return p; }
//! set new size
void resize (size_t newm, size_t newn)
{
if (newm*newn>padded_m*padded_n)
{
m = newm;
n = newn;
}
else
{
// throw away old matrix
aligned_free(p);
// set new size and reallocate
m = newm;
n = newn;
// make padded_m, padded_n multiples of padding
if (alignment%sizeof(T)!=0) throw std::bad_alloc();
padded_m = m;
if (m%padding!=0) padded_m = (m/padding+1)*padding;
padded_n = n;
if (n%padding!=0) padded_n = (n/padding+1)*padding;
// allocate aligned array
p = (T*) aligned_malloc(padded_m*padded_n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
}
}
//! get number of rows in use
size_t rows () const { return m;}
//! get number of columns in use
size_t cols () const { return n;}
//! get number of rows available
size_t padded_rows () const { return padded_m;}
//! get number of columns available
size_t padded_cols () const { return padded_n;}
//! access matrix elements as one big vector
const T& operator[] (size_t i) const {return p[i];}
//! access matrix elements as one big vector
T& operator[] (size_t i) {return p[i];}
//! access with j fastest; uses padding
const T& rowmajoraccess (size_t i, size_t j) const
{
return p[i*padded_n+j];
}
//! access with j fastest; uses padding
T& rowmajoraccess (size_t i, size_t j)
{
return p[i*padded_n+j];
}
//! access with i fastest; uses padding
const T& colmajoraccess (size_t i, size_t j) const
{
return p[j*padded_m+i];
}
//! access with i fastest; uses padding
T& colmajoraccess (size_t i, size_t j)
{
return p[j*padded_m+i];
}
private:
size_t m,n;
size_t padded_m,padded_n;
T* p;
};
/****************************************************************************/
/****************************************************************************/
/* vector class with
- aligned memory allocation
- export of underlying pointer for BLAS-like operations
*/
/****************************************************************************/
/****************************************************************************/
/** Provide a memory aligned vector
- alignment is in bytes
*/
template<typename T=double, size_t alignment=16>
class AlignedVector
{
typedef T member_type;
typedef size_t size_type;
public:
//! make a new vector
AlignedVector (size_t _n, bool verbose=false) : n(_n)
{
// allocate aligned array
p = (T*) aligned_malloc(n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
if (verbose) {
std::cout << "allocating aligned vector" << std::endl;
std::cout << "p=" << std::hex << std::uppercase << p << std::endl;
}
}
AlignedVector()
: n(0)
, p(nullptr)
{}
//! copy constructor
AlignedVector (const AlignedVector& vec)
{
n = vec.n;
p = (T*) aligned_malloc(n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
}
//! assignment operator
AlignedVector& operator=(const AlignedVector& vec)
{
if (p)
aligned_free(p);
n = vec.n;
p = (T*) aligned_malloc(n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
std::copy(vec.p,vec.p+n,p);
return *this;
}
//! free vector
~AlignedVector ()
{
if (p)
aligned_free(p);
}
//! resize vector
void resize (int _n)
{
if (_n<=n)
{
n = _n;
return;
}
if (p)
aligned_free(p);
n = _n;
p = (T*) aligned_malloc(n*sizeof(T),alignment);
if (p==0) throw std::bad_alloc();
}
//! taking address returns internal pointer to data
T* data () { return p; }
//! taking address returns internal pointer to data
const T* data () const { return p; }
//! get number of elements in vector
size_t size () const { return n;}
//! element access
const T& operator[] (size_t i) const {return p[i];}
//! element access
T& operator[] (size_t i) {return p[i];}
private:
size_t n;
T* p;
};
#endif
......@@ -72,7 +72,7 @@ class Class(Generable):
for bc in base_classes:
assert isinstance(bc, BaseClass)
for mem in members:
assert isinstance(mem, ClassMember)
assert isinstance(mem, Generable)
def generate(self):
# define the class header
......
......@@ -36,6 +36,7 @@ from dune.perftool.generation.loopy import (barrier,
globalarg,
iname,
instruction,
loopy_class_member,
kernel_cached,
noop_instruction,
silenced_warning,
......
......@@ -164,3 +164,17 @@ def barrier(**kwargs):
name = 'barrier_{}'.format(get_counter('barrier'))
_barrier(id=name, **kwargs)
return name
def loopy_class_member(name, classtag=None, **kwargs):
""" A class member is based on loopy! It is an
* temporary variable of the constructor kernel
* A globalarg of the requesting kernel (to make things pass)
"""
assert classtag
temporary_variable(name, kernel=classtag, **kwargs)
silenced_warning("read_no_write({})".format(name), kernel=classtag)
kwargs.pop("decl_method", None)
# TODO I guess some filtering has to be applied here.
globalarg(name, **kwargs)
......@@ -115,11 +115,18 @@ class DuneASTBuilder(CASTBuilder):
post_include("#define BARRIER asm volatile(\"\": : :\"memory\")", filetag="operatorfile")
return cgen.Line("BARRIER;")
def get_temporary_decls(self, codegen_state, schedule_index):
if self.target.declare_temporaries:
return CASTBuilder.get_temporary_decls(self, codegen_state, schedule_index)
else:
return []
class DuneTarget(TargetBase):
def __init__(self):
def __init__(self, declare_temporaries=True):
# Set fortran_abi to allow reusing CASTBuilder for the moment
self.fortran_abi = False
self.declare_temporaries = declare_temporaries
def split_kernel_at_global_barriers(self):
return False
......
......@@ -43,7 +43,6 @@ def get_form_compiler_arguments():
parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project")
# TODO at some point this help description should be updated
parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization")
parser.add_argument("--sumfact-alignment", type=int, help="Alignment used in sumfactorization", default=64)
parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization")
# Modify the positional argument to not be a list
......
......@@ -644,11 +644,29 @@ def cgen_class_from_cache(tag, members=[]):
# Construct the constructor
constructor_knl = extract_kernel_from_cache(tag)
from dune.perftool.loopy.target import DuneTarget
constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
signature = "{}({})".format(basename, ", ".join(next(iter(p.generate(with_semicolon=False))) for p in constructor_params))
constructor = LoopyKernelMethod([signature], constructor_knl, add_timings=False, initializer_list=il)
# Take any temporary declarations from the kernel and make them class members
target = DuneTarget()
from loopy.codegen import CodeGenerationState
codegen_state = CodeGenerationState(kernel=constructor_knl,
implemented_data_info=None,
implemented_domain=None,
implemented_predicates=frozenset(),
seen_dtypes=frozenset(),
seen_functions=frozenset(),
seen_atomic_dtypes=frozenset(),
var_subst_map={},
allow_complex=False,
is_generating_device_code=True,
)
decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)
from dune.perftool.cgen import Class
return Class(basename, base_classes=base_classes, members=[constructor] + members + pm, tparam_decls=tparams)
return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams)
def generate_localoperator_kernels(formdata, data):
......
......@@ -13,6 +13,7 @@ from dune.perftool.generation import (class_member,
include_file,
initializer_list,
instruction,
loopy_class_member,
preamble,
silenced_warning,
temporary_variable,
......@@ -29,6 +30,7 @@ from loopy.types import NumpyType
from pytools import Record, product
import pymbolic.primitives as prim
import numpy
......@@ -41,36 +43,6 @@ class AMatrix(Record):
)
class ColMajorAccess(FunctionIdentifier):
def __init__(self, amatrix):
self.amatrix = amatrix
def __getinitargs__(self):
return (self.amatrix,)
@property
def name(self):
return '{}.colmajoraccess'.format(self.amatrix)
@function_mangler
def colmajoraccess_mangler(target, func, dtypes):
if isinstance(func, ColMajorAccess):
return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.int32)))
@class_member(classtag="operator")
def define_alignment(name):
alignment = get_option("sumfact_alignment")
return "enum {{ {} = {} }};".format(name, str(alignment))
def name_alignment():
name = "alignment"
define_alignment(name)
return name
def quadrature_points_per_direction():
# TODO use quadrature order from dune.perftool.pdelab.quadrature
# as soon as we have a generic implementation
......@@ -93,30 +65,30 @@ def basis_functions_per_direction():
return polynomial_degree() + 1
@class_member(classtag="operator")
def define_oned_quadrature_weights(name):
range_field = lop_template_range_field()
number_qp = quadrature_points_per_direction()
return "{} {}[{}];".format(range_field, name, number_qp)
loopy_class_member(name,
dtype=numpy.float64,
classtag="operator",
shape=(quadrature_points_per_direction(),),
)
def name_oned_quadrature_weights():
name = "qw"
globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64))
define_oned_quadrature_weights(name)
return name
@class_member(classtag="operator")
def define_oned_quadrature_points(name):
range_field = lop_template_range_field()
number_qp = quadrature_points_per_direction()
return "{} {}[{}];".format(range_field, name, number_qp)
loopy_class_member(name,
dtype=numpy.float64,
classtag="operator",
shape=(quadrature_points_per_direction(),),
)
def name_oned_quadrature_points():
name = "qp"
globalarg(name, shape=(quadrature_points_per_direction(),), dtype=NumpyType(numpy.float64))
define_oned_quadrature_points(name)
return name
......@@ -176,50 +148,52 @@ def theta_iname(name, bound):
return name
def construct_theta(name, transpose, derivative):
# Make sure that the quadrature points are sorted
sort_quadrature_points_weights()
class PolynomialLookup(FunctionIdentifier):
def __init__(self, pol, derivative):
self.pol = pol
self.derivative = derivative
if transpose:
shape = (basis_functions_per_direction(), quadrature_points_per_direction())
else:
shape = (quadrature_points_per_direction(), basis_functions_per_direction())
polynomials = name_polynomials()
qp = name_oned_quadrature_points()
def __getinitargs__(self):
return (self.pol, self.derivative)
i = theta_iname("i", shape[0])
j = theta_iname("j", shape[1])
@property
def name(self):
return "{}.{}".format(self.pol, "dp" if self.derivative else "p")
# access = "j,i" if transpose else "i,j"
basispol = "dp" if derivative else "p"
polynomial_access = "{},{}[{}]".format(i, qp, j) if transpose else "{},{}[{}]".format(j, qp, i)
return instruction(code="{}.colmajoraccess({},{}) = {}.{}({});".format(name, i, j, polynomials, basispol, polynomial_access),
kernel="operator",
within_inames=frozenset({i, j}),
within_inames_is_final=True,
)
@function_mangler
def polynomial_lookup_mangler(target, func, dtypes):
if isinstance(func, PolynomialLookup):
return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64)))
@class_member(classtag="operator")
def typedef_theta(name):
include_file("dune/perftool/sumfact/alignedmatvec.hh", filetag="operatorfile")
alignment = name_alignment()
range_field = lop_template_range_field()
return "using {} = AlignedMatrix<{}, {}>;".format(name, range_field, alignment)
def define_theta(name, shape, transpose, derivative):
sort_quadrature_points_weights()
polynomials = name_polynomials()
qp = name_oned_quadrature_points()
loopy_class_member(name,
dtype=numpy.float64,
shape=shape,
classtag="operator",
dim_tags="f,f",
managed=True,
)
def type_theta():
name = "AMatrix"
typedef_theta(name)
return name
# TODO Enforce the alignment here!
i = theta_iname("i", shape[0])
j = theta_iname("j", shape[1])
@class_member(classtag="operator")
def define_theta(name, shape, transpose, derivative):
theta_type = type_theta()
initializer_list(name, [str(axis) for axis in shape], classtag="operator")
construct_theta(name, transpose, derivative)
return "{} {};".format(theta_type, name)
if transpose:
args = (prim.Variable(i), prim.Subscript(prim.Variable(qp), (prim.Variable(j),)))
else:
args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),)))
instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))),
expression=prim.Call(PolynomialLookup(polynomials, derivative), args),
kernel="operator",
)
def name_theta():
......
......@@ -14,7 +14,6 @@ from dune.perftool.generation import (backend,
)
from dune.perftool.sumfact.amatrix import (AMatrix,
basis_functions_per_direction,
ColMajorAccess,
name_dtheta,
name_theta,
quadrature_points_per_direction,
......@@ -159,9 +158,9 @@ def evaluate_basis(element, name, restriction):
inames = lfs_inames(element, restriction)
assert(len(quad_inames) == len(inames))
instruction(expression=prim.Product(tuple(prim.Call(ColMajorAccess(theta),
(prim.Variable(i), prim.Variable(j))
)
instruction(expression=prim.Product(tuple(prim.Subscript(prim.Variable(theta),
(prim.Variable(i), prim.Variable(j))
)
for (i, j) in zip(quad_inames, inames)
)
),
......@@ -201,9 +200,9 @@ def evaluate_reference_gradient(element, name, restriction):
dim = formdata.geometric_dimension
for i in range(dim):
calls = [prim.Call(ColMajorAccess(theta), (prim.Variable(m), prim.Variable(n)))
calls = [prim.Subscript(prim.Variable(theta), (prim.Variable(m), prim.Variable(n)))
for (m, n) in zip(quad_inames, inames)]
calls[i] = prim.Call(ColMajorAccess(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
calls[i] = prim.Subscript(prim.Variable(dtheta), (prim.Variable(quad_inames[i]), prim.Variable(inames[i])))
calls = tuple(calls)
# assignee = prim.Subscript(prim.Variable(name), tuple(prim.Variable(0)))
......
......@@ -259,8 +259,7 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional
k = sumfact_iname(a_matrix.cols, "red")
# Construct the matrix-matrix-multiplication expression a_ik*in_kj
from dune.perftool.sumfact.amatrix import ColMajorAccess
prod = Product((Call(ColMajorAccess(a_matrix.a_matrix), (Variable(i), Variable(k))),
prod = Product((Subscript(Variable(a_matrix.a_matrix), (Variable(i), Variable(k))),
Subscript(Variable(inp), (Variable(k), Variable(j)))
))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment