diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py index e33cc1cb46915cc40ff766a0e38dce234601ed96..a0a7de0172f6310067ddd7a72160a595889fdcaa 100644 --- a/python/dune/perftool/loopy/symbolic.py +++ b/python/dune/perftool/loopy/symbolic.py @@ -3,11 +3,11 @@ Use this module to insert pymbolic nodes and the likes. """ from dune.perftool.error import PerftoolError +from dune.perftool.sumfact.symbolic import SumfactKernel from pymbolic.mapper.substitutor import make_subst_func import loopy as lp import pymbolic.primitives as prim -from pytools import ImmutableRecord from six.moves import intern @@ -16,246 +16,6 @@ from six.moves import intern # -class SumfactKernel(ImmutableRecord, prim.Variable): - def __init__(self, - a_matrices=None, - buffer=None, - stage=1, - preferred_position=None, - restriction=0, - within_inames=(), - input=None, - padding=frozenset(), - index=None, - insn_dep=frozenset(), - coeff_func=None, - element=None, - component=None, - accumvar=None, - ): - """Create a sum factorization kernel - - Sum factorization can be written as - - Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...) - - with: - - X: Input rank d tensor of dimension n_0 x ... x n_{d-1} - - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1} - - A_l: Values of 1D basis evaluations at quadrature points in l - direction, matrix of dimension m_l x n_l - - R_l: Transformation operator that permutes the underlying data - vector of the rank d tensor in such a way that the fastest - direction gets the slowest direction - - In the l'th step we have the following setup: - - A_l: Matrix of dimensions m_l x n_l - - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1} - - R_l: Transformation operator - - Looking at the indizes the following will happen: - X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}] - A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] - R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] - - So the multiplication with A_l is a reduction over one index and - the transformation brings the next reduction index in the fastest - position. - - It can make sense to permute the order of directions. If you have - a small m_l (e.g. stage 1 on faces) it is better to do direction l - first. This can be done by: - - - Permuting the order of the A matrices. - - Permuting the input tensor. - - Permuting the output tensor (this assures that the directions of - the output tensor are again ordered from 0 to d-1). - - Note, that you will typically *not* set all of the below arguments, - but only some. The vectorization strategy may set others for you. - The only argument really needed in all cases is a_matrices. - - Arguments: - ---------- - a_matrices: A tuple of AMatrix instances - The list of tensors to be applied to the input. - Order of application is from 0 up. - buffer: A string identifying the flip flop buffer in use - for intermediate results. The memory is expected to be - pre-initialized with the input or you have to provide - direct_input (FastDGGridOperator). - stage: 1 or 3 - insn_dep: An instruction ID that the first issued instruction - should depend upon. All following ones will depend on each - other. - within_inames: Instructions will be executed within those - inames (needed for stage 3 in jacobians). - preferred_position: Will be used in the dry run to order kernels - when doing vectorization e.g. (dx u,dy u,dz u, u). - restriction: Restriction for faces values. - input: The name to use for the input temporary - padding: a set of slots in the input temporary to be padded - index: The slot in the input temporary dedicated to this kernel - coeff_func: The function to call to get the input coefficient - element: The UFL element - component: The treepath to the correct component of above element - accumvar: The accumulation variable to accumulate into - """ - # Check the input and apply defaults where necessary - assert isinstance(a_matrices, tuple) - from dune.perftool.sumfact.amatrix import AMatrixBase - assert all(isinstance(m, AMatrixBase) for m in a_matrices) - - assert stage in (1, 3) - - if preferred_position is not None: - assert isinstance(preferred_position, int) - - if stage == 3: - assert isinstance(restriction, tuple) - - assert isinstance(within_inames, tuple) - - assert isinstance(insn_dep, frozenset) - - ImmutableRecord.__init__(self, - a_matrices=a_matrices, - buffer=buffer, - stage=stage, - preferred_position=preferred_position, - restriction=restriction, - within_inames=within_inames, - input=input, - padding=padding, - index=index, - insn_dep=insn_dep, - coeff_func=coeff_func, - element=element, - component=component, - accumvar=accumvar, - ) - - prim.Variable.__init__(self, "SUMFACT") - - # - # The methods/fields needed to get a well-formed pymbolic node - # - def __getinitargs__(self): - return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction, self.within_inames, self.input, self.padding, self.index, self.insn_dep, self.coeff_func, self.element, self.component, self.accumvar) - - def stringifier(self): - return lp.symbolic.StringifyMapper - - init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction", "within_inames", "input", "padding", "index", "insn_dep", "coeff_func", "element", "component", "accumvar") - - mapper_method = "map_sumfact_kernel" - - # - # Some convenience methods to extract information about the sum factorization kernel - # - - @property - def length(self): - return len(self.a_matrices) - - @property - def vectorized(self): - return next(iter(self.a_matrices)).vectorized - - @property - def transposed(self): - return next(iter(self.a_matrices)).transpose - - @property - def cache_key(self): - """ The cache key that can be used in generation magic, - Any two sum factorization kernels having the same cache_key - are realized simulatneously! - """ - return hash((self.a_matrices, self.restriction, self.stage, self.buffer)) - - @property - def vec_index(self): - """ A tuple with the vector index - - Defaults to the empty tuple to allow addition to any - existing index tuple """ - if self.index is not None: - return (self.index,) - else: - return () - - @property - def flat_input_shape(self): - """ The 'flat' input tensor shape """ - from pytools import product - shape = (product(mat.cols for mat in self.a_matrices),) - if self.vectorized: - shape = shape + (4,) - return shape - - @property - def quadrature_shape(self): - """ The shape of a temporary for the quadrature points - - Takes into account the lower dimensionality of faces and vectorization. - """ - if self.transposed: - shape = tuple(mat.cols for mat in self.a_matrices if mat.face is None) - else: - shape = tuple(mat.rows for mat in self.a_matrices if mat.face is None) - if self.vectorized: - shape = shape + (4,) - return shape - - @property - def quadrature_dimtags(self): - """ The dim_tags of a temporary for the quadrature points - - Takes into account the lower dimensionality of faces and vectorization. - """ - tags = ["f"] * len(self.quadrature_shape) - if self.vectorized: - tags[-1] = 'c' - return ",".join(tags) - - @property - def dof_shape(self): - """ The shape of a temporary for the degrees of freedom - - Takes into account vectorization. - """ - shape = tuple(mat.rows for mat in self.a_matrices) - if self.vectorized: - shape = shape + (4,) - return shape - - @property - def dof_dimtags(self): - """ The dim_tags of a temporary for the degrees of freedom - - Takes into account vectorization. - """ - tags = ["f"] * len(self.dof_shape) - if self.vectorized: - tags[-1] = 'vec' - return ",".join(tags) - - @property - def output_shape(self): - if self.stage == 1: - return self.quadrature_shape - else: - return self.dof_shape - - @property - def output_dimtags(self): - if self.stage == 1: - return self.quadrature_dimtags - else: - return self.dof_dimtags - - class FusedMultiplyAdd(prim.Expression): """ Represents an FMA operation """ diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py index 2de5312691609ec50d3cac23b19b417a43dde838..d175f02585d886c4950cd25493f895b3a4ed6447 100644 --- a/python/dune/perftool/loopy/vcl.py +++ b/python/dune/perftool/loopy/vcl.py @@ -2,15 +2,9 @@ Our extensions to the loopy type system """ from dune.perftool.options import get_option -from dune.perftool.generation import (function_mangler, - include_file, - ) - -from loopy.symbolic import FunctionIdentifier -from loopy.types import NumpyType -from loopy import CallMangleInfo - +from dune.perftool.generation import function_mangler +import loopy as lp import numpy as np @@ -73,4 +67,4 @@ def vcl_function_mangler(knl, func, arg_dtypes): if func == "horizontal_add": vcl = lp.types.NumpyType(get_vcl_type(np.float64)) - return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,)) \ No newline at end of file + return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(np.float64),), (vcl,)) diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py index 0389416b5999e96b7e303434f07cab7e07a824b5..f7f108b311b2f17379e5e719730f75e46ff5c9f2 100644 --- a/python/dune/perftool/sumfact/__init__.py +++ b/python/dune/perftool/sumfact/__init__.py @@ -12,6 +12,8 @@ from dune.perftool.sumfact.basis import (lfs_inames, pymbolic_coefficient, pymbolic_coefficient_gradient, ) + +import dune.perftool.sumfact.accumulation import dune.perftool.sumfact.switch from dune.perftool.pdelab import PDELabInterface diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 93568b41b124f23c747d632ae8d150d8e06fa546..d908e5e2e545f85c6f7a415d6459834aee686403 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -1,4 +1,3 @@ -from dune.perftool.loopy.symbolic import substitute from dune.perftool.pdelab.argument import (name_accumulation_variable, PDELabAccumulationFunction, ) @@ -29,7 +28,7 @@ from dune.perftool.sumfact.amatrix import (basis_functions_per_direction, from dune.perftool.sumfact.switch import (get_facedir, get_facemod, ) -from dune.perftool.loopy.symbolic import SumfactKernel +from dune.perftool.sumfact.symbolic import SumfactKernel from dune.perftool.ufl.modified_terminals import extract_modified_arguments from dune.perftool.tools import get_pymbolic_basename from dune.perftool.error import PerftoolError @@ -136,6 +135,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): replace_dict = {} for iname in additional_inames: replace_dict[prim.Variable(iname)] = i + from dune.perftool.loopy.symbolic import substitute expression = substitute(pymbolic_expr, replace_dict) # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 739f8a96d821365223f8e185e91c1589185ace90..3bfa80ebfbd73c0721179a7e2f8fe8c7c3c74e48 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -29,7 +29,7 @@ from dune.perftool.pdelab.geometry import (local_dimension, world_dimension, ) from dune.perftool.loopy.buffer import initialize_buffer -from dune.perftool.loopy.symbolic import SumfactKernel +from dune.perftool.sumfact.symbolic import SumfactKernel from dune.perftool.options import get_option from dune.perftool.pdelab.driver import FEM_name_mangling from dune.perftool.pdelab.restriction import restricted_name diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index b15bbec27c340250660698e8420e93f0b4bd5857..129aa049eabbed04688709238e0875e767b197ea 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -117,7 +117,7 @@ def _realize_sum_factorization_kernel(sf): perm = sumfact_permutation_strategy(sf) # Permute a_matrices - a_matrices = _permute_forward(sf.a_matrices, perm) + a_matrices = permute_forward(sf.a_matrices, perm) # Product of all matrices for l, a_matrix in enumerate(a_matrices): @@ -150,8 +150,8 @@ def _realize_sum_factorization_kernel(sf): input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) if l == 0 and direct_input is not None: # See comment below - input_inames = _permute_backward(input_inames, perm) - inp_shape = _permute_backward(inp_shape, perm) + input_inames = permute_backward(input_inames, perm) + inp_shape = permute_backward(inp_shape, perm) globalarg(direct_input, dtype=np.float64, shape=inp_shape, dim_tags=novec_ftags) if a_matrix.vectorized: @@ -167,8 +167,8 @@ def _realize_sum_factorization_kernel(sf): # order of our input is from 0 to d-1. This means we need # to permute _back_ to get the right coefficients. if l == 0: - inp_shape = _permute_backward(inp_shape, perm) - input_inames = _permute_backward(input_inames, perm) + inp_shape = permute_backward(inp_shape, perm) + input_inames = permute_backward(input_inames, perm) # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the amatrix loop @@ -196,7 +196,7 @@ def _realize_sum_factorization_kernel(sf): # If we are in the last step we reverse the permutation. output_shape = tuple(out_shape[1:]) + (out_shape[0],) if l == len(a_matrices) - 1: - output_shape = _permute_backward(output_shape, perm) + output_shape = permute_backward(output_shape, perm) out = get_buffer_temporary(sf.buffer, shape=output_shape + vec_shape, dim_tags=ftags) @@ -214,7 +214,7 @@ def _realize_sum_factorization_kernel(sf): # end and reverse permutation output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) if l == len(a_matrices) - 1: - output_inames = _permute_backward(output_inames, perm) + output_inames = permute_backward(output_inames, perm) # In case of direct output we directly accumulate the result # of the Sumfactorization into some global data structure. diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py new file mode 100644 index 0000000000000000000000000000000000000000..8dbeb5278a43b47952b4f261a472b8feaea6344e --- /dev/null +++ b/python/dune/perftool/sumfact/symbolic.py @@ -0,0 +1,245 @@ +""" A pymbolic node representing a sum factorization kernel """ + +from pytools import ImmutableRecord + +import pymbolic.primitives as prim + + +class SumfactKernel(ImmutableRecord, prim.Variable): + def __init__(self, + a_matrices=None, + buffer=None, + stage=1, + preferred_position=None, + restriction=0, + within_inames=(), + input=None, + padding=frozenset(), + index=None, + insn_dep=frozenset(), + coeff_func=None, + element=None, + component=None, + accumvar=None, + ): + """Create a sum factorization kernel + + Sum factorization can be written as + + Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...) + + with: + - X: Input rank d tensor of dimension n_0 x ... x n_{d-1} + - Y: Output rank d tensor of dimension m_0 x ... x m_{d-1} + - A_l: Values of 1D basis evaluations at quadrature points in l + direction, matrix of dimension m_l x n_l + - R_l: Transformation operator that permutes the underlying data + vector of the rank d tensor in such a way that the fastest + direction gets the slowest direction + + In the l'th step we have the following setup: + - A_l: Matrix of dimensions m_l x n_l + - X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1} + - R_l: Transformation operator + + Looking at the indizes the following will happen: + X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}] + A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] + R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] + + So the multiplication with A_l is a reduction over one index and + the transformation brings the next reduction index in the fastest + position. + + It can make sense to permute the order of directions. If you have + a small m_l (e.g. stage 1 on faces) it is better to do direction l + first. This can be done by: + + - Permuting the order of the A matrices. + - Permuting the input tensor. + - Permuting the output tensor (this assures that the directions of + the output tensor are again ordered from 0 to d-1). + + Note, that you will typically *not* set all of the below arguments, + but only some. The vectorization strategy may set others for you. + The only argument really needed in all cases is a_matrices. + + Arguments: + ---------- + a_matrices: A tuple of AMatrix instances + The list of tensors to be applied to the input. + Order of application is from 0 up. + buffer: A string identifying the flip flop buffer in use + for intermediate results. The memory is expected to be + pre-initialized with the input or you have to provide + direct_input (FastDGGridOperator). + stage: 1 or 3 + insn_dep: An instruction ID that the first issued instruction + should depend upon. All following ones will depend on each + other. + within_inames: Instructions will be executed within those + inames (needed for stage 3 in jacobians). + preferred_position: Will be used in the dry run to order kernels + when doing vectorization e.g. (dx u,dy u,dz u, u). + restriction: Restriction for faces values. + input: The name to use for the input temporary + padding: a set of slots in the input temporary to be padded + index: The slot in the input temporary dedicated to this kernel + coeff_func: The function to call to get the input coefficient + element: The UFL element + component: The treepath to the correct component of above element + accumvar: The accumulation variable to accumulate into + """ + # Check the input and apply defaults where necessary + assert isinstance(a_matrices, tuple) + from dune.perftool.sumfact.amatrix import AMatrixBase + assert all(isinstance(m, AMatrixBase) for m in a_matrices) + + assert stage in (1, 3) + + if preferred_position is not None: + assert isinstance(preferred_position, int) + + if stage == 3: + assert isinstance(restriction, tuple) + + assert isinstance(within_inames, tuple) + + assert isinstance(insn_dep, frozenset) + + ImmutableRecord.__init__(self, + a_matrices=a_matrices, + buffer=buffer, + stage=stage, + preferred_position=preferred_position, + restriction=restriction, + within_inames=within_inames, + input=input, + padding=padding, + index=index, + insn_dep=insn_dep, + coeff_func=coeff_func, + element=element, + component=component, + accumvar=accumvar, + ) + + prim.Variable.__init__(self, "SUMFACT") + + # + # The methods/fields needed to get a well-formed pymbolic node + # + def __getinitargs__(self): + return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction, self.within_inames, self.input, self.padding, self.index, self.insn_dep, self.coeff_func, self.element, self.component, self.accumvar) + + def stringifier(self): + return lp.symbolic.StringifyMapper + + init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction", "within_inames", "input", "padding", "index", "insn_dep", "coeff_func", "element", "component", "accumvar") + + mapper_method = "map_sumfact_kernel" + + # + # Some convenience methods to extract information about the sum factorization kernel + # + + @property + def length(self): + return len(self.a_matrices) + + @property + def vectorized(self): + return next(iter(self.a_matrices)).vectorized + + @property + def transposed(self): + return next(iter(self.a_matrices)).transpose + + @property + def cache_key(self): + """ The cache key that can be used in generation magic, + Any two sum factorization kernels having the same cache_key + are realized simulatneously! + """ + return hash((self.a_matrices, self.restriction, self.stage, self.buffer)) + + @property + def vec_index(self): + """ A tuple with the vector index + + Defaults to the empty tuple to allow addition to any + existing index tuple """ + if self.index is not None: + return (self.index,) + else: + return () + + @property + def flat_input_shape(self): + """ The 'flat' input tensor shape """ + from pytools import product + shape = (product(mat.cols for mat in self.a_matrices),) + if self.vectorized: + shape = shape + (4,) + return shape + + @property + def quadrature_shape(self): + """ The shape of a temporary for the quadrature points + + Takes into account the lower dimensionality of faces and vectorization. + """ + if self.transposed: + shape = tuple(mat.cols for mat in self.a_matrices if mat.face is None) + else: + shape = tuple(mat.rows for mat in self.a_matrices if mat.face is None) + if self.vectorized: + shape = shape + (4,) + return shape + + @property + def quadrature_dimtags(self): + """ The dim_tags of a temporary for the quadrature points + + Takes into account the lower dimensionality of faces and vectorization. + """ + tags = ["f"] * len(self.quadrature_shape) + if self.vectorized: + tags[-1] = 'c' + return ",".join(tags) + + @property + def dof_shape(self): + """ The shape of a temporary for the degrees of freedom + + Takes into account vectorization. + """ + shape = tuple(mat.rows for mat in self.a_matrices) + if self.vectorized: + shape = shape + (4,) + return shape + + @property + def dof_dimtags(self): + """ The dim_tags of a temporary for the degrees of freedom + + Takes into account vectorization. + """ + tags = ["f"] * len(self.dof_shape) + if self.vectorized: + tags[-1] = 'vec' + return ",".join(tags) + + @property + def output_shape(self): + if self.stage == 1: + return self.quadrature_shape + else: + return self.dof_shape + + @property + def output_dimtags(self): + if self.stage == 1: + return self.quadrature_dimtags + else: + return self.dof_dimtags \ No newline at end of file