diff --git a/dune/perftool/common/vectorclass.hh b/dune/perftool/common/vectorclass.hh index fd405135573d4b4098ddc00cba2f816aa3932a80..6204b0a213796861533b9668f4c9a869198eb7cc 100644 --- a/dune/perftool/common/vectorclass.hh +++ b/dune/perftool/common/vectorclass.hh @@ -11,12 +11,38 @@ #define BARRIER asm volatile("": : :"memory") +template<typename T> +struct base_floatingpoint +{}; + #ifndef ENABLE_COUNTER #include <dune/perftool/vectorclass/vectorclass.h> #include <dune/perftool/vectorclass/vectormath_exp.h> #include <dune/perftool/vectorclass/vectormath_trig.h> +template<> +struct base_floatingpoint<Vec4d> +{ + using value = double; +}; + +template<> +struct base_floatingpoint<Vec8f> +{ + using value = float; +}; + +#if MAX_VECTOR_SIZE >= 512 + +template<> +struct base_floatingpoint<Vec8d> +{ + using value = double; +}; + +#endif + #else #include <algorithm> @@ -46,10 +72,11 @@ struct Vec4d BARRIER; } - Vec4d(double d) + Vec4d(F dl, F du) { BARRIER; - std::fill(_d,_d+4,d); + std::fill(_d,_d+2,dl); + std::fill(_d+2,_d+4,du); BARRIER; } @@ -114,6 +141,11 @@ struct Vec4d }; +template<> +struct base_floatingpoint<Vec4d> +{ + using value = typename Vec4d::F; +}; /***************************************************************************** * @@ -142,7 +174,7 @@ static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) { static inline Vec4d operator ++ (Vec4d & a, int) { BARRIER; Vec4d a0 = a; - a = a + 1.0; + a = a + Vec4d(1.0); BARRIER; return a0; } @@ -150,7 +182,7 @@ static inline Vec4d operator ++ (Vec4d & a, int) { // prefix operator ++ static inline Vec4d & operator ++ (Vec4d & a) { BARRIER; - a = a + 1.0; + a = a + Vec4d(1.0); BARRIER; return a; } @@ -187,7 +219,7 @@ static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) { static inline Vec4d operator -- (Vec4d & a, int) { BARRIER; Vec4d a0 = a; - a = a - 1.0; + a = a - Vec4d(1.0); BARRIER; return a0; } @@ -195,7 +227,7 @@ static inline Vec4d operator -- (Vec4d & a, int) { // prefix operator -- static inline Vec4d & operator -- (Vec4d & a) { BARRIER; - a = a - 1.0; + a = a - Vec4d(1.0); BARRIER; return a; } @@ -708,10 +740,19 @@ struct Vec8d BARRIER; } - Vec8d(double d) + Vec8d(F dl, F du) { BARRIER; - std::fill(_d,_d+8,d); + std::fill(_d,_d+4,dl); + std::fill(_d+4,_d+8,du); + BARRIER; + } + + Vec8d(Vec4d low, Vec4d high) + { + BARRIER; + std::copy(_d, _d+4, low._d); + std::copy(_d+4, _d+8, high._d); BARRIER; } @@ -721,6 +762,24 @@ struct Vec8d BARRIER; } + Vec4d get_low() const + { + BARRIER; + Vec4d ret; + ret.load(_d); + BARRIER; + return ret; + } + + Vec4d get_high() const + { + BARRIER; + Vec4d ret; + ret.load(_d + 4); + BARRIER; + return ret; + } + Vec8d& load(const F* p) { BARRIER; @@ -776,6 +835,11 @@ struct Vec8d }; +template<> +struct base_floatingpoint<Vec8d> +{ + using value = typename Vec8d::F; +}; /***************************************************************************** * @@ -804,7 +868,7 @@ static inline Vec8d & operator += (Vec8d & a, Vec8d const & b) { static inline Vec8d operator ++ (Vec8d & a, int) { BARRIER; Vec8d a0 = a; - a = a + 1.0; + a = a + Vec8d(1.0); BARRIER; return a0; } @@ -812,7 +876,7 @@ static inline Vec8d operator ++ (Vec8d & a, int) { // prefix operator ++ static inline Vec8d & operator ++ (Vec8d & a) { BARRIER; - a = a + 1.0; + a = a + Vec8d(1.0); BARRIER; return a; } @@ -849,7 +913,7 @@ static inline Vec8d & operator -= (Vec8d & a, Vec8d const & b) { static inline Vec8d operator -- (Vec8d & a, int) { BARRIER; Vec8d a0 = a; - a = a - 1.0; + a = a - Vec8d(1.0); BARRIER; return a0; } @@ -857,7 +921,7 @@ static inline Vec8d operator -- (Vec8d & a, int) { // prefix operator -- static inline Vec8d & operator -- (Vec8d & a) { BARRIER; - a = a - 1.0; + a = a - Vec8d(1.0); BARRIER; return a; } @@ -1215,6 +1279,11 @@ struct Vec8f }; +template<> +struct base_floatingpoint<Vec8f> +{ + using value = typename Vec8f::F; +}; /***************************************************************************** * @@ -1243,7 +1312,7 @@ static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) { static inline Vec8f operator ++ (Vec8f & a, int) { BARRIER; Vec8f a0 = a; - a = a + 1.0; + a = a + Vec8f(1.0); BARRIER; return a0; } @@ -1251,7 +1320,7 @@ static inline Vec8f operator ++ (Vec8f & a, int) { // prefix operator ++ static inline Vec8f & operator ++ (Vec8f & a) { BARRIER; - a = a + 1.0; + a = a + Vec8f(1.0); BARRIER; return a; } @@ -1288,7 +1357,7 @@ static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) { static inline Vec8f operator -- (Vec8f & a, int) { BARRIER; Vec8f a0 = a; - a = a - 1.0; + a = a - Vec8f(1.0); BARRIER; return a0; } @@ -1296,7 +1365,7 @@ static inline Vec8f operator -- (Vec8f & a, int) { // prefix operator -- static inline Vec8f & operator -- (Vec8f & a) { BARRIER; - a = a - 1.0; + a = a - Vec8f(1.0); BARRIER; return a; } diff --git a/dune/perftool/sumfact/horizontaladd.hh b/dune/perftool/sumfact/horizontaladd.hh new file mode 100644 index 0000000000000000000000000000000000000000..db7634f0e5507214318dabb719240ff3674808ed --- /dev/null +++ b/dune/perftool/sumfact/horizontaladd.hh @@ -0,0 +1,19 @@ +#ifndef DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH +#define DUNE_PERFTOOL_SUMFACT_HORIZONTALADD_HH + +#include<dune/perftool/common/vectorclass.hh> + + +template<class V> +typename base_floatingpoint<V>::value horizontal_add_lower(const V& x) +{ + return horizontal_add(x.get_low()); +} + +template<class V> +typename base_floatingpoint<V>::value horizontal_add_upper(const V& x) +{ + return horizontal_add(x.get_high()); +} + +#endif diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh index f73c6a2f717b243e32411a5feea948c7777ce430..d2ce09c39bf6ff5b87cc236cbf5d2a5bfcf850f1 100644 --- a/dune/perftool/sumfact/transposereg.hh +++ b/dune/perftool/sumfact/transposereg.hh @@ -66,6 +66,9 @@ void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3) a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3); } +/** TODO: Is this transpose using blend8d superior to the swap_halves + * version below using get_low/get_high? + */ void transpose_reg (Vec8d& a0, Vec8d& a1) { Vec8d b0, b1; @@ -75,6 +78,48 @@ void transpose_reg (Vec8d& a0, Vec8d& a1) a1 = b1; } +namespace impl +{ + /* (alow, aupp), (blow, bupp) -> (alow, blow), (aupp, bupp) */ + void swap_halves(Vec8d& a, Vec8d& b) + { + Vec4d tmp = a.get_high(); + a = Vec8d(a.get_low(), b.get_low()); + b = Vec8d(tmp, b.get_high()); + } + + /* A 4x8 transpose that behaves exactly like Vec4d's 4x4 transpose + * on the lower and upper halves of the Vec8d + */ + void _transpose4x8(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3) + { + Vec8d b0,b1,b2,b3; + b0 = blend8d<0,8,2,10,4,12,6,14>(a0,a1); + b1 = blend8d<1,9,3,11,5,13,7,15>(a0,a1); + b2 = blend8d<0,8,2,10,4,12,6,14>(a2,a3); + b3 = blend8d<1,9,3,11,5,13,7,15>(a2,a3); + a0 = blend8d<0,1,8,9,4,5,12,13>(b0,b2); + a1 = blend8d<0,1,8,9,4,5,12,13>(b1,b3); + a2 = blend8d<2,3,10,11,6,7,14,15>(b0,b2); + a3 = blend8d<2,3,10,11,6,7,14,15>(b1,b3); + } +} + +/* This is the 8x8 transpose of Vec8d's. It uses the same shuffling + * as Vec4d, but on the 4x4 subblocks. Afterwards, the off diagonal + * blocks are swapped. + */ +void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3, + Vec8d& a4, Vec8d& a5, Vec8d& a6, Vec8d& a7) +{ + impl::_transpose4x8(a0,a1,a2,a3); + impl::_transpose4x8(a4,a5,a6,a7); + impl::swap_halves(a0,a4); + impl::swap_halves(a1,a5); + impl::swap_halves(a2,a6); + impl::swap_halves(a3,a7); +} + #endif #endif diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py index c8c085c178d7dc2224d1efd1d76efaacf9da5259..e541e71366371039157cb757201f710ffd0a19ca 100644 --- a/python/dune/perftool/generation/__init__.py +++ b/python/dune/perftool/generation/__init__.py @@ -43,6 +43,7 @@ from dune.perftool.generation.loopy import (barrier, kernel_cached, noop_instruction, silenced_warning, + subst_rule, temporary_variable, transform, valuearg, diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index a97df4744fd1e661554a6febb306a8f858589c31..df47d5e9fb1ac03d8ed6bde6ec1de203470f4d2f 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -172,8 +172,8 @@ def noop_instruction(**kwargs): context_tags="kernel", cache_key_generator=no_caching, ) -def transform(trafo, *args): - return (trafo, args) +def transform(trafo, *args, **kwargs): + return (trafo, args, kwargs) @generator_factory(item_tags=("instruction", "barrier"), @@ -216,3 +216,8 @@ def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwar globalarg(name, **kwargs) return name + + +@generator_factory(item_tags=("substrule",), context_tags="kernel") +def subst_rule(name, args, expr): + return lp.SubstitutionRule(name, args, expr) diff --git a/python/dune/perftool/loopy/transformations/vectorview.py b/python/dune/perftool/loopy/transformations/vectorview.py index 1160c8bda81f0f81f6a6c9c6b8c2db327b0fe79d..9b74e3d8ca67cc28584def2ec0055b5504febfc4 100644 --- a/python/dune/perftool/loopy/transformations/vectorview.py +++ b/python/dune/perftool/loopy/transformations/vectorview.py @@ -33,7 +33,9 @@ def add_vector_view(knl, tmpname, pad_to=None, flatview=False): # Add base storage to the original temporary! if not temp.base_storage: - temp = temp.copy(base_storage=bsname) + temp = temp.copy(base_storage=bsname, + _base_storage_access_may_be_aliasing=True, + ) temporaries[tmpname] = temp else: bsname = temp.base_storage @@ -77,6 +79,7 @@ def add_vector_view(knl, tmpname, pad_to=None, flatview=False): base_storage=bsname, dtype=dtype_floatingpoint(), scope=lp.temp_var_scope.PRIVATE, + _base_storage_access_may_be_aliasing=True, ) # Avoid that any of these temporaries are eliminated diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py index 143d9b6c0a0ef84f76a9fe39c3024ca4cec041b5..f76f7881cf76e4360d2b6b8c0e691cfcad1488c1 100644 --- a/python/dune/perftool/loopy/vcl.py +++ b/python/dune/perftool/loopy/vcl.py @@ -2,7 +2,7 @@ Our extensions to the loopy type system """ from dune.perftool.options import get_option -from dune.perftool.generation import function_mangler +from dune.perftool.generation import function_mangler, include_file import loopy as lp import numpy as np @@ -62,8 +62,10 @@ def get_vcl_typename(nptype, register_size=None, vector_width=None): class ExplicitVCLCast(lp.symbolic.FunctionIdentifier): - def __init__(self, nptype, vector_width): + def __init__(self, nptype, vector_width=None): self.nptype = nptype + if vector_width is None: + vector_width = get_vcl_type_size(nptype) self.vector_width = vector_width def __getinitargs__(self): @@ -74,8 +76,17 @@ class ExplicitVCLCast(lp.symbolic.FunctionIdentifier): return get_vcl_typename(self.nptype, vector_width=self.vector_width) +class VCLLowerUpperLoad(ExplicitVCLCast): + pass + + @function_mangler def vcl_cast_mangler(knl, func, arg_dtypes): + if isinstance(func, VCLLowerUpperLoad): + return lp.CallMangleInfo(func.name, + (lp.types.NumpyType(func.nptype),), + arg_dtypes) + if isinstance(func, ExplicitVCLCast): return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],)) @@ -92,7 +103,8 @@ def vcl_function_mangler(knl, func, arg_dtypes): vcl = lp.types.NumpyType(get_vcl_type(dtype)) return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl)) - if func == "horizontal_add": + if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"): dtype = arg_dtypes[0] vcl = lp.types.NumpyType(get_vcl_type(dtype)) - return lp.CallMangleInfo("horizontal_add", (lp.types.NumpyType(dtype.dtype),), (vcl,)) + include_file("dune/perftool/sumfact/horizontaladd.hh", filetag="operatorfile") + return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,)) diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 5041772548f6646a9c1de40f0d6e929078900faa..6c239ccd5af49e9c042528024a4e6fc5c13b4d0f 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -78,7 +78,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.") sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|fromlist") + vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization") diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 1c3668ed3bca803035369a7ff63ed5c75bc4ad7a..03496a5cfbc24b0191e29df5bf83bac13fa99bc9 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -541,19 +541,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): # Apply the transformations that were gathered during tree traversals for trafo in transformations: - kernel = trafo[0](kernel, *trafo[1]) - - # Precompute all the substrules - for sr in kernel.substitutions: - tmpname = "precompute_{}".format(sr) - kernel = lp.precompute(kernel, - sr, - temporary_name=tmpname, - ) - # Vectorization strategies are actually very likely to eliminate the - # precomputation temporary. To avoid the temporary elimination warning - # we need to explicitly disable it. - kernel = kernel.copy(silenced_warnings=kernel.silenced_warnings + ["temp_to_write({})".format(tmpname)]) + kernel = trafo[0](kernel, *trafo[1], **trafo[2]) from dune.perftool.loopy import heuristic_duplication kernel = heuristic_duplication(kernel) diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 9566efb71e358dcaf346bd3a24b94858e3a56f26..02eb474754f1233e8cfbe86f57c474db4d2ed9d2 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -9,12 +9,15 @@ from dune.perftool.generation import (backend, generator_factory, get_counted_variable, get_counter, + get_global_context_value, + globalarg, iname, instruction, post_include, kernel_cached, temporary_variable, transform, + valuearg ) from dune.perftool.options import (get_form_option, get_option, @@ -26,15 +29,16 @@ from dune.perftool.pdelab.localoperator import determine_accumulation_space from dune.perftool.pdelab.restriction import restricted_name from dune.perftool.pdelab.signatures import assembler_routine_name from dune.perftool.pdelab.geometry import world_dimension +from dune.perftool.pdelab.spaces import name_lfs from dune.perftool.sumfact.tabulation import (basis_functions_per_direction, construct_basis_matrix_sequence, ) from dune.perftool.sumfact.switch import (get_facedir, get_facemod, ) -from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase +from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelOutputBase from dune.perftool.ufl.modified_terminals import extract_modified_arguments -from dune.perftool.tools import get_pymbolic_basename +from dune.perftool.tools import get_pymbolic_basename, get_leaf from dune.perftool.error import PerftoolError from dune.perftool.sumfact.quadrature import quadrature_inames @@ -81,21 +85,153 @@ def accum_iname(element, bound, i): return sumfact_iname(bound, "accum{}".format(suffix)) -class AlreadyAssembledInput(SumfactKernelInputBase): - def __init__(self, index): - self.index = index - - def __eq__(self, other): - return type(self) == type(other) and self.index == other.index +class AccumulationOutput(SumfactKernelOutputBase, ImmutableRecord): + def __init__(self, + accumvar=None, + restriction=None, + test_element=None, + test_element_index=None, + trial_element=None, + trial_element_index=None, + ): + # TODO: Isnt accumvar superfluous in the presence of all the other infos? + ImmutableRecord.__init__(self, + accumvar=accumvar, + restriction=restriction, + test_element=test_element, + test_element_index=test_element_index, + trial_element=trial_element, + trial_element_index=trial_element_index, + ) def __repr__(self): - return "AlreadyAssembledInput({})".format(self.index) + return ImmutableRecord.__repr__(self) - def __hash__(self): - return hash(self.index) + @property + def within_inames(self): + if self.trial_element is None: + return () + else: + from dune.perftool.sumfact.basis import lfs_inames + return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction) + + def realize(self, sf, result, insn_dep, inames=None, additional_inames=()): + trial_leaf_element = get_leaf(self.trial_element, self.trial_element_index) if self.trial_element is not None else None + + basis_size = tuple(mat.basis_size for mat in sf.matrix_sequence) + + if inames is None: + inames = tuple(accum_iname(trial_leaf_element, mat.rows, i) + for i, mat in enumerate(sf.matrix_sequence)) + + # Determine the expression to accumulate with. This depends on the vectorization strategy! + from dune.perftool.tools import maybe_wrap_subscript + result = maybe_wrap_subscript(result, tuple(prim.Variable(i) for i in inames)) + + # Collect the lfs and lfs indices for the accumulate call + restriction = (0, 0) if self.restriction is None else self.restriction + test_lfs = name_lfs(self.test_element, restriction[0], self.test_element_index) + valuearg(test_lfs, dtype=lp.types.NumpyType("str")) + test_lfs_index = flatten_index(tuple(prim.Variable(i) for i in inames), + basis_size, + order="f" + ) + + accum_args = [prim.Variable(test_lfs), test_lfs_index] + + # In the jacobian case, also determine the space for the ansatz space + if sf.within_inames: + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right now) + ansatz_lfs = name_lfs(self.trial_element, restriction[1], self.trial_element_index) + valuearg(ansatz_lfs, dtype=lp.types.NumpyType("str")) + from dune.perftool.sumfact.basis import _basis_functions_per_direction + ansatz_lfs_index = flatten_index(tuple(prim.Variable(sf.within_inames[i]) + for i in range(world_dimension())), + _basis_functions_per_direction(trial_leaf_element), + order="f" + ) + + accum_args.append(prim.Variable(ansatz_lfs)) + accum_args.append(ansatz_lfs_index) + + accum_args.append(result) + + if not get_form_option("fastdg"): + rank = 2 if self.within_inames else 1 + expr = prim.Call(PDELabAccumulationFunction(self.accumvar, rank), + tuple(accum_args) + ) + dep = instruction(assignees=(), + expression=expr, + forced_iname_deps=frozenset(inames + additional_inames + self.within_inames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + predicates=sf.predicates + ) - def __str__(self): - return "Input{}".format(self.index[0]) + return frozenset({dep}) + + def realize_direct(self, result, inames, shape, args): + ft = get_global_context_value("form_type") + + if self.test_element_index is None: + direct_output = "{}_access".format(self.accumvar) + else: + direct_output = "{}_access_comp{}".format(self.accumvar, self.test_element_index) + ftags = ",".join(["f"] * len(shape)) + + from dune.perftool.sumfact.realization import alias_data_array + if ft == 'residual' or ft == 'jacobian_apply': + globalarg(direct_output, + shape=shape, + dim_tags=ftags, + offset=_dof_offset(self.test_element, self.test_element_index), + ) + alias_data_array(direct_output, self.accumvar) + lhs = prim.Subscript(prim.Variable(direct_output), inames) + else: + assert ft == 'jacobian' + + direct_output = "{}x{}".format(direct_output, self.trial_element_index) + rowsize = sum(tuple(s for s in _local_sizes(self.trial_element))) + element = get_leaf(self.trial_element, self.trial_element_index) + other_shape = tuple(element.degree() + 1 for e in shape) + from pytools import product + manual_strides = tuple("stride:{}".format(rowsize * product(shape[:i])) for i in range(len(shape))) + dim_tags = "{},{}".format(ftags, ",".join(manual_strides)) + globalarg(direct_output, + shape=other_shape + shape, + offset=rowsize * _dof_offset(self.test_element, self.test_element_index) + _dof_offset(self.trial_element, self.trial_element_index), + dim_tags=dim_tags, + ) + alias_data_array(direct_output, self.accumvar) + # TODO: It is at least questionnable, whether using the *order* of the inames in here + # for indexing is a good idea. Then again, it is hard to find an alternative. + _ansatz_inames = tuple(prim.Variable(i) for i in self.within_inames) + lhs = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + inames) + + result = prim.Sum((lhs, result)) + return frozenset({instruction(assignee=lhs, expression=result, **args)}) + + +def _local_sizes(element): + from ufl import FiniteElement, MixedElement + if isinstance(element, MixedElement): + for subel in element.sub_elements(): + for s in _local_sizes(subel): + yield s + else: + assert isinstance(element, FiniteElement) + yield (element.degree() + 1)**element.cell().geometric_dimension() + + +def _dof_offset(element, component): + if component is None: + return 0 + else: + sizes = tuple(s for s in _local_sizes(element)) + return sum(sizes[0:component]) class SumfactAccumulationInfo(ImmutableRecord): @@ -266,16 +402,18 @@ def generate_accumulation_instruction(expr, visitor): if priority is None: priority = 3 + output = AccumulationOutput(accumvar=accumvar, + restriction=(test_info.restriction, trial_info.restriction), + test_element=test_info.element, + test_element_index=test_info.element_index, + trial_element=trial_info.element, + trial_element_index=trial_info.element_index, + ) + sf = SumfactKernel(matrix_sequence=matrix_sequence, - restriction=(test_info.restriction, trial_info.restriction), stage=3, position_priority=priority, - accumvar=accumvar, - test_element=test_info.element, - test_element_index=test_info.element_index, - trial_element=trial_info.element, - trial_element_index=trial_info.element_index, - input=AlreadyAssembledInput(index=(test_info.element_index,)), + output=output, predicates=predicates, ) @@ -342,56 +480,10 @@ def generate_accumulation_instruction(expr, visitor): depends_on=insn_dep, within_inames=frozenset(jacobian_inames))}) - inames = tuple(accum_iname(trial_leaf_element, mat.rows, i) - for i, mat in enumerate(vsf.matrix_sequence)) - - # Collect the lfs and lfs indices for the accumulate call - test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames), - basis_size, - order="f" - ) - - # In the jacobian case, also determine the space for the ansatz space - if jacobian_inames: - # TODO the next line should get its inames from - # elsewhere. This is *NOT* robust (but works right now) - from dune.perftool.sumfact.basis import _basis_functions_per_direction - ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i]) - for i in range(world_dimension())), - _basis_functions_per_direction(trial_leaf_element), - order="f" - ) - # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) - # Determine the expression to accumulate with. This depends on the vectorization strategy! - result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) - vecinames = () - - if vsf.vectorized: - iname = accum_iname(trial_leaf_element, vsf.vector_width, "vec") - vecinames = (iname,) - transform(lp.tag_inames, [(iname, "vec")]) - from dune.perftool.tools import maybe_wrap_subscript - result = prim.Call(prim.Variable("horizontal_add"), - (maybe_wrap_subscript(result, prim.Variable(iname)),), - ) - if not get_form_option("fastdg"): - rank = 2 if jacobian_inames else 1 - expr = prim.Call(PDELabAccumulationFunction(accumvar, rank), - (test_lfs.get_args() + - ansatz_lfs.get_args() + - (result,) - ) - ) - instruction(assignees=(), - expression=expr, - forced_iname_deps=frozenset(inames + vecinames + jacobian_inames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - predicates=predicates - ) + vsf.output.realize(vsf, result, insn_dep) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index a941b8e0d51e956010f7f11532157993766becf3..7c2740e13f195678ff234c9dbd946002f710eb83 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -11,6 +11,7 @@ from dune.perftool.generation import (backend, get_counted_variable, get_counter, get_global_context_value, + globalarg, iname, instruction, kernel_cached, @@ -70,7 +71,11 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): def __str__(self): return repr(self) - def realize(self, sf, index, insn_dep): + @property + def direct_input_is_possible(self): + return get_form_option("fastdg") + + def realize(self, sf, insn_dep, index=0): lfs = name_lfs(self.element, self.restriction, self.element_index) basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") container = self.coeff_func(self.restriction) @@ -85,18 +90,31 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): assignee = prim.Subscript(prim.Variable(name), (prim.Variable(basisiname),) + (index,)) - instruction(assignee=assignee, - expression=coeff, - depends_on=sf.insn_dep.union(insn_dep), - tags=frozenset({"sumfact_stage{}".format(sf.stage)}), - ) - - @property - def direct_input(self): - if get_form_option("fastdg"): - return self.coeff_func(self.restriction) - else: - return None + insn = instruction(assignee=assignee, + expression=coeff, + depends_on=sf.insn_dep.union(insn_dep), + tags=frozenset({"sumfact_stage{}".format(sf.stage)}), + ) + + return insn_dep.union(frozenset({insn})) + + def realize_direct(self, shape, inames): + arg = "{}_access{}".format(self.coeff_func(self.restriction), + "_comp{}".format(self.element_index) if self.element_index else "" + ) + + from dune.perftool.sumfact.accumulation import _dof_offset + from dune.perftool.sumfact.realization import alias_data_array + globalarg(arg, + shape=shape, + dim_tags=",".join("f" * len(shape)), + offset=_dof_offset(self.element, self.element_index), + ) + + func = self.coeff_func(self.restriction) + alias_data_array(arg, func) + + return prim.Subscript(prim.Variable(arg), inames) def _basis_functions_per_direction(element): diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index b0e85d43bdaa27409af81596ecf3274b6dbabca7..d4aea877bdfd46157f3447baaf02869a2dfe6131 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -75,22 +75,8 @@ def _realize_sum_factorization_kernel(sf): ), })) - direct_input = sf.input.direct_input - - # Set up the input for stage 1 - if direct_input is None: - if sf.vectorized: - for i, inputsf in enumerate(sf.kernels): - inputsf.input.realize(sf, i, inputsf.insn_dep.union(insn_dep)) - else: - sf.input.realize(sf, 0, insn_dep) - - insn_dep = insn_dep.union(frozenset({lp.match.Writes("input_{}".format(sf.buffer))})) - else: - if sf.input.element_index is None: - direct_input_arg = "{}_access".format(direct_input) - else: - direct_input_arg = "{}_access_comp{}".format(direct_input, sf.input.element_index) + if not sf.input.direct_input_is_possible: + insn_dep = insn_dep.union(sf.input.realize(sf, insn_dep)) # Prepare some dim_tags/shapes for later use ftags = ",".join(["f"] * sf.length) @@ -143,24 +129,12 @@ def _realize_sum_factorization_kernel(sf): # * a global data structure (if FastDGGridOperator is in use) # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator) input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) - if l == 0 and direct_input is not None: + if l == 0 and sf.input.direct_input_is_possible: # See comment below input_inames = permute_backward(input_inames, perm) inp_shape = permute_backward(inp_shape, perm) - globalarg(direct_input_arg, - shape=inp_shape, - dim_tags=novec_ftags, - offset=_dof_offset(sf.input.element, sf.input.element_index), - ) - alias_data_array(direct_input_arg, direct_input) - if matrix.vectorized: - input_summand = prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=sf.vector_width), - (prim.Subscript(prim.Variable(direct_input_arg), - input_inames),)) - else: - input_summand = prim.Subscript(prim.Variable(direct_input_arg), - input_inames + vec_iname) + input_summand = sf.input.realize_direct(inp_shape, input_inames) else: # If we did permute the order of a matrices above we also # permuted the order of out_inames. Unfortunately the @@ -215,72 +189,33 @@ def _realize_sum_factorization_kernel(sf): if l == len(matrix_sequence) - 1: output_inames = permute_backward(output_inames, perm) - # In case of direct output we directly accumulate the result - # of the Sumfactorization into some global data structure. - if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3: - ft = get_global_context_value("form_type") - if sf.test_element_index is None: - direct_output = "{}_access".format(sf.accumvar) - else: - direct_output = "{}_access_comp{}".format(sf.accumvar, sf.test_element_index) - if ft == 'residual' or ft == 'jacobian_apply': - globalarg(direct_output, - shape=output_shape, - dim_tags=novec_ftags, - offset=_dof_offset(sf.test_element, sf.test_element_index), - ) - alias_data_array(direct_output, sf.accumvar) - - assignee = prim.Subscript(prim.Variable(direct_output), output_inames) - else: - assert ft == 'jacobian' - - direct_output = "{}x{}".format(direct_output, sf.trial_element_index) - rowsize = sum(tuple(s for s in _local_sizes(sf.trial_element))) - element = sf.trial_element - if isinstance(element, MixedElement): - element = element.extract_component(sf.trial_element_index)[1] - other_shape = tuple(element.degree() + 1 for e in range(sf.length)) - from pytools import product - manual_strides = tuple("stride:{}".format(rowsize * product(output_shape[:i])) for i in range(sf.length)) - dim_tags = "{},{}".format(novec_ftags, ",".join(manual_strides)) - globalarg(direct_output, - shape=other_shape + output_shape, - offset=rowsize * _dof_offset(sf.test_element, sf.test_element_index) + _dof_offset(sf.trial_element, sf.trial_element_index), - dim_tags=dim_tags, - ) - alias_data_array(direct_output, sf.accumvar) - # TODO: It is at least questionnable, whether using the *order* of the inames in here - # for indexing is a good idea. Then again, it is hard to find an alternative. - _ansatz_inames = tuple(prim.Variable(i) for i in sf.within_inames) - assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames) - - # In case of vectorization we need to apply a horizontal add - if matrix.vectorized: - matprod = prim.Call(prim.Variable("horizontal_add"), - (matprod,)) - - # We need to accumulate - matprod = prim.Sum((assignee, matprod)) - else: - assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) - tag = "sumfact_stage{}".format(sf.stage) if sf.stage == 3: tag = "{}_{}".format(tag, "_".join(sf.within_inames)) - # Issue the reduction instruction that implements the multiplication - # at the same time store the instruction ID for the next instruction to depend on - insn_dep = frozenset({instruction(assignee=assignee, - expression=matprod, - forced_iname_deps=frozenset([iname for iname in out_inames]).union(frozenset(sf.within_inames)), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - tags=frozenset({tag}), - predicates=sf.predicates, - groups=frozenset({sf.group_name}), - ) - }) + # Collect the key word arguments for the loopy instruction + insn_args = {"forced_iname_deps": frozenset([i for i in out_inames]).union(frozenset(sf.within_inames)), + "forced_iname_deps_is_final": True, + "depends_on": insn_dep, + "tags": frozenset({tag}), + "predicates": sf.predicates, + "groups": frozenset({sf.group_name}), + } + + # In case of direct output we directly accumulate the result + # of the Sumfactorization into some global data structure. + if l == len(matrix_sequence) - 1 and get_form_option('fastdg') and sf.stage == 3: + if sf.vectorized: + insn_args["forced_iname_deps"] = insn_args["forced_iname_deps"].union(frozenset({vec_iname[0].name})) + insn_dep = sf.output.realize_direct(matprod, output_inames, out_shape, insn_args) + else: + # Issue the reduction instruction that implements the multiplication + # at the same time store the instruction ID for the next instruction to depend on + insn_dep = frozenset({instruction(assignee=prim.Subscript(prim.Variable(out), output_inames + vec_iname), + expression=matprod, + **insn_args + ) + }) # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: @@ -301,22 +236,3 @@ def _realize_sum_factorization_kernel(sf): silenced_warning('read_no_write({})'.format(out)) return lp.TaggedVariable(out, sf.tag), insn_dep - - -def _local_sizes(element): - from ufl import FiniteElement, MixedElement - if isinstance(element, MixedElement): - for subel in element.sub_elements(): - for s in _local_sizes(subel): - yield s - else: - assert isinstance(element, FiniteElement) - yield (element.degree() + 1)**element.cell().geometric_dimension() - - -def _dof_offset(element, component): - if component is None: - return 0 - else: - sizes = tuple(s for s in _local_sizes(element)) - return sum(sizes[0:component]) diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index babb432bc324f6ce08cec9f931df7ba8191181fa..db93c6cf0145007e13dc82bbb7ff817c0b54e060 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -1,10 +1,16 @@ """ A pymbolic node representing a sum factorization kernel """ from dune.perftool.options import get_option -from dune.perftool.generation import get_counted_variable +from dune.perftool.generation import (get_counted_variable, + subst_rule, + transform, + ) from dune.perftool.pdelab.geometry import local_dimension, world_dimension from dune.perftool.sumfact.quadrature import quadrature_inames from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray +from dune.perftool.loopy.target import dtype_floatingpoint +from dune.perftool.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad +from dune.perftool.tools import get_leaf, maybe_wrap_subscript from pytools import ImmutableRecord, product @@ -18,11 +24,133 @@ import inspect class SumfactKernelInputBase(object): @property - def direct_input(self): - return None + def direct_input_is_possible(self): + return False + + def realize(self, sf, dep, index=0): + return dep + + def realize_direct(self, inames): + raise NotImplementedError + + def __repr__(self): + return "SumfactKernelInputBase()" + + +class VectorSumfactKernelInput(SumfactKernelInputBase): + def __init__(self, inputs): + assert(isinstance(inputs, tuple)) + self.inputs = inputs + + def __repr__(self): + return "_".join(repr(i) for i in self.inputs) + + @property + def direct_input_is_possible(self): + return all(i.direct_input_is_possible for i in self.inputs) + + def realize(self, sf, dep): + for i, inp in enumerate(self.inputs): + dep = dep.union(inp.realize(sf, dep, index=i)) + return dep + + def realize_direct(self, shape, inames): + # Check whether the input exhibits a favorable structure + # (whether we can broadcast scalar values into SIMD registers) + total = set(self.inputs) + lower = set(self.inputs[:len(self.inputs) // 2]) + upper = set(self.inputs[len(self.inputs) // 2:]) + + if len(total) == 1: + # All input coefficients use the exact same input coefficient. + # We implement this by broadcasting it into a SIMD register + return prim.Call(ExplicitVCLCast(dtype_floatingpoint()), + (self.inputs[0].realize_direct(shape, inames),) + ) + elif len(total) == 2 and len(lower) == 1 and len(upper) == 1: + # The lower and the upper part of the SIMD register use + # the same input coefficient, we combine the SIMD register + # from two shorter SIMD types + return prim.Call(VCLLowerUpperLoad(dtype_floatingpoint()), + (self.inputs[0].realize_direct(shape, inames), + self.inputs[len(self.inputs) // 2].realize_direct(shape, inames), + ) + ) + else: + # The input does not exhibit a broadcastable structure, we + # need to load scalars into the SIMD vector. + raise NotImplementedError("SIMD loads from scalars not implemented!") + + +class SumfactKernelOutputBase(object): + @property + def within_inames(self): + return () - def realize(self, sf, i, dep): - pass + def realize(self, sf, result, insn_dep): + return dep + + def realize_direct(self, result, inames, shape, args): + raise NotImplementedError + + def __repr__(self): + return "SumfactKernelOutputBase()" + + +class VectorSumfactKernelOutput(SumfactKernelOutputBase): + def __init__(self, outputs): + self.outputs = outputs + + def __repr__(self): + return "_".join(repr(o) for o in self.outputs) + + def _add_hadd(self, o, result): + hadd_function = "horizontal_add" + if len(set(self.outputs)) > 1: + pos = self.outputs.index(o) + if pos == 0: + hadd_function = "horizontal_add_lower" + else: + hadd_function = "horizontal_add_upper" + + return prim.Call(prim.Variable(hadd_function), (result,)) + + def realize(self, sf, result, insn_dep): + outputs = set(self.outputs) + + trial_element, = set(o.trial_element for o in self.outputs) + trial_element_index, = set(o.trial_element_index for o in self.outputs) + from dune.perftool.sumfact.accumulation import accum_iname + element = get_leaf(trial_element, trial_element_index) if trial_element is not None else None + inames = tuple(accum_iname(element, mat.rows, i) + for i, mat in enumerate(sf.matrix_sequence)) + veciname = accum_iname(element, sf.vector_width // len(outputs), "vec") + transform(lp.tag_inames, [(veciname, "vec")]) + + deps = frozenset() + for o in outputs: + hadd_result = self._add_hadd(o, maybe_wrap_subscript(result, tuple(prim.Variable(iname) for iname in inames + (veciname,)))) + deps = deps.union(o.realize(sf, hadd_result, insn_dep, inames=inames, additional_inames=(veciname,))) + + return deps + + def realize_direct(self, result, inames, shape, args): + outputs = set(self.outputs) + + # If multiple horizontal_add's are to be performed with 'result' + # we need to precompute the result! + if len(outputs) > 1: + substname = "haddsubst_{}".format("_".join([i.name for i in inames])) + subst_rule(substname, (), result) + result = prim.Call(prim.Variable(substname), ()) + transform(lp.precompute, substname, precompute_outer_inames=args["forced_iname_deps"]) + + deps = frozenset() + for o in outputs: + hadd_result = self._add_hadd(o, result) + deps = deps.union(o.realize_direct(hadd_result, inames, shape, args)) + + return deps class SumfactKernelBase(object): @@ -35,14 +163,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): buffer=None, stage=1, position_priority=None, - restriction=None, insn_dep=frozenset(), - input=None, - accumvar=None, - test_element=None, - test_element_index=None, - trial_element=None, - trial_element_index=None, + input=SumfactKernelInputBase(), + output=SumfactKernelOutputBase(), predicates=frozenset(), ): """Create a sum factorization kernel @@ -118,11 +241,8 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): assert stage in (1, 3) - if stage == 1: - assert isinstance(input, SumfactKernelInputBase) - - if stage == 3: - assert isinstance(restriction, tuple) + assert isinstance(input, SumfactKernelInputBase) + assert isinstance(output, SumfactKernelOutputBase) assert isinstance(insn_dep, frozenset) @@ -159,31 +279,54 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # Watch out for the documentation to see which key is used unter what circumstances # + @property + def parallel_key(self): + """ A key that identifies parallellizable kernels. """ + return tuple(m.basis_size for m in self.matrix_sequence) + (self.stage, self.buffer) + @property def cache_key(self): """ The cache key that can be used in generation magic Any two sum factorization kernels having the same cache_key - are realized simulatenously! + are realized simultaneously! """ - return (self.matrix_sequence, self.restriction, self.stage, self.buffer, self.test_element_index) + if self.buffer is None: + # During dry run, we return something unique to this kernel + return repr(self) + else: + # Later we identify parallely implemented kernels by the assigned buffer + return self.buffer @property - def input_key(self): + def inout_key(self): """ A cache key for the input coefficients Any two sum factorization kernels having the same input_key - work on the same input coefficient (and are suitable for simultaneous - treatment because of that) + work on the same input coefficient (stage 1) or accumulate + into the same thing (stage 3) """ - return (self.input, self.restriction, self.accumvar, self.trial_element_index) + return (repr(self.input), repr(self.output)) @property def group_name(self): - return "sfgroup_{}_{}_{}_{}".format(self.input, self.restriction, self.accumvar, self.trial_element_index) + return "sfgroup_{}_{}".format(self.input, self.output) # # Some convenience methods to extract information about the sum factorization kernel # + def __lt__(self, other): + if self.parallel_key != other.parallel_key: + return self.parallel_key < other.parallel_key + if self.inout_key != other.inout_key: + return self.input_key < other.input_key + if self.position_priority == other.position_priority: + return repr(self) < repr(other) + if self.position_priority is None: + return False + if other.position_priority is None: + return True + return self.position_priority < other.position_priority + @property def length(self): """ The number of matrices to apply """ @@ -199,14 +342,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): @property def within_inames(self): - if self.trial_element is None: - return () - else: - from dune.perftool.sumfact.basis import lfs_inames - element = self.trial_element - if isinstance(element, MixedElement): - element = element.extract_component(self.trial_element_index)[1] - return lfs_inames(element, self.restriction) + return self.output.within_inames def vec_index(self, sf): """ Map an unvectorized sumfact kernel object to its position @@ -358,7 +494,6 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # Assert all the properties that need to be the same across all subkernels assert len(set(k.stage for k in kernels)) == 1 assert len(set(k.length for k in kernels)) == 1 - assert len(set(k.restriction for k in kernels)) == 1 assert len(set(k.within_inames for k in kernels)) == 1 assert len(set(k.predicates for k in kernels)) == 1 @@ -366,7 +501,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) for i in range(kernels[0].length): assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1 assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1 - assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].direction for k in kernels))) == 1 assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1 # Join the instruction dependencies of all subkernels @@ -437,36 +572,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) def within_inames(self): return self.kernels[0].within_inames - @property - def test_element(self): - return self.kernels[0].test_element - - @property - def test_element_index(self): - return self.kernels[0].test_element_index - - @property - def trial_element(self): - return self.kernels[0].trial_element - - @property - def trial_element_index(self): - return self.kernels[0].trial_element_index - @property def predicates(self): return self.kernels[0].predicates - @property - def input(self): - assert len(set(k.input for k in self.kernels)) == 1 - return self.kernels[0].input - - @property - def accumvar(self): - assert len(set(k.accumvar for k in self.kernels)) == 1 - return self.kernels[0].accumvar - @property def transposed(self): return self.kernels[0].transposed @@ -486,13 +595,21 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # Define the same properties the normal SumfactKernel defines # + @property + def input(self): + return VectorSumfactKernelInput(tuple(k.input for k in self.kernels)) + + @property + def output(self): + return VectorSumfactKernelOutput(tuple(k.output for k in self.kernels)) + @property def cache_key(self): return (tuple(k.cache_key for k in self.kernels), self.buffer) @property - def input_key(self): - return tuple(k.input_key for k in self.kernels) + def inout_key(self): + return tuple(k.inout_key for k in self.kernels) @property def group_name(self): @@ -507,10 +624,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) return True def horizontal_index(self, sf): - key = tuple(mat.derivative for mat in sf.matrix_sequence) for i, k in enumerate(self.kernels): - if tuple(mat.derivative for mat in k.matrix_sequence) == key: - return i + if sf.inout_key == k.inout_key: + if tuple(mat.derivative for mat in sf.matrix_sequence) == tuple(mat.derivative for mat in k.matrix_sequence): + return i + return 0 def _quadrature_index(self, sf, visitor): diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index bc671b700e989e694bed572241edc19879942ea7..daeabf9005cbff9c5917185b5711ceb6cb4342a3 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -62,14 +62,27 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): slice_index=slice_index, ) + @property + def _shortname(self): + infos = ["d{}".format(self.basis_size), + "q{}".format(self.quadrature_size)] + + if self.transpose: + infos.append("T") + + if self.derivative: + infos.append("dx") + + if self.face is not None: + infos.append("f{}".format(self.face)) + + if self.slice_size is not None: + infos.append("s{}".format(self.slice_index)) + + return "".join(infos) + def __str__(self): - return "{}{}A{}{}{}" \ - .format("face{}_".format(self.face) if self.face is not None else "", - "d" if self.derivative else "", - self.basis_size, - "T" if self.transpose else "", - "_slice{}".format(self.slice_index) if self.slice_size is not None else "", - ) + return "Theta_{}".format(self._shortname) @property def rows(self): @@ -96,14 +109,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): return size def pymbolic(self, indices): - name = "{}{}Theta{}{}_qp{}_dof{}" \ - .format("face{}_".format(self.face) if self.face is not None else "", - "d" if self.derivative else "", - "T" if self.transpose else "", - "_slice{}".format(self.slice_index) if self.slice_size is not None else "", - self.quadrature_size, - self.basis_size, - ) + name = str(self) define_theta(name, self) return prim.Subscript(prim.Variable(name), indices) @@ -131,7 +137,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): assert len(set(t.quadrature_size for t in tabs)) == 1 assert len(set(t.basis_size for t in tabs)) == 1 assert len(set(t.transpose for t in tabs)) == 1 - assert len(set(t.face for t in tabs)) == 1 + assert len(set(t.direction for t in tabs)) == 1 assert len(set(t.slice_size for t in tabs)) == 1 self.tabs = tabs @@ -140,11 +146,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): self.width = width def __str__(self): - abbrevs = tuple("{}A{}{}".format("d" if t.derivative else "", - self.basis_size, - "s{}".format(t.slice_index) if t.slice_size is not None else "") - for t in self.tabs) - return "_".join(abbrevs) + return "Theta{}".format("_".join((t._shortname for t in self.tabs))) @property def quadrature_size(self): @@ -196,15 +198,8 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): theta = self.tabs[0].pymbolic(indices[:-1]) return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=get_vcl_type_size(dtype_floatingpoint())), (theta,)) - abbrevs = tuple("{}x{}".format("d" if t.derivative else "", - "s{}".format(t.slice_index) if t.slice_size is not None else "") - for t in self.tabs) - name = "ThetaLarge{}{}_{}_qp{}_dof{}".format("face{}_".format(self.face) if self.face is not None else "", - "T" if self.transpose else "", - "_".join(abbrevs), - self.tabs[0].quadrature_size, - self.tabs[0].basis_size, - ) + name = str(self) + for i, tab in enumerate(self.tabs): define_theta(name, tab, additional_indices=(i,), width=self.width) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 690a1c0ba912913f068fe2f295dcbc4c498bdf1e..4f7abe67048b3016db2277153844757116996164 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -48,13 +48,6 @@ def attach_vectorization_info(sf): return _cache_vectorization_info(sf, None) -def position_penalty_factor(sf): - if isinstance(sf, SumfactKernel) or sf.vertical_width > 1: - return 1 - else: - return 1 + sum(abs(sf.kernels[i].position_priority - i) if sf.kernels[i].position_priority is not None else 0 for i in range(sf.length)) - - @backend(interface="vectorization_strategy", name="model") def costmodel(sf): # Penalize vertical vectorization @@ -66,13 +59,7 @@ def costmodel(sf): scalar_penalty = get_vcl_type_size(dtype_floatingpoint()) # Return total operations - return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty - - -@backend(interface="vectorization_strategy", name="fromlist") -def fromlist_costmodel(sf): - # The fromlist strategy needs to reuse the cost model! - return costmodel(sf) + return sf.operations * vertical_penalty * scalar_penalty @backend(interface="vectorization_strategy", name="explicit") @@ -90,7 +77,7 @@ def explicit_costfunction(sf): if sf.horizontal_width == horizontal and sf.vertical_width == vertical: # Penalize position mapping - return sf.operations * position_penalty_factor(sf) + return sf.operations else: return 1000000000000 @@ -172,59 +159,21 @@ def decide_vectorization_strategy(): logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" .format(len(active_sumfacts))) - # Find the best vectorization strategy by using a costmodel - width = get_vcl_type_size(dtype_floatingpoint()) - # - # Optimize over all the possible quadrature point tuples + # Find the best vectorization strategy by using a costmodel # - quad_points = [quadrature_points_per_direction()] - if get_form_option("vectorization_allow_quadrature_changes"): - sf = next(iter(active_sumfacts)) - depth = 1 - while depth <= width: - i = 0 if sf.matrix_sequence[0].face is None else 1 - quad = list(quadrature_points_per_direction()) - quad[i] = round_to_multiple(quad[i], depth) - quad_points.append(tuple(quad)) - depth = depth * 2 - quad_points = list(set(quad_points)) - - if get_form_option("vectorization_strategy") == "fromlist": - # This is a bit special and does not follow the minimization procedure at all - - def _choose_strategy_from_list(stage1_sumfacts): - strategy = 0 - for qp in quad_points: - for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp): - if strategy == int(get_form_option("vectorization_list_index")): - # Output the strategy and its cost into a separate file - if get_global_context_value("form_type") == "jacobian_apply": - with open("strategycosts.csv", "a") as f: - f.write("{} {}\n".format(strategy, strategy_cost((qp, strat)))) - return qp, strat - strategy = strategy + 1 - - raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_form_option("vectorization_list_index"))) - - s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1) - - total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points) - print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_form_option("vectorization_list_index")), - total)) - qp, sfdict = _choose_strategy_from_list(s1_sumfacts) - - keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1) - for key in keys: - key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key) - minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), - key=fixedqp_strategy_costfunction(qp)) - sfdict = add_to_frozendict(sfdict, minimum) - else: - # Find the minimum cost strategy between all the quadrature point tuples - optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points} - qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) - sfdict = optimal_strategies[qp] + # Note that this optimization procedure uses a hierarchic approach to bypass + # the problems of unfavorable complexity of the set of all possible vectorization + # opportunities. Optimizations are performed at different levels (you find these + # levels in the function names implementing them), where optimal solutions at a + # higher level are combined into lower level solutions or optima of optimal solutions + # at higher level are calculated: + # * Level 1: Finding an optimal quadrature tuple (by finding optimum of level 2 optima) + # * Level 2: Split by parallelizability and combine optima into optimal solution + # * Level 3: Optimize number of different inputs to consider + # * Level 4: Optimize horizontal/vertical/hybrid strategy + width = get_vcl_type_size(dtype_floatingpoint()) + qp, sfdict = level1_optimal_vectorization_strategy(active_sumfacts, width) set_quadrature_points(qp) @@ -239,88 +188,104 @@ def decide_vectorization_strategy(): _cache_vectorization_info(sf, sfdict[sf]) -def fixed_quadrature_optimal_vectorization(sumfacts, width, qp): - """ For a given quadrature point tuple, find the optimal strategy! +def level1_optimal_vectorization_strategy(sumfacts, width): + # Gather a list of possible quadrature point tuples + quad_points = [quadrature_points_per_direction()] + if get_form_option("vectorization_allow_quadrature_changes"): + sf = next(iter(sumfacts)) + depth = 1 + while depth <= width: + i = 0 if sf.matrix_sequence[0].face is None else 1 + quad = list(quadrature_points_per_direction()) + quad[i] = round_to_multiple(quad[i], depth) + quad_points.append(tuple(quad)) + depth = depth * 2 + quad_points = list(set(quad_points)) - In order to have this scale sufficiently, we cannot simply list all vectorization - opportunities and score them individually, but we need to do a divide and conquer - approach. - """ - # Find the sets of simultaneously realizable kernels (thats an equivalence relation) - keys = frozenset(sf.input_key for sf in sumfacts) + # Find the minimum cost strategy between all the quadrature point tuples + optimal_strategies = {qp: level2_optimal_vectorization_strategy(sumfacts, width, qp) for qp in quad_points} + qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) + + return qp, optimal_strategies[qp] + + +def level2_optimal_vectorization_strategy(sumfacts, width, qp): + # Find the sets of simultaneously realizable kernels + keys = frozenset(sf.parallel_key for sf in sumfacts) # Find minimums for each of these sets sfdict = frozendict() for key in keys: - key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key) - minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp), - key=fixedqp_strategy_costfunction(qp)) - sfdict = add_to_frozendict(sfdict, minimum) + key_sumfacts = frozenset(sf for sf in sumfacts if sf.parallel_key == key) + key_strategy = min(level2_optimal_vectorization_strategy_generator(key_sumfacts, width, qp), + key=fixedqp_strategy_costfunction(qp)) + sfdict = add_to_frozendict(sfdict, key_strategy) return sfdict -def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()): +def level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): + for opp in _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp): + # Add non-vectorized implementation information to all kernels that are not present in + # the optimal strategy + yield add_to_frozendict(opp, + {sf: sf.copy(buffer=get_counted_variable("buffer")) for sf in sumfacts if sf not in opp}) + + +def _level2_optimal_vectorization_strategy_generator(sumfacts, width, qp, already=frozendict()): if len(sumfacts) == 0: - # We have gone into recursion deep enough to have all sum factorization nodes - # assigned their vectorized counterpart. We can yield the result now! yield already return - # Ensure a deterministic order of the given sumfact kernels. This is necessary for the - # fromlist strategy to pick correct strategies across different program runs - sumfacts = sorted(sumfacts, key=lambda sf: repr(sf)) - - # Otherwise we pick a random sum factorization kernel and construct all the vectorization - # opportunities realizing this particular kernel and go into recursion. - sf_to_decide = sumfacts[0] - - # Have "unvectorized" as an option, although it is not good - for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]), - width, - qp, - add_to_frozendict(already, - {sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))} - ), - ): - yield opp - - horizontal = 1 - while horizontal <= width: - # Iterate over the possible combinations of sum factorization kernels - # taking into account all the permutations of kernels. This also includes - # combinations which use a padding of 1 - but only for pure horizontality. - generators = [it.permutations(sumfacts, horizontal)] - if horizontal >= 4: - generators.append(it.permutations(sumfacts, horizontal - 1)) - for combo in it.chain(*generators): - # The chosen kernels must be part of the kernels for recursion - # to work correctly - if sf_to_decide not in combo: - continue + # We store the information whether a vectorization opportunity has been yielded from this + # generator to yield an incomplete strategy if not (which is then completed with unvectorized + # kernel implementations) + yielded = False + + # Find the number of input coefficients we can work on + keys = frozenset(sf.inout_key for sf in sumfacts) + + inoutkey_sumfacts = [tuple(sorted(filter(lambda sf: sf.inout_key == key, sumfacts))) for key in sorted(keys)] + + for parallel in (1, 2): + if parallel > len(keys): + continue + + horizontal = 1 + while horizontal <= width // parallel: + combo = sum((inoutkey_sumfacts[part][:horizontal] for part in range(parallel)), ()) + + vecdict = get_vectorization_dict(combo, width // (horizontal * parallel), horizontal * parallel, qp) + horizontal *= 2 - # Set up the vectorization dict for this combo - vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp) if vecdict is None: # This particular choice was rejected for some reason. # Possible reasons: # * the quadrature point tuple not being suitable # for this vectorization strategy + # * there are not enough horizontal kernels continue # Go into recursion to also vectorize all kernels not in this combo - for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo), - width, - qp, - add_to_frozendict(already, vecdict), - ): + for opp in _level2_optimal_vectorization_strategy_generator(list_diff(sumfacts, combo), + width, + qp, + add_to_frozendict(already, vecdict), + ): + yielded = True yield opp - horizontal = horizontal * 2 + # If we did not yield on this recursion level, yield what we got so far + if not yielded: + yield already def get_vectorization_dict(sumfacts, vertical, horizontal, qp): + # Discard opportunities that do not contain enough horizontal kernels + if len(sumfacts) not in (horizontal, horizontal - 1): + return None + # Enhance the list of sumfact nodes by adding vertical splittings kernels = [] for sf in sumfacts: diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py index e302f28c0e63d03d013c181dc0eb4f831ec1648c..b29ebe221387a5af529f0dec29592b4de8dc762a 100644 --- a/python/dune/perftool/tools.py +++ b/python/dune/perftool/tools.py @@ -82,3 +82,14 @@ def list_diff(l1, l2): if item not in l2: l.append(item) return l + + +def get_leaf(element, index): + """ return a leaf element if the given element is a MixedElement """ + leaf_element = element + from ufl import MixedElement + if isinstance(element, MixedElement): + assert isinstance(index, int) + leaf_element = element.extract_component(index)[1] + + return leaf_element