diff --git a/python/dune/perftool/error.py b/python/dune/perftool/error.py
index 3f99a83abeb947890b5c7fb36ffd2663514796fb..4d428b41b516845c3eb18803539edbf293f44cf1 100644
--- a/python/dune/perftool/error.py
+++ b/python/dune/perftool/error.py
@@ -15,3 +15,7 @@ class PerftoolCodegenError(PerftoolError):
 
 class PerftoolLoopyError(PerftoolError):
     pass
+
+
+class PerftoolVectorizationError(PerftoolCodegenError):
+    pass
diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py
index 5611bbedf4eab089eceb8740487c6f60236f543e..adf7e8ce483fd36bdaa44a90fa8b5f54bb25e442 100644
--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -57,17 +57,19 @@ class PerftoolOptionsArray(ImmutableRecord):
     fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
     sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
     vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
-    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model")
+    vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|fromlist")
     vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
     vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
     vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
     vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
+    vectorization_list_index = PerftoolOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).")
     turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
     architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
     grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
     simplify = PerftoolOption(default=False, helpstr="Whether to simplify expressions using sympy")
     precision_bits = PerftoolOption(default=64, helpstr="The number of bits for the floating point type")
     assure_statement_ordering = PerftoolOption(default=False, helpstr="Whether special care should be taken for a good statement ordering in sumfact kernels, runs into a loopy scheduler performance bug, but is necessary for production.")
+    generate_jacobians = PerftoolOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.")
 
     # Arguments that are mainly to be set by logic depending on other options
     max_vector_width = PerftoolOption(default=256, helpstr=None)
@@ -127,6 +129,9 @@ def process_options(opt):
     if opt.sumfact:
         opt = opt.copy(unroll_dimension_loops=True)
 
+    if opt.numerical_jacobian:
+        opt = opt.copy(generate_jacobians=False)
+
     if opt.overlapping:
         opt = opt.copy(parallel=True)
 
diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index 23c92da10c0945b6e4196b1ccde44f0cdbdb5c83..76d6b0c162e2b5677905ac6c79cfefc8f9f3e01b 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -821,22 +821,23 @@ def generate_localoperator_kernels(formdata, data):
         jacform = preprocess_form(jacform).preprocessed_form
 
         with global_context(form_type="jacobian"):
-            for measure in set(i.integral_type() for i in jacform.integrals()):
-                logger.info("generate_localoperator_kernels: measure {}".format(measure))
-                with global_context(integral_type=measure):
-                    with global_context(kernel=assembler_routine_name()):
-                        kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jacform.integrals_by_type(measure))]
-                operator_kernels[(measure, 'jacobian')] = kernel
-
-            # Generate dummy functions for those kernels, that vanished in the differentiation process
-            # We *could* solve this problem by using lambda_* terms but we do not really want that, so
-            # we use empty jacobian assembly methods instead
-            alpha_measures = set(i.integral_type() for i in form.integrals())
-            jacobian_measures = set(i.integral_type() for i in jacform.integrals())
-            for it in alpha_measures - jacobian_measures:
-                with global_context(integral_type=it):
-                    from dune.perftool.pdelab.signatures import assembly_routine_signature
-                    operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
+            if get_option("generate_jacobians"):
+                for measure in set(i.integral_type() for i in jacform.integrals()):
+                    logger.info("generate_localoperator_kernels: measure {}".format(measure))
+                    with global_context(integral_type=measure):
+                        with global_context(kernel=assembler_routine_name()):
+                            kernel = [k for k in get_backend(interface="generate_kernels_per_integral")(jacform.integrals_by_type(measure))]
+                    operator_kernels[(measure, 'jacobian')] = kernel
+
+                # Generate dummy functions for those kernels, that vanished in the differentiation process
+                # We *could* solve this problem by using lambda_* terms but we do not really want that, so
+                # we use empty jacobian assembly methods instead
+                alpha_measures = set(i.integral_type() for i in form.integrals())
+                jacobian_measures = set(i.integral_type() for i in jacform.integrals())
+                for it in alpha_measures - jacobian_measures:
+                    with global_context(integral_type=it):
+                        from dune.perftool.pdelab.signatures import assembly_routine_signature
+                        operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)]
 
         # Jacobian apply methods for matrix-free computations
         if get_option("matrix_free"):
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index 46563ee1f6805037b13b13136e16f2fcc1d23ff0..19278ef7ff5fe6cfbefdd9534f30bbe4ee5ea345 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -64,9 +64,12 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord):
                                  restriction=restriction,
                                  )
 
-    def __str__(self):
+    def __repr__(self):
         return "{}_{}".format(self.coeff_func(self.restriction), self.element_index)
 
+    def __str__(self):
+        return repr(self)
+
     def realize(self, sf, index, insn_dep):
         lfs = name_lfs(self.element, self.restriction, self.element_index)
         basisiname = sumfact_iname(name_lfs_bound(lfs), "basis")
diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py
index cbe8fbbc374c6ada8e3c63a1a81adb5a3c742ac6..4009a7bf7a2bb5356f53129f28c80420ff5a8c1a 100644
--- a/python/dune/perftool/sumfact/tabulation.py
+++ b/python/dune/perftool/sumfact/tabulation.py
@@ -22,7 +22,7 @@ from dune.perftool.generation import (class_member,
                                      )
 from dune.perftool.loopy.buffer import get_buffer_temporary
 from dune.perftool.loopy.target import dtype_floatingpoint
-from dune.perftool.loopy.vcl import ExplicitVCLCast
+from dune.perftool.loopy.vcl import ExplicitVCLCast, get_vcl_type_size
 from dune.perftool.pdelab.localoperator import (name_domain_field,
                                                 lop_template_range_field,
                                                 )
@@ -196,7 +196,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase):
         # Check whether we can realize this by broadcasting the values of a simple tabulation
         if len(set(self.tabs)) == 1:
             theta = self.tabs[0].pymbolic(indices[:-1])
-            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=len(self.tabs)), (theta,))
+            return prim.Call(ExplicitVCLCast(dtype_floatingpoint(), vector_width=get_vcl_type_size(dtype_floatingpoint())), (theta,))
 
         abbrevs = tuple("{}x{}".format("d" if t.derivative else "",
                                        "s{}".format(t.slice_index) if t.slice_size is not None else "")
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index f3fb96b99b0d7cb0a633e6d499f6e362db5da207..4dab904154d80072585453383e482725bfa5da97 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -18,9 +18,9 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
                                               quadrature_points_per_direction,
                                               set_quadrature_points,
                                               )
-from dune.perftool.error import PerftoolError
+from dune.perftool.error import PerftoolVectorizationError
 from dune.perftool.options import get_option
-from dune.perftool.tools import add_to_frozendict, round_to_multiple
+from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff
 
 from pytools import product
 from frozendict import frozendict
@@ -33,7 +33,7 @@ import math
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
 def _cache_vectorization_info(old, new):
     if new is None:
-        raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
+        raise PerftoolVectorizationError("Vectorization info for sum factorization kernel was not gathered correctly!")
     return new
 
 
@@ -69,6 +69,12 @@ def costmodel(sf):
     return sf.operations * position_penalty_factor(sf) * vertical_penalty * scalar_penalty
 
 
+@backend(interface="vectorization_strategy", name="fromlist")
+def fromlist_costmodel(sf):
+    # The fromlist strategy needs to reuse the cost model!
+    return costmodel(sf)
+
+
 @backend(interface="vectorization_strategy", name="explicit")
 def explicit_costfunction(sf):
     # Read the explicitly set values for horizontal and vertical vectorization
@@ -139,7 +145,7 @@ def decide_vectorization_strategy():
     from dune.perftool.generation import retrieve_cache_items
     all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
 
-    # Stage 1 sumfactorizations that were actually used
+    # Stage 1 sum factorizations that were actually used
     basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
 
     # This means we can have sum factorizations that will not get used
@@ -164,7 +170,6 @@ def decide_vectorization_strategy():
     # Optimize over all the possible quadrature point tuples
     #
     quad_points = [quadrature_points_per_direction()]
-
     if get_option("vectorization_allow_quadrature_changes"):
         sf = next(iter(active_sumfacts))
         depth = 1
@@ -176,10 +181,43 @@ def decide_vectorization_strategy():
             depth = depth * 2
         quad_points = list(set(quad_points))
 
-    # Find the minimum cost strategy between all the quadrature point tuples
-    optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
-    qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
-    sfdict = optimal_strategies[qp]
+    if get_option("vectorization_strategy") == "fromlist":
+        # This is a bit special and does not follow the minimization procedure at all
+
+        def _choose_strategy_from_list(stage1_sumfacts):
+            strategy = 0
+            for qp in quad_points:
+                for strat in fixed_quad_vectorization_opportunity_generator(frozenset(stage1_sumfacts), width, qp):
+                    if strategy == int(get_option("vectorization_list_index")):
+                        set_quadrature_points(qp)
+                        # Output the strategy and its cost into a separate file
+                        if get_global_context_value("form_type") == "jacobian_apply":
+                            with open("strategycosts.csv", "a") as f:
+                                f.write("{} {}\n".format(strategy, strategy_cost(strat)))
+                        return qp, strat
+                    strategy = strategy + 1
+
+            raise PerftoolVectorizationError("Specified vectorization list index '{}' was too high!".format(get_option("vectorization_list_index")))
+
+        s1_sumfacts = frozenset(sf for sf in active_sumfacts if sf.stage == 1)
+
+        total = sum(len([s for s in fixed_quad_vectorization_opportunity_generator(frozenset(s1_sumfacts), width, qp)]) for qp in quad_points)
+        print("'fromlist' vectorization is attempting to pick #{} of {} strategies...".format(int(get_option("vectorization_list_index")),
+                                                                                              total))
+        qp, sfdict = _choose_strategy_from_list(s1_sumfacts)
+
+        keys = frozenset(sf.input_key for sf in active_sumfacts if sf.stage != 1)
+        for key in keys:
+            key_sumfacts = frozenset(sf for sf in active_sumfacts if sf.input_key == key)
+            minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
+                          key=strategy_cost)
+            sfdict = add_to_frozendict(sfdict, minimum)
+    else:
+        # Find the minimum cost strategy between all the quadrature point tuples
+        optimal_strategies = {qp: fixed_quadrature_optimal_vectorization(active_sumfacts, width, qp) for qp in quad_points}
+        qp = min(optimal_strategies, key=lambda qp: strategy_cost(optimal_strategies[qp]))
+        sfdict = optimal_strategies[qp]
+
     set_quadrature_points(qp)
 
     logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
@@ -207,6 +245,7 @@ def fixed_quadrature_optimal_vectorization(sumfacts, width, qp):
 
     # Find minimums for each of these sets
     sfdict = frozendict()
+
     for key in keys:
         key_sumfacts = frozenset(sf for sf in sumfacts if sf.input_key == key)
         minimum = min(fixed_quad_vectorization_opportunity_generator(key_sumfacts, width, qp),
@@ -223,12 +262,16 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
         yield already
         return
 
+    # Ensure a deterministic order of the given sumfact kernels. This is necessary for the
+    # fromlist strategy to pick correct strategies across different program runs
+    sumfacts = sorted(sumfacts, key=lambda sf: repr(sf))
+
     # Otherwise we pick a random sum factorization kernel and construct all the vectorization
     # opportunities realizing this particular kernel and go into recursion.
-    sf_to_decide = next(iter(sumfacts))
+    sf_to_decide = sumfacts[0]
 
     # Have "unvectorized" as an option, although it is not good
-    for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
+    for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, [sf_to_decide]),
                                                               width,
                                                               qp,
                                                               add_to_frozendict(already,
@@ -261,7 +304,7 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
                 continue
 
             # Go into recursion to also vectorize all kernels not in this combo
-            for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
+            for opp in fixed_quad_vectorization_opportunity_generator(list_diff(sumfacts, combo),
                                                                       width,
                                                                       qp,
                                                                       add_to_frozendict(already, vecdict),
diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py
index d358f3ab45ef8231a3a952a881167e7108c14704..e302f28c0e63d03d013c181dc0eb4f831ec1648c 100644
--- a/python/dune/perftool/tools.py
+++ b/python/dune/perftool/tools.py
@@ -74,3 +74,11 @@ def add_to_frozendict(fd, valdict):
     t = dict(fd)
     t.update(valdict)
     return frozendict.frozendict(t)
+
+
+def list_diff(l1, l2):
+    l = []
+    for item in l1:
+        if item not in l2:
+            l.append(item)
+    return l
diff --git a/python/setup.py b/python/setup.py
index ccbe9e56658fa87a0c574767f6c7f4282612a750..c7e80ce1b0de4a2a561ee1a1a4d2189ec0df0d0c 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -45,5 +45,6 @@ setup(name='dune.perftool',
      entry_points = {
        "console_scripts": [
            "ufl2pdelab = dune.perftool.compile:compile_form",
+           "picklevecstrats = dune.perftool.sumfact.vectorization:pickle_vectorization_strategies",
        ]
      })
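Note (not part of the patch): the switch from set difference to the new list_diff helper, together with the sorted(...) call above it, is what makes the enumeration order of vectorization opportunities reproducible, which the fromlist strategy relies on when it picks the opportunity numbered by vectorization_list_index. A minimal standalone sketch of that ordering behaviour, using stand-in data and plain Python rather than real sum factorization kernels:

    # Standalone sketch: list_diff preserves the input order of l1, whereas a
    # set difference gives no guaranteed iteration order across program runs.
    def list_diff(l1, l2):
        return [item for item in l1 if item not in l2]

    kernels = sorted(["sf_b", "sf_c", "sf_a"])   # stand-ins, sorted like the kernels are by repr()
    print(list_diff(kernels, ["sf_b"]))          # ['sf_a', 'sf_c'] -- deterministic
    print(set(kernels) - {"sf_b"})               # order not guaranteed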