diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index 133a1f1712f011953f47515dd08575fbecdbe202..8e7e835137e0c9ab987db1e7e1c3b7bab1abdf6a 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -17,13 +17,17 @@ function_mangler = generator_factory(item_tags=("mangler",)) silenced_warning = generator_factory(item_tags=("silenced_warning",), no_deco=True) +class DuneGlobalArg(loopy.GlobalArg): + allowed_extra_kwargs = loopy.GlobalArg.allowed_extra_kwargs + ["managed"] + + @generator_factory(item_tags=("argument", "globalarg"), cache_key_generator=lambda n, **kw: n) -def globalarg(name, shape=loopy.auto, **kw): +def globalarg(name, shape=loopy.auto, managed=True, **kw): if isinstance(shape, str): shape = (shape,) dtype = kw.pop("dtype", numpy.float64) - return loopy.GlobalArg(name, dtype=dtype, shape=shape, **kw) + return DuneGlobalArg(name, dtype=dtype, shape=shape, managed=managed, **kw) @generator_factory(item_tags=("argument", "constantarg"), diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 55fb43c71c7218f7969e06a70339372a68acfb26..57ff1d237c48c0c9ed27ea36698410381d1aa718 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -1,5 +1,6 @@ from dune.perftool.generation import post_include +from dune.perftool.generation.loopy import DuneGlobalArg from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.vcl import VCLTypeRegistry from dune.perftool.generation import (include_file, @@ -29,7 +30,7 @@ _registry = {'float32': 'float', class DuneExpressionToCExpressionMapper(ExpressionToCExpressionMapper): def map_subscript(self, expr, type_context): arr = self.find_array(expr) - if isinstance(arr, DuneTemporaryVariable) and not arr.managed: + if isinstance(arr, (DuneTemporaryVariable, DuneGlobalArg)) and not arr.managed: # If there is but one index, we do not need to handle this if isinstance(expr.index, (prim.Variable, int)): return expr diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py index e39595f9b35014937cd907f34ae784d038a34d95..5839873dceb5222ffc65edb02246b70243762973 100644 --- a/python/dune/perftool/loopy/transformations/collect_rotate.py +++ b/python/dune/perftool/loopy/transformations/collect_rotate.py @@ -66,14 +66,10 @@ def collect_vector_data_rotate(knl, insns, inames): basename = get_pymbolic_basename(expr) quantities.setdefault(basename, frozenset()) quantities[basename] = quantities[basename].union(frozenset([expr])) - assert all(len(q) == 1 for q in quantities.values()) # Add vector size buffers for all these quantities - replacemap_arr = {} replacemap_vec = {} for quantity in quantities: - expr, = quantities[quantity] - # Check whether there is an instruction that writes this quantity within # the given inames. If so, we need a buffer array. iname_match = lp.match.And(tuple(lp.match.Iname(i) for i in inames)) @@ -83,38 +79,57 @@ def collect_vector_data_rotate(knl, insns, inames): all_writers.extend([i.id for i in write_insns]) if write_insns: + # Determine the shape of the quantity + shape = knl.temporary_variables[quantity].shape + arrname = quantity + '_buffered_arr' knl = add_temporary_with_vector_view(knl, arrname, dtype=np.float64, - shape=(vec_size,), - dim_tags="c", + shape=shape + (vec_size,), + dim_tags=",".join("c" for i in range(len(shape) + 1)), base_storage=quantity + '_base_storage', scope=lp.temp_var_scope.PRIVATE, ) - replacemap_arr[quantity] = prim.Subscript(prim.Variable(arrname), (prim.Variable('rotate_index'),)) - replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(arrname)), (0, prim.Variable(new_iname),)) + def get_quantity_subscripts(e, zero=False): + if isinstance(e, prim.Subscript): + index = e.index + if isinstance(index, tuple): + return index + else: + return (index,) + else: + if zero: + return (0,) + else: + return () + + for expr in quantities[quantity]: + replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(arrname)), get_quantity_subscripts(expr, zero=True) + (prim.Variable(new_iname),)) for insn in write_insns: if isinstance(insn, lp.Assignment): - new_insns.append(insn.copy(assignee=replacemap_arr[get_pymbolic_basename(insn.assignee)], + assignee = prim.Subscript(prim.Variable(arrname), get_quantity_subscripts(insn.assignee) + (prim.Variable('rotate_index'),)) + new_insns.append(insn.copy(assignee=assignee, depends_on_is_final=True, ) ) elif isinstance(insn, lp.CInstruction): # Rip apart the code and change the assignee assignee, expression = insn.code.split("=") - assignee = assignee.strip() - assert assignee in replacemap_arr - code = "{} ={}".format(str(replacemap_arr[assignee]), expression) + # TODO This is a bit whacky: It only works for scalar assignees + # OTOH this code is on its way out anyway, because of CInstruction + assignee = prim.Subscript(prim.Variable(arrname), (prim.Variable('rotate_index'),)) + + code = "{} ={}".format(str(assignee), expression) new_insns.append(insn.copy(code=code, depends_on_is_final=True, )) else: raise NotImplementedError - else: + elif quantity in knl.temporary_variables: # Add a vector view to this quantity knl = add_vector_view(knl, quantity) replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index b294fbb15cb73b3e07f981fbce33d1b964009fc3..75d879949e6930313b69faec4022bd70a8287bee 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -36,7 +36,8 @@ def get_form_compiler_arguments(): parser.add_argument("--print-transformations", action="store_true", help="print out dot files after ufl tree transformations") parser.add_argument("--print-transformations-dir", type=str, help="place where to put dot files (can be omitted)") parser.add_argument("--quadrature-order", type=int, help="Quadrature order used for all integrals.") - parser.add_argument("--diagonal-transformation-matrix", action="store_true", help="set option if the jacoby of the transformation is diagonal (axiparallel grids)") + parser.add_argument("--diagonal-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is diagonal (axiparallel grids)") + parser.add_argument("--constant-transformation-matrix", action="store_true", help="set option if the jacobian of the transformation is constant on a cell") parser.add_argument("--ini-file", type=str, help="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)") parser.add_argument("--timer", action="store_true", help="measure times") parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project") diff --git a/python/dune/perftool/pdelab/driver.py b/python/dune/perftool/pdelab/driver.py index 6d47731038bb96652cf7f50fa27f3791a224423e..9b8efbea778fd2b510fa0534c996363dd242f173 100644 --- a/python/dune/perftool/pdelab/driver.py +++ b/python/dune/perftool/pdelab/driver.py @@ -176,9 +176,10 @@ def name_dimension(): def typedef_grid(name): dim = name_dimension() if any(_driver_data['form'].ufl_cell().cellname() in x for x in ["vertex", "interval", "quadrilateral", "hexahedron"]): - # For Yasp Grids the jacobi of the transformation is diagonal + # For Yasp Grids the jacobi of the transformation is diagonal and constant on each cell from dune.perftool.options import set_option set_option('diagonal_transformation_matrix', True) + set_option('constant_transformation_matrix', True) gridt = "Dune::YaspGrid<{}>".format(dim) include_file("dune/grid/yaspgrid.hh", filetag="driver") diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py index 63ffcd08ad8362c276b54034f9bc4a4e7a74ca47..9d8a1c28d1bfbd50af29e9c3f6c850b60a949de1 100644 --- a/python/dune/perftool/pdelab/geometry.py +++ b/python/dune/perftool/pdelab/geometry.py @@ -1,14 +1,18 @@ from dune.perftool.ufl.modified_terminals import Restriction from dune.perftool.pdelab.restriction import restricted_name -from dune.perftool.generation import (cached, +from dune.perftool.generation import (backend, + cached, domain, get_backend, get_global_context_value, + globalarg, iname, include_file, preamble, temporary_variable, + valuearg, ) +from dune.perftool.options import option_switch from dune.perftool.pdelab.quadrature import (pymbolic_quadrature_position_in_cell, quadrature_preamble, ) @@ -17,6 +21,8 @@ from ufl.algorithms import MultiFunction from pymbolic.primitives import Variable from pymbolic.primitives import Expression as PymbolicExpression +import numpy as np + @preamble def define_reference_element(name): @@ -281,6 +287,26 @@ def define_jacobian_inverse_transposed_temporary(restriction): return _define_jacobian_inverse_transposed_temporary +@preamble +@backend(interface="define_jit", name="constant_transformation_matrix") +def define_constant_jacobian_inveser_transposed(name, restriction): + geo = name_cell_geometry(restriction) + pos = name_localcenter() + dim = name_dimension() + + if restriction: + geo_in = name_in_cell_geometry(restriction) + pos = "{}.global({})".format(geo_in, pos) + + globalarg(name, dtype=np.float64, shape=(dim, dim), managed=False) + + return 'auto {} = {}.jacobianInverseTransposed({});'.format(name, + geo, + pos, + ) + + +@backend(interface="define_jit", name="default") def define_jacobian_inverse_transposed(name, restriction): dim = name_dimension() temporary_variable(name, decl_method=define_jacobian_inverse_transposed_temporary(restriction), shape=(dim, dim)) @@ -297,10 +323,25 @@ def define_jacobian_inverse_transposed(name, restriction): def name_jacobian_inverse_transposed(restriction): name = restricted_name("jit", restriction) - define_jacobian_inverse_transposed(name, restriction) + get_backend(interface="define_jit", selector=option_switch("constant_transformation_matrix"))(name, restriction) return name +@backend(interface="detjac", name="constant_transformation_matrix") +@preamble +def define_constant_jacobian_determinant(name): + geo = name_geometry() + pos = name_localcenter() + + valuearg(name, dtype=np.float64) + + return "auto {} = {}.integrationElement({});".format(name, + geo, + pos, + ) + + +@backend(interface="detjac", name="default") def define_jacobian_determinant(name): temporary_variable(name, shape=()) geo = name_geometry() @@ -317,11 +358,11 @@ def define_jacobian_determinant(name): def name_jacobian_determinant(): name = 'detjac' - define_jacobian_determinant(name) + get_backend(interface="detjac", selector=option_switch("constant_transformation_matrix"))(name) return name def name_facet_jacobian_determinant(): name = 'fdetjac' - define_jacobian_determinant(name) + get_backend(interface="detjac", selector=option_switch("constant_transformation_matrix"))(name) return name diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index cd64a827e2a2ba90053ae3a61cabe8770d2b4207..e084853782f0d8c44ab85d984977c18f35a0e753 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -26,6 +26,8 @@ from dune.perftool.ufl.modified_terminals import Restriction from pymbolic.primitives import Variable from pytools import Record +import loopy as lp + def name_form(formdata, data): # Check wether the formdata has a name in UFL @@ -508,6 +510,19 @@ def generate_kernel(integrals): from dune.perftool.loopy import heuristic_duplication kernel = heuristic_duplication(kernel) + # Maybe apply vectorization strategies + if get_option("vectorize"): + if get_option("sumfact"): + # Vectorization of the quadrature loop + insns = [i.id for i in lp.find_instructions(kernel, lp.match.Tagged("quadvec"))] + from dune.perftool.sumfact.quadrature import quadrature_inames + inames = quadrature_inames() + + from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate + kernel = collect_vector_data_rotate(kernel, insns, inames) + else: + raise NotImplementedError("Only vectorizing sumfactoized code right now!") + # Now add the preambles to the kernel preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("preamble"))] kernel = kernel.copy(preambles=preambles) diff --git a/python/dune/perftool/pdelab/quadrature.py b/python/dune/perftool/pdelab/quadrature.py index c0ddc6a99dac2c0f2f179e366e7d76846f28c7c4..72a42aa73ca20e0543f390ccbd193b5fc2895943 100644 --- a/python/dune/perftool/pdelab/quadrature.py +++ b/python/dune/perftool/pdelab/quadrature.py @@ -118,6 +118,8 @@ def name_quadrature_points(): """Name of vector storing quadrature points as class member""" dim = _local_dim() name = "qp_order" + str(dim) + shape = (name_quadrature_bound(), dim) + globalarg(name, shape=shape, dtype=numpy.float64, managed=False) define_quadrature_points(name) fill_quadrature_points_cache(name) return name diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 43b0ff43f682abca2ebcfe714a67bc129537cdb5..77edc9e0151799baa1f9c60c6f7355d1dbcc2006 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -41,32 +41,13 @@ def name_sumfact_base_buffer(): @cached def sumfact_evaluate_coefficient_gradient(element, name, restriction, component): - # First we determine the rank of the tensor we are talking about + # Get a temporary for the gradient from ufl.functionview import select_subelement sub_element = select_subelement(element, component) rank = len(sub_element.value_shape()) + 1 - - # We do then set some variables accordingly shape = sub_element.value_shape() + (element.cell().geometric_dimension(),) shape_impl = ('arr',) * rank - - from dune.perftool.pdelab.geometry import dimension_iname - idims = tuple(dimension_iname(count=i) for i in range(rank)) - leaf_element = sub_element - from ufl import VectorElement, TensorElement - if isinstance(sub_element, (VectorElement, TensorElement)): - leaf_element = sub_element.sub_elements()[0] - - # and proceed to call the necessary generator functions temporary_variable(name, shape=shape, shape_impl=shape_impl) - from dune.perftool.pdelab.spaces import name_lfs - lfs = name_lfs(element, restriction, component) - from dune.perftool.pdelab.basis import pymbolic_reference_gradient - basis = pymbolic_reference_gradient(leaf_element, restriction, 0, context='trialgrad') - from dune.perftool.tools import get_pymbolic_indices - index, _ = get_pymbolic_indices(basis) - if isinstance(sub_element, (VectorElement, TensorElement)): - lfs = lfs_child(lfs, idims[:-1], shape=shape_as_pymbolic(shape[:-1]), symmetry=element.symmetry()) # Calculate values with sumfactorization theta = name_theta() @@ -111,7 +92,7 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) expression = Subscript(Variable(buf), tuple(Variable(i) for i in quadrature_inames())) instruction(assignee=assignee, expression=expression, - forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset(idims)), + forced_iname_deps=frozenset(get_backend("quad_inames")()), forced_iname_deps_is_final=True, ) @@ -204,9 +185,6 @@ def pymbolic_basis(element, restriction, number): @backend(interface="evaluate_grad") @cached def evaluate_reference_gradient(element, name, restriction): - # from dune.perftool.pdelab.basis import name_leaf_lfs - # lfs = name_leaf_lfs(element, restriction) - # from dune.perftool.pdelab.spaces import name_lfs_bound from dune.perftool.pdelab.geometry import name_dimension temporary_variable( name, diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index f163217dcd1db4e73f13c7ee9ef8c3064bbade06..3624022806170316035c10b4fa4e98263d7d52ef 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -157,6 +157,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): expression=expression, forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), forced_iname_deps_is_final=True, + tags=frozenset({"quadvec"}), ) contribution_ids.append(contrib_dep) @@ -205,11 +206,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Mark the transformation that moves the quadrature loop inside the trialfunction loops for application transform(nest_quadrature_loops, visitor.inames) - # Maybe try to vectorize! - if get_option("vectorize"): - from dune.perftool.loopy.transformations.collect_rotate import collect_vector_data_rotate - transform(collect_vector_data_rotate, contribution_ids, quadrature_inames()) - def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({})): """ diff --git a/test/sumfact/poisson/poisson_order1.mini b/test/sumfact/poisson/poisson_order1.mini index 0c4919fea7ce92f8a897e7ace214fc8041b4d086..7591840603d2880c21542f1f3df3d3c8a818ef73 100644 --- a/test/sumfact/poisson/poisson_order1.mini +++ b/test/sumfact/poisson/poisson_order1.mini @@ -1,5 +1,8 @@ __name = sumfact_poisson_order1_{__exec_suffix} -__exec_suffix = numdiff, symdiff | expand num +__exec_suffix = {diff_suffix}_{vec_suffix} + +diff_suffix = numdiff, symdiff | expand num +vec_suffix = vec, nonvec | expand vec cells = 8 8 extension = 1. 1. @@ -14,3 +17,4 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-4 sumfact = 1 +vectorize = 1, 0 | expand vec diff --git a/test/sumfact/poisson/poisson_order2.mini b/test/sumfact/poisson/poisson_order2.mini index 88e4f3b89ca7f9ca82faa1680d85b4429e632ea1..8830f173ccb742221c987af29181938ee230f2d9 100644 --- a/test/sumfact/poisson/poisson_order2.mini +++ b/test/sumfact/poisson/poisson_order2.mini @@ -1,5 +1,8 @@ __name = sumfact_poisson_order2_{__exec_suffix} -__exec_suffix = numdiff, symdiff | expand num +__exec_suffix = {diff_suffix}_{vec_suffix} + +diff_suffix = numdiff, symdiff | expand num +vec_suffix = vec, nonvec | expand vec cells = 8 8 extension = 1. 1. @@ -14,3 +17,4 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-8 sumfact = 1 +vectorize = 1, 0 | expand vec