diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index c8eece5274d632fc764286bbcf1f346ba28092bd..3fa1a27d125e242ec0bd59bfc58c09b055f27818 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -86,6 +86,7 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
     sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sumf factorization function using autotuning.")
+    sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 393b21ff3c00e5765c525c10dfe4c7b41a56c59b..5cc2178ce0adc8083ceeda7224b4180e2e4186ba 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -7,6 +7,7 @@ import filelock
 import hashlib
 import json
 from operator import mul
+from six.moves import reduce
 import time
 
 import loopy as lp
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 71fd8c71e94b5ca7feffa59ec876af8149fa039c..61867987ff21a6348fe7f567a63c64836f7a81d2 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -385,12 +385,14 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def realize_direct_output(self, result, inames, shape, **args):
         outputs = set(self.interfaces)
 
+        # Introduce substrule for the argument of the horizontal add
+        substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
+        subst_rule(substname, (), result)
+        result = prim.Call(prim.Variable(substname), ())
+
         # If multiple horizontal_add's are to be performed with 'result'
         # we need to precompute the result!
         if len(outputs) > 1:
-            substname = "haddsubst_{}".format("_".join([i.name for i in inames]))
-            subst_rule(substname, (), result)
-            result = prim.Call(prim.Variable(substname), ())
             transform(lp.precompute, substname)
 
         deps = frozenset()
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index 41c0d690da9d8aa525871d77dc8672947795b5f8..04beaa1d765d567e172bf0162fccd121d10f6309 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -1,4 +1,5 @@
 import re
+import logging
 
 import loopy as lp
 import pymbolic.primitives as prim
@@ -182,17 +183,17 @@ def _get_inames_of_reduction(instr, iname_permutation):
     inner_inames_base = []
     for index, within in enumerate(within_reduction):
         if within:
-            inner_inames_base.append('sf_out_inames_{}'.format(dim-1-index))
+            inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index))
 
     # Name of inner inames and outer inames
     inner_inames = []
     outer_inames = []
     vec_inames = []
-    for i in [x.name for x in instr.assignee.index_tuple]:
+    for i in [iname for iname in instr.within_inames]:
         for j in inner_inames_base:
             if j in i:
                 inner_inames.append(i)
-        if 'vec' in i:
+        if 'sf_vec' in i:
             vec_inames.append(i)
         elif i not in inner_inames:
             outer_inames.append(i)
@@ -200,13 +201,12 @@
     # Reduction iname
     regex = re.compile('sf_red_([0-9]*)')
     reduction_index = set(regex.findall(str(instr)))
-    if isinstance(instr.expression, lp.symbolic.Reduction):
+    if len(reduction_index) == 0:
+        reduction_iname = None
+    else:
         assert len(reduction_index) == 1
         reduction_index = reduction_index.pop()
         reduction_iname = 'sf_red_{}'.format(reduction_index)
-    else:
-        assert len(reduction_index) == 0
-        reduction_iname = None
 
     return outer_inames, reduction_iname, inner_inames, vec_inames
 
@@ -285,11 +285,11 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
     dim = world_dimension()
 
     # Nothing to do if permutation is identity
-    if iname_permutation == tuple(range(dim+1)):
+    if iname_permutation == tuple(range(dim + 1)):
         return kernel
 
     # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
-    default_iname_order = ['sf_out_inames_{}'.format(dim-1-i) for i in range(dim)] + ['sf_red']
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
 
     from dune.codegen.sumfact.permutation import permute_backward
     new_iname_order = permute_backward(default_iname_order, iname_permutation)
@@ -308,21 +308,21 @@
         return current_iname_order
 
     for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+
         # We can directly use lp.prioritize_loops if:
         # - The reduction is the innermost loop
         # - There is no reduction (eg reduced direction on faces)
-        if iname_permutation[-1] == dim or not isinstance(instr.expression, lp.symbolic.Reduction):
-            # Inames used in this reduction
-            outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
-                                                                                               iname_permutation)
+        if iname_permutation[-1] == dim or reduction_iname is None:
             current_iname_order = _current_new_iname_order(outer_inames,
                                                            reduction_iname,
                                                            inner_inames,
                                                            new_iname_order)
             kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
             continue
-
-        assert isinstance(instr.expression.operation, lp.library.reduction.SumReductionOperation)
+        assert isinstance(instr.expression, lp.symbolic.Reduction)
 
         # Dependencies. Note: We change the instructions below so we cannot do
         # depends_on=instr.depends_on. But we know that the id of instr is
@@ -337,33 +337,36 @@
         # Remove reduction
         kernel = lp.remove_instructions(kernel, set([instr.id]))
 
-        # Inames used in this reduction
-        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr, iname_permutation)
-
         # Create dim_tags
         dim_tags = ','.join(['f'] * len(inner_inames))
-        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
-        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        vectorized = len(vec_inames) > 0
         if vectorized:
             assert len(vec_inames) == 1
             dim_tags = dim_tags + ',vec'
 
         # Create shape
-        agg_inames = tuple(i.name for i in instr.assignee.index_tuple)
-        agg_shape = agg_variable.shape
-        shape = tuple(agg_shape[agg_inames.index(i)] for i in inner_inames)
-        if vectorized:
-            shape = shape + (agg_shape[-1],)
+        def _get_iname_bound(kernel, iname):
+            # TODO: Not sure if that works in all cases
+            ldi, = kernel.get_leaf_domain_indices((iname,))
+            domain = kernel.domains[ldi]
+            pwaff = domain.dim_max(0)
+            bound = lp.symbolic.pw_aff_to_expr(pwaff)
+            return bound + 1
+        shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
 
         # Update temporary_variables of this kernel
         from dune.codegen.loopy.temporary import DuneTemporaryVariable
         accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
         var = {accum_variable: DuneTemporaryVariable(accum_variable,
-                                                     dtype=agg_variable.dtype,
+                                                     dtype=dtype,
                                                      shape=shape,
                                                      dim_tags=dim_tags,
                                                      managed=True)}
-        kernel.temporary_variables.update(var)
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
 
         # Set accumulation variable to zero
         accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
@@ -378,7 +381,7 @@
                                              depends_on=depends_on,
                                              tags=('accum_init',),
                                              )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
 
         # Accumulate in temporary variable
         assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
@@ -389,26 +392,46 @@
                                        expression,
                                        within_inames=within_inames,
                                        id=accum_id,
-                                       depends_on=frozenset([accum_init_id,]),
+                                       depends_on=frozenset([accum_init_id]),
                                        tags=('accum',),
                                        )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr,])
-
-        # Assign accumulation result
-        assign_id = instr.id + '_assign'
-        assign_instr = lp.Assignment(instr.assignee,
-                                     assignee,
-                                     within_inames=within_inames,
-                                     id=assign_id,
-                                     depends_on=frozenset([accum_id,]),
-                                     tags=('accum_assign',),
-                                     )
-        kernel = kernel.copy(instructions=kernel.instructions + [assign_instr,])
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
+
+        if 'haddsubst' not in str(instr):
+            # Write back the result if this reduction does not come from a substitution
+            assign_id = instr.id + '_assign'
+            assign_instr = lp.Assignment(instr.assignee,
+                                         assignee,
+                                         within_inames=within_inames,
+                                         id=assign_id,
+                                         depends_on=frozenset([accum_id]),
+                                         tags=('accum_assign',),
+                                         )
+            kernel = kernel.copy(instructions=kernel.instructions + [assign_instr])
 
-        # Restore dependencies
-        for dep in depending:
-            match = lp.match.Id(dep)
-            kernel = lp.add_dependency(kernel, match, assign_id)
+            # Restore dependencies
+            for dep in depending:
+                match = lp.match.Id(dep)
+                kernel = lp.add_dependency(kernel, match, assign_id)
+        else:
+            # If this reduction comes from a substitution take the
+            # corresponding assignment and replace the substitution with the
+            # accumulation result
+            assert len(depending) == 1
+            assignment, = [i for i in kernel.instructions if depending[0] == i.id]
+            other = [i for i in kernel.instructions if i.id != assignment.id]
+            replace = {prim.Variable(instr.assignee.name): assignee}
+            from dune.codegen.loopy.symbolic import substitute
+            new_expression = substitute(assignment.expression, replace)
+            new_instr = assignment.copy(expression=new_expression,
+                                        depends_on=frozenset(list(assignment.depends_on) + [accum_id]))
+            kernel = kernel.copy(instructions=other + [new_instr])
+
+            # Remove the temporary variable that belongs to the
+            # substitution. It is no longer used.
+            tv = kernel.temporary_variables.copy()
+            del tv[instr.assignee.name]
+            kernel = kernel.copy(temporary_variables=tv)
 
         # Reordering loops only works if we duplicate some inames
         duplicate_inames = tuple(i.name for i in accum_init_inames)
@@ -416,8 +439,13 @@
             duplicate_inames = duplicate_inames[:-1]
         match = lp.match.Id(accum_init_id)
         kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
-        match = lp.match.Id(assign_id)
-        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        # Reorder loops of the assignment of the result
+        if 'haddsubst' not in str(instr):
+            match = lp.match.Id(assign_id)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        else:
+            match = lp.match.Id(assignment.id)
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
 
         # Change loop order
         current_iname_order = _current_new_iname_order(outer_inames,
@@ -478,37 +506,32 @@ def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variabl
 
     """
     if accum_variable:
-        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
+        return kernel
     else:
-        # Need to adapts this!
+        # TODO: Need to adapt this!
        assert False
-        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
+        return kernel
 
 
 def tensor_contraction_loop_order_generator(kernel):
     dim = world_dimension()
-    assert dim == 3
-
-    indices = ['l', 'k', 'i', 'j']
+    identity = range(dim + 1)
 
     import itertools
-    for loop_order in itertools.permutations(indices):
-        # palpo TODO: Heavy culling for 'quick' tests during development
-        if loop_order[0] != 'l' or loop_order[1] != 'k':
+    for permutation in itertools.permutations(identity):
+        # Culling of stupid variant
+        if permutation[0] == dim:
             continue
-        order = ''.join(loop_order)
-        new_kernel = reorder_loops_in_tensor_contraction(kernel, order, True)
-        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(order),]
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)]
 
-        new_kernel = reorder_loops_in_tensor_contraction(kernel, loop_order, False)
-        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(order),]
+        # new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, False)
+        # yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation),]
 
 
 def simple_autotuner(kernel_generator, signature):
-    # palpo TODO
-    from dune.codegen.options import set_option
-    set_option("autotune_google_benchmark", True)
-
     kernel, transformations = next(kernel_generator)
     best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
     best_kernel = kernel
@@ -521,23 +544,27 @@
 
 
 def autotune_tensor_contraction_loop_order(kernel, signature):
+    assert get_option('autotune_google_benchmark')
     from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
     kernel = match_fused_multiply_add(kernel)
-
     generator = tensor_contraction_loop_order_generator(kernel)
     return simple_autotuner(generator, signature)
 
 
 def sumfact_performance_transformations(kernel, signature):
-    if kernel.name.startswith('sfimpl'):
-        # 2D
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
-
-        # 3D
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
-        # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
-
-        # kernel = autotune_tensor_contraction_loop_order(kernel, signature)
-
-        pass
+    logger = logging.getLogger(__name__)
+
+    if get_form_option('sumfact_performance_transformations'):
+        logger.info('Apply sumfact performance transformations')
+        if kernel.name.startswith('sfimpl'):
+            # # TODO
+            # dim = world_dimension()
+            # if dim == 2:
+            #     kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
+            # else:
+            #     kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
+            #     # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
+
+            kernel = autotune_tensor_contraction_loop_order(kernel, signature)
+            pass
     return kernel
diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt
index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..929f06fe5df790dcd0bed7b3589ec9affd026fd1 100644
--- a/test/sumfact/poisson/CMakeLists.txt
+++ b/test/sumfact/poisson/CMakeLists.txt
@@ -134,3 +134,16 @@ if(benchmark_FOUND)
                                     INIFILE poisson_fastdg_volumes_3d_benchmark.mini
                                     )
 endif()
+
+
+#================================================
+# Poisson fastdg with performance transformations
+#================================================
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl
+                                  BASENAME sumfact_poisson_fastdg_2d_performance_transformations
+                                  INIFILE poisson_fastdg_2d_performance_transformations.mini
+                                  )
+dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl
+                                  BASENAME sumfact_poisson_fastdg_3d_performance_transformations
+                                  INIFILE poisson_fastdg_3d_performance_transformations.mini
+                                  )
diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..ee6b6080906326fd0a6fff28a28c17cc2937feb0
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini
@@ -0,0 +1,27 @@
+__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+
+cells = 16 16
+extension = 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+vectorization_strategy = explicit, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations = 1
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
\ No newline at end of file
diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
new file mode 100644
index 0000000000000000000000000000000000000000..f1e2a56f1a8e54160c0f9f75f28c37704d3d2bf8
--- /dev/null
+++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini
@@ -0,0 +1,28 @@
+__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix}
+__exec_suffix = {gradvec_suffix}
+
+gradvec_suffix = gradvec, nongradvec | expand gradvec
+
+cells = 8 8 8
+extension = 1. 1. 1.
+
+[wrapper.vtkcompare]
+name = {__name}
+extension = vtu
+
+[formcompiler]
+compare_l2errorsquared = 1e-4
+autotune_google_benchmark = 1
+
+[formcompiler.r]
+numerical_jacobian = 0
+sumfact = 1
+vectorization_quadloop = 1
+# vectorization_strategy = explicit, none | expand gradvec
+vectorization_strategy = model, none | expand gradvec
+fastdg = 1
+sumfact_performance_transformations = 1
+geometry_mixins = sumfact_equidistant
+
+[formcompiler.ufl_variants]
+degree = 2
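
A note on the new import in autotune.py: `reduce` is no longer a builtin in Python 3, so code using `reduce(mul, ...)` needs an explicit import; `six.moves.reduce` resolves to the builtin on Python 2 and to `functools.reduce` on Python 3. A minimal sketch of the idiom (the quadrature example is illustrative, not taken from the patch):

```python
from operator import mul
from six.moves import reduce  # builtin on Python 2, functools.reduce on Python 3

# Collapse per-direction counts into a total, e.g. quadrature points per cell
num_points = reduce(mul, (3, 3, 3), 1)
assert num_points == 27
```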
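Regarding the symbolic.py change: the substitution rule for the horizontal-add argument is now created unconditionally, while `lp.precompute` is only applied when more than one output consumes the result; transformations.py later recognizes instructions originating from such rules by the `haddsubst` prefix. A rough standalone sketch of the underlying loopy pattern (kernel, rule name and array are made up for illustration, and a unary rule stands in for the patch's nullary one created via the dune-codegen `subst_rule`/`transform` helpers):

```python
import loopy as lp

# A substitution rule names an expression without forcing its evaluation ...
knl = lp.make_kernel(
    "{[i]: 0 <= i < 16}",
    """
    f(x) := 2 * a[x]
    out[i] = f(i) + f(i)
    """)

# ... and precompute materializes the rule into a temporary, so the
# expression is evaluated once instead of once per use site.
knl = lp.precompute(knl, "f", "i", default_tag="for")
```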
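For readers unfamiliar with the `iname_permutation` convention used by `reorder_loops_in_tensor_contraction`: the default loop nest puts the output loops first (outermost to innermost) followed by the reduction loop, and the permutation is applied backwards to obtain the new nesting. A self-contained sketch, assuming `permute_backward` from `dune.codegen.sumfact.permutation` has the inverse-permutation semantics inlined below:

```python
def permute_backward(t, perm):
    # Assumed semantics: entry i of t moves to slot perm[i].
    result = [None] * len(t)
    for i, pos in enumerate(perm):
        result[pos] = t[i]
    return tuple(result)

dim = 3
# Default nest of a 3D sum factorization kernel: output loops from
# outermost to innermost, reduction loop last (i.e. innermost).
default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']

# The identity permutation leaves the reduction innermost ...
print(permute_backward(default_iname_order, (0, 1, 2, 3)))
# -> ('sf_out_inames_2', 'sf_out_inames_1', 'sf_out_inames_0', 'sf_red')

# ... while (3, 2, 0, 1) hoists it towards the outside; the generator
# skips permutations whose first entry equals dim.
print(permute_backward(default_iname_order, (3, 2, 0, 1)))
# -> ('sf_out_inames_0', 'sf_red', 'sf_out_inames_1', 'sf_out_inames_2')
```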