diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py index b7ab65cf80506fe107e117ee2bb06da2bee42eef..8319890b0284ff96c6a38990a50980894c200484 100644 --- a/python/dune/codegen/sumfact/transformations.py +++ b/python/dune/codegen/sumfact/transformations.py @@ -9,6 +9,7 @@ from dune.codegen.loopy.transformations.remove_reductions import remove_all_redu from dune.codegen.options import get_form_option, get_option from dune.codegen.pdelab.geometry import world_dimension from dune.codegen.error import CodegenAutotuneError +from dune.codegen.sumfact.autotune import autotune_realization def move_zero_assignment_up(kernel, move_up_inames): @@ -169,16 +170,46 @@ def reorder_loops_in_tensor_contraction(kernel, iname_order): return kernel +def tensor_contraction_loop_order_generator(kernel): + dim = world_dimension() + assert dim == 3 + yield kernel + + indices = ['l', 'k', 'i', 'j'] + import itertools + for loop_order in itertools.permutations(indices): + loop_order = ''.join(loop_order) + new_kernel = reorder_loops_in_tensor_contraction(kernel, loop_order) + yield new_kernel + + +def simple_autotuner(kernel_generator, signature): + # palpo TODO + from dune.codegen.options import set_option + set_option("autotune_google_benchmark", True) + + kernel = next(kernel_generator) + best_cost = autotune_realization(kernel=kernel, signature=signature) + best_kernel = kernel + for kernel in kernel_generator: + cost = autotune_realization(kernel=kernel, signature=signature) + if cost < best_cost: + best_cost = cost + best_kernel = kernel + return best_kernel + + +def autotune_tensor_contraction_loop_order(kernel, signature): + from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add + kernel = match_fused_multiply_add(kernel) + + generator = tensor_contraction_loop_order_generator(kernel) + return simple_autotuner(generator, signature) + + def sumfact_performance_transformations(kernel, signature): if kernel.name.startswith('sfimpl'): - # from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add - # kernel = match_fused_multiply_add(kernel) - # kernel = reorder_loops_in_tensor_contraction(kernel, 'ijlk') - - # from dune.codegen.sumfact.autotune import autotune_realization - # from dune.codegen.options import set_option - # set_option("autotune_google_benchmark", True) - # test = autotune_realization(kernel=kernel, signature=signature) + # kernel = autotune_tensor_contraction_loop_order(kernel, signature) pass return kernel