diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py
index 7455f1bb242fa042011f005e154e8088264ad47f..76a7f54f9d3516ee047913cb0ebd4947c189536b 100644
--- a/python/dune/codegen/options.py
+++ b/python/dune/codegen/options.py
@@ -84,6 +84,7 @@ class CodegenFormOptionsArray(ImmutableRecord):
     sumfact = CodegenOption(default=False, helpstr="Use sumfactorization")
     sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
     sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...")
+    sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize the order of loops in the sum factorization function using autotuning.")
     vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization")
     vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
     vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index 09160ea5b0fcdcbad23621ff6a8d8c66267df666..8a22a1d8f45c79d90b9a6e9fed643bdc30787c59 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -164,7 +164,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         from dune.codegen.sumfact.basis import SumfactBasisMixin
         return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction)
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The result of stage 2 has the correct quadrature permutation but no
         # cost permutation is applied. The inames for this method are
         # quadrature and cost permuted. This means we need to reverse the cost
@@ -175,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
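A minimal sketch of how the new form option would be switched on from Python, assuming the option helpers this patch itself uses further down (set_form_option/set_option); the loop order search additionally relies on the Google Benchmark based measurement path:

    from dune.codegen.options import set_form_option, set_option

    # Enable the autotuned loop order search for sum factorization kernels
    set_form_option("sumfact_optimize_loop_order", True)
    # The per-kernel benchmarks are generated via Google Benchmark
    set_option("autotune_google_benchmark", True)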
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index d19fd7be5587258e8db1f3ada3301b1e7017f95e..b4a48b49473c64b3b46136f7654c34fc53f43ca0 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -1,20 +1,22 @@ """ Autotuning for sum factorization kernels """ -from dune.codegen.generation import cache_restoring, delete_cache_items -from dune.codegen.loopy.target import DuneTarget -from dune.codegen.sumfact.realization import realize_sumfact_kernel_function -from dune.codegen.options import get_option, set_option -from dune.codegen.error import CodegenAutotuneError - -import loopy as lp -from pytools import product - import os import re import subprocess import filelock import hashlib +import json +from operator import mul +import loopy as lp +from pytools import product +from cgen import ArrayOf, AlignedAttribute, Initializer + +from dune.codegen.generation import cache_restoring, delete_cache_items +from dune.codegen.loopy.target import DuneTarget, type_floatingpoint +from dune.codegen.sumfact.realization import realize_sumfact_kernel_function +from dune.codegen.options import get_option, set_option +from dune.codegen.error import CodegenAutotuneError def get_cmake_cache_entry(entry): for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): @@ -286,13 +288,139 @@ def generate_standalone_code(sf, filename): set_option("opcounter", opcounting) -def autotune_realization(sf): +def generate_standalone_kernel_code(kernel, signature, filename): + with open(filename, 'w') as f: + # Write headers + headers = ['#include "config.h"', + '#include <iostream>', + '#include <fstream>', + '#include <random>', + '#include "benchmark/benchmark.h"', + '#include <dune/codegen/common/vectorclass.hh>', + '#include <dune/codegen/sumfact/horizontaladd.hh>', + ] + f.write("\n".join(headers)) + + # Get a list of the function argument names + assert len(signature) == 1 + sig = signature[0] + arguments = sig[sig.find('(') +1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in arguments] + + global_args = [a for a in kernel.args if a.name not in arguments] + + # Declare global arguments + f.write('\n\n') + target = DuneTarget() + for g in global_args: + decl_info = g.decl_info(target, True, g.dtype) + for idi in decl_info: + ast_builder = target.get_device_ast_builder() + arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) + arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) + arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) + f.write('{}\n'.format(arg_decl)) + + # Generate function we want to benchmark + f.write('\n') + f.write(sig[0:sig.find(')')+1]) + f.writelines(lp.generate_body(kernel)) + f.write('\n\n') + + # Generate function that will do the benchmarking + f.write('static void BM_sumfact_kernel(benchmark::State& state){\n') + + # Declare random generators + real = type_floatingpoint() + lines = [' std::uniform_real_distribution<{}> unif(0,1);'.format(real), + ' std::uniform_int_distribution<int> unif_int(0,128);', + ' std::default_random_engine re;'] + f.write('\n'.join(lines) + '\n') + + # Declare function arguments + function_arguments = [a for a in kernel.args if a.name in arguments] + for arg in function_arguments: + if 'buffer' in arg.name: + byte_size = reduce(mul, arg.shape) * 8 + f.write(' char 
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index d3a2a190255ad2178fc44b5a691f49beb919b706..57287efeb4aaef4d0243799e3561731c98aa2272 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -348,14 +348,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Note: Here we do not need to reverse any permutation since this is
         # already done in the setup_input method above!
 
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index 689c2b60c30c06b821972b250627f19bd608095c..acb3599b0ffce6d4f393d90d6249f3083328cb56 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -459,11 +459,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord):
 
         return insn_dep.union(frozenset({insn}))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py
index 3a628576bf5a130fe359bfb78967018a5f8a5211..d020c490b89f224a8c1e43fdb83f8135d2c3eea5 100644
--- a/python/dune/codegen/sumfact/realization.py
+++ b/python/dune/codegen/sumfact/realization.py
@@ -60,6 +60,11 @@ def name_buffer_storage(buff, which):
     return name
 
 
+def _max_sum_factorization_buffer_size(sf):
+    size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
+               product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+    return size
+
+
 @kernel_cached
 def _realize_sum_factorization_kernel(sf):
     insn_dep = sf.insn_dep
@@ -73,8 +78,7 @@ def _realize_sum_factorization_kernel(sf):
     for buf in buffers:
         # Determine the necessary size of the buffer. We assume that we do not
         # underintegrate the form!!!
-        size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width,
-                   product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width)
+        size = _max_sum_factorization_buffer_size(sf)
         temporary_variable("{}_dummy".format(buf),
                            shape=(size,),
                            custom_base_storage=buf,
@@ -117,10 +121,29 @@ class BufferSwitcher(object):
     def __init__(self):
         self.current = 0
 
-    def get_temporary(self, name=None, **kwargs):
+    def get_temporary(self, sf=None, name=None, **kwargs):
         assert name
         bs = "buffer{}".format(self.current)
-        globalarg(bs)
+
+        # Calculate the correct alignment
+        shape = kwargs['shape']
+        assert shape
+        dim_tags = kwargs['dim_tags'].split(',')
+        assert dim_tags
+        vec_size = 1
+        if 'vec' in dim_tags:
+            vec_size = shape[dim_tags.index('vec')]
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint()))
+        alignment = dtype.itemsize * vec_size
+
+        # Add this buffer as a global argument to the kernel. Attaching a shape
+        # makes benchmark generation for autotuning of loopy kernels easier.
+        # Since this buffer is used to store data of different sizes we need to
+        # make sure it is big enough.
+        assert sf
+        size = _max_sum_factorization_buffer_size(sf) // vec_size
+        globalarg(bs, shape=(size, vec_size), alignment=alignment, dim_tags=['f', 'vec'])
         temporary_variable(name,
                            managed=True,
                            custom_base_storage=bs,
@@ -192,7 +215,8 @@ def realize_sumfact_kernel_function(sf):
         if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible:
             input_summand = sf.interface.realize_direct_input(input_inames, inp_shape)
         elif l == 0:
-            input_summand = sf.interface.realize_input(input_inames,
+            input_summand = sf.interface.realize_input(sf,
+                                                       input_inames,
                                                        inp_shape,
                                                        vec_iname,
                                                        vec_shape,
@@ -203,7 +227,8 @@ def realize_sumfact_kernel_function(sf):
             # Get a temporary that interprets the base storage of the input
             # as a column-major matrix. In later iteration of the matrix loop
             # this reinterprets the output of the previous iteration.
-            inp = buffer.get_temporary("buff_step{}_in".format(l),
+            inp = buffer.get_temporary(sf,
+                                       "buff_step{}_in".format(l),
                                        shape=inp_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -249,7 +274,8 @@ def realize_sumfact_kernel_function(sf):
             output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation)
             output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation)
 
-            out = buffer.get_temporary("buff_step{}_out".format(l),
+            out = buffer.get_temporary(sf,
+                                       "buff_step{}_out".format(l),
                                        shape=output_shape + vec_shape,
                                        dim_tags=ftags,
                                        )
@@ -264,7 +290,8 @@ def realize_sumfact_kernel_function(sf):
         else:
             output_shape = tuple(out_shape[1:]) + (out_shape[0],)
 
-        out = buffer.get_temporary("buff_step{}_out".format(l),
+        out = buffer.get_temporary(sf,
+                                   "buff_step{}_out".format(l),
                                    shape=output_shape + vec_shape,
                                    dim_tags=ftags,
                                    )
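A small worked example of the buffer sizing and alignment logic above, with hypothetical sizes (three one-dimensional matrices, 4-lane vectorization, double precision):

    # Hypothetical sizes: quadrature sizes (3, 3, 3), basis sizes (2, 2, 2),
    # vector width 4, dtype double (itemsize 8)
    quad_entries = 3 * 3 * 3 * 4     # 108 entries for the quadrature-sized stage
    basis_entries = 2 * 2 * 2 * 4    # 32 entries for the basis-sized stage
    size = max(quad_entries, basis_entries)   # 108, the buffer must hold the larger stage
    vec_size = 4
    # globalarg then sees shape=(108 // 4, 4) = (27, 4) with dim_tags ['f', 'vec']
    # and alignment 8 * 4 = 32 bytes, i.e. one full SIMD vector per aligned row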
diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py
index 3fddff6297e42cf807fe1bb0017b9b3dce7aa668..8fc82f3bc2650ea78cb73375c5fb501ee3522ea4 100644
--- a/python/dune/codegen/sumfact/symbolic.py
+++ b/python/dune/codegen/sumfact/symbolic.py
@@ -60,7 +60,7 @@ class SumfactKernelInterfaceBase(object):
         """
         raise NotImplementedError
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         """Interpret the input of sumfact kernel function in the right way (non fastdgg)
 
         This happens inside the sumfact kernel function.
@@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object):
 
         Parameters
         ----------
+        sf : SumfactKernel or VectorizedSumfactKernel
         inames : tuple of pymbolic.primitives.Variable
             Inames for accesing the input. Ordered according to permuted matrix sequence.
         shape : tuple of int
@@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
             dep = dep.union(inp.setup_input(sf, dep, index=i))
         return dep
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # Get a temporary that interprets the base storage of the input
         # as a column-major matrix. In later iteration of the matrix loop
         # this reinterprets the output of the previous iteration.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
@@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         return prim.Call(prim.Variable(hadd_function), (result,))
 
-    def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags):
+    def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags):
         # The input for stage 3 is quadrature permuted. The inames and shape
         # passed to this method are quadrature and cost permuted. This means we
         # need to take the cost permutation back to get the right inames and
@@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
 
         # Get a temporary that interprets the base storage of the input as a
         # column-major matrix.
-        inp = buf.get_temporary("buff_step0_in",
+        inp = buf.get_temporary(sf,
+                                "buff_step0_in",
                                 shape=shape + vec_shape,
                                 dim_tags=ftags,
                                 )
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) @@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return prim.Call(prim.Variable(hadd_function), (result,)) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # The input for stage 3 is quadrature permuted. The inames and shape # passed to this method are quadrature and cost permuted. This means we # need to take the cost permutation back to get the right inames and @@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): # Get a temporary that interprets the base storage of the input as a # column-major matrix. - inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py index 76cf241c90d6a85d6276b29d0bd670a63bedfd82..f059c4c567852da01f116eeeb158eb2a208b6f89 100644 --- a/python/dune/codegen/sumfact/transformations.py +++ b/python/dune/codegen/sumfact/transformations.py @@ -4,6 +4,7 @@ import islpy as isl from dune.codegen.generation import get_global_context_value from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions +from dune.codegen.options import get_form_option from dune.codegen.pdelab.geometry import world_dimension def move_zero_assignment_up(knl, move_up_inames): @@ -95,6 +96,8 @@ def reorder_loops_in_tensor_contraction(knl, iname_order): using einsum notation from numpy. iname_order should be a string like 'iklj' if the loops should be done in order i, k, l, j. + Without transformations those loops will be done in the order lkij. + In the sum factorization kernel itself those inames are called: sf_out_inames_2_* : l @@ -185,22 +188,63 @@ class LoopOrderTransformation(SumfactKernelFunctionTransformation): return 'looporder{}'.format(self.order) +def autotune_loop_order(sf): + # Baseline is the non transformed kernel + from dune.codegen.sumfact.autotune import autotune_realization + best_order = None + best_cost = autotune_realization(sf) + + # Go through all loop orderings. Note: Due to reduction removal there can + # be differences even in the case where the loop order here is the same as + # above. 
+    import itertools as it
+    loop_orders = [''.join(p) for p in it.permutations('lkij')]
+    for order in loop_orders:
+        from dune.codegen.sumfact.transformations import reorder_loops_in_tensor_contraction
+        trafo = LoopOrderTransformation(order)
+        transformed_sf = sf.copy(transformations=sf.transformations + (trafo,))
+        cost = autotune_realization(transformed_sf)
+        if cost < best_cost:
+            best_cost = cost
+            best_order = order
+
+    return best_order
+
+
 def attach_transformations(sf, vsf):
     if vsf == 0:
         return 0
 
     if get_global_context_value("dry_run") == None:
-        # Example and test of such a transformation:
-
-        # # Store transformation in sumfact kernel
-        # from dune.codegen.sumfact.transformations import LoopOrderTransformation
-        # trafo = LoopOrderTransformation('kjli')
-        # vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-        # # Map the original kernel to the new one
-        # from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-        # _cache_vectorization_info(sf, vsf)
+        from dune.codegen.options import set_form_option, set_option
+        # TODO
+        #
+        # set_form_option("sumfact_optimize_loop_order", True)
+        # set_option("autotune_google_benchmark", True)
+        if get_form_option("sumfact_optimize_loop_order"):
+            if get_global_context_value("integral_type") == 'cell':
+                # Find best loop order and store transformation in sumfact kernel
+                loop_order = autotune_loop_order(vsf)
+                print("Autotuned sum factorization loop order: {}".format(loop_order))
+                if loop_order is not None:
+                    trafo = LoopOrderTransformation(loop_order)
+                    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
+
+                    # Map the original kernel to the transformed kernel
+                    from dune.codegen.sumfact.vectorization import _cache_vectorization_info
+                    _cache_vectorization_info(sf, vsf)
+
+        # TODO
+        #
+        # Example of such a transformation:
+        # if get_global_context_value("integral_type") == 'cell':
+        #     loop_order = 'lkji'
+        #     trafo = LoopOrderTransformation(loop_order)
+        #     vsf = vsf.copy(transformations=vsf.transformations + (trafo,))

+        #     # Map the original kernel to the transformed kernel
+        #     from dune.codegen.sumfact.vectorization import _cache_vectorization_info
+        #     _cache_vectorization_info(sf, vsf)
 
     return vsf
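For reference, a minimal sketch of the two ways a loop order can be applied by hand, following the docstring of reorder_loops_in_tensor_contraction and the commented example above; 'lkji' is only an illustrative ordering and knl/vsf are placeholders for a generated loopy kernel and a vectorized sum factorization kernel:

    # (a) Reorder the loops of an already generated loopy kernel directly
    knl = reorder_loops_in_tensor_contraction(knl, 'lkji')

    # (b) Attach the ordering as a kernel function transformation, as the
    #     autotuned code path above does
    trafo = LoopOrderTransformation('lkji')
    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))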