""" Autotuning for sum factorization kernels """

import os
import re
import subprocess
import logging
import json
import hashlib
import textwrap
import time
from operator import mul

import filelock
import pkg_resources
from six.moves import reduce
import loopy as lp
from pytools import product
from cgen import ArrayOf, AlignedAttribute, Initializer

from dune.codegen.generation import cache_restoring, delete_cache_items
from dune.codegen.loopy.target import DuneTarget, type_floatingpoint
from dune.codegen.sumfact.realization import realize_sumfact_kernel_function
from dune.codegen.options import get_option, option_context
from dune.codegen.error import CodegenAutotuneError


def get_cmake_cache_entry(entry):
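    """Return the value of a CMakeCache.txt entry.

    A cache line looks like 'CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++';
    for entry='CMAKE_CXX_COMPILER' this returns '/usr/bin/c++'. Returns
    None if the entry is not found.
    """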
    for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"):
        match = re.match("{}:(?:INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC)=(.*)".format(entry), line)
        if match:
            return match.groups()[0]


def get_dune_codegen_dir():
    if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-codegen":
        return get_option("project_basedir")
    else:
        return get_cmake_cache_entry("dune-codegen_DIR")


def compiler_invocation(name, filename):
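    """Assemble the command line for compiling `filename` into the executable `name`.

    Compiler and flags are scraped from the build files of the CMake
    '_autotune_target', so the benchmark is compiled and linked with the
    same configuration as the surrounding project.
    """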
    # Determine the CMake Generator in use
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    assert(gen == "Unix Makefiles")

    # Find compiler path
    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
    compile_flags = [compiler]

    # Parse compiler flags
    for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"):
        match = re.match("([^=]*)=(.*)", line)
        if match:
            compile_flags.extend(match.groups()[1].split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags
    for line in open(os.path.join(get_dune_codegen_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"):
        match = re.match(".*_autotune_target (.*)", line)
        if match:
            for flag in match.groups()[0].split():
                if flag.startswith("-") or os.path.isabs(flag):
                    compile_flags.append(flag)
                else:
                    compile_flags.append(os.path.join(get_dune_codegen_dir(), "python", flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags


def write_global_data(sf, filename):
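    """Append declarations of the constructor kernel's global temporaries to the benchmark source file."""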
    opcounting = get_option("opcounter")
    with open(filename, "a") as f:
        # Get kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)
        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write("{}\n".format(next(iter(decl.generate()))))


def write_setup_code(sf, filename, define_thetas=True):
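    """Append setup code to the benchmark source file.

    This covers the polynomial objects, aligned input/output buffers, the
    fastdg inputs and the theta matrices needed by the sum factorization
    kernel.
    """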
    with open(filename, "a") as f:
        # Setup a polynomial object (normally done in the LocalOperator members)
        real = type_floatingpoint()
        f.write("  using RF = {};\n".format(real))
        f.write("  using DF = {};\n".format(real))

        from dune.codegen.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence_quadrature_permuted)
        for deg in set(degs):
            f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        alignment = get_option("max_vector_width") // 8
        size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width)
        size = int(size * (get_option("precision_bits") / 8))
        f.writelines(["  char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
                      "  char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment),
        # Setup fastdg inputs
        for arg in sf.interface.signature_args:
            if "jacobian" in arg:
                f.write("{} = 0;\n".format(arg))
            else:
                size = sf.interface.fastdg_interface_object_size
                f.write("  RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment))
        # Write stuff into the input buffer
        f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
                      "  {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      "  for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)),
                      "    input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        if define_thetas:
            for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
                f.write("  {}\n".format(next(iter(decl.generate()))))

        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write("  {}\n".format(line))

        # Add setup code for theta matrices. Some of the generated lines are
        # unnecessary, but removing them would be more work than keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                f.write("  {}\n".format(line))

        # Introduce a variable that makes sure the kernel cannot be optimized away
        f.writelines(["  {} accum;\n".format(real),
                      "  std::mt19937 rng;\n",
                      "  rng.seed(42);\n",
                      "  std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)),
                      ])


def generate_standalone_code_google_benchmark(sf, filename):
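    """Write a standalone benchmark for the sum factorization kernel `sf` to
    `filename`, using the Google benchmark library for the measurement."""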
    delete_cache_items("kernel_default")

    # Turn off opcounting
    with option_context(opcounter=False):
        # Extract sum factorization kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        # Extract the function signature from the kernel's first line.
        # TODO: This can probably be done in a safer way.
        first_line = knl.member.lines[0]
        arguments = first_line[first_line.find("(") + 1:first_line.find(")")]

        with open(filename, "w") as f:
            f.writelines(["// {}".format(first_line),
                          "\n",
                          "#include \"config.h\"\n",
                          "#include \"benchmark/benchmark.h\"\n",
                          "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                          "#include<dune/codegen/common/vectorclass.hh>\n",
                          "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                          "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                          "#include<random>\n",
                          "#include<fstream>\n",
                          "#include<iostream>\n",
                          "\n"
                          ])

        write_global_data(sf, filename)

        with open(filename, "a") as f:
            arguments = ', '.join(sf.interface.signature_args)
            if len(arguments) > 0:
                arguments = ', ' + arguments
            arguments = 'const char* buffer0, const char* buffer1' + arguments
            f.write("void sumfact_kernel({})\n".format(arguments))
            for line in knl.member.lines[1:]:
                f.write("{}\n".format(line))

            f.write("\n\n")
            f.write("static void BM_sumfact_kernel(benchmark::State& state){\n")

        write_setup_code(sf, filename, define_thetas=False)

        additional_arguments = [i.split()[-1] for i in sf.interface.signature_args]
        additional_arguments = ', '.join(additional_arguments)
        if len(additional_arguments) > 0:
            additional_arguments = ', ' + additional_arguments
        with open(filename, "a") as f:
            f.writelines(["  for (auto _ : state){\n",
                          "    sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments),
                          "  }\n",
                          "}\n",
                          "BENCHMARK(BM_sumfact_kernel);\n",
                          "\n",
                          "BENCHMARK_MAIN();"
                          ])


def generate_standalone_code(sf, filename):
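    """Write a standalone benchmark for the sum factorization kernel `sf` to
    `filename`. It measures with the TSC timer and writes the measured time
    per kernel call to the file given as its first command line argument."""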
    delete_cache_items("kernel_default")
    # Turn off opcounting
    with option_context(opcounter=False):
        # Extract sum factorization kernel
        from dune.codegen.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        first_line = knl.member.lines[0]

        with open(filename, "w") as f:
            f.writelines(["// {}".format(first_line),
                          "\n",
                          "#include \"config.h\"\n",
                          "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                          "#include<dune/codegen/common/tsc.hh>\n",
                          "#include<dune/codegen/common/vectorclass.hh>\n",
                          "#include<dune/codegen/sumfact/onedquadrature.hh>\n",
                          "#include<dune/codegen/sumfact/horizontaladd.hh>\n",
                          "#include<random>\n",
                          "#include<fstream>\n",
                          "#include<iostream>\n",
                          "\n"
                          ])

            f.writelines(["int main(int argc, char** argv)\n",
                          "{\n",
                          ])

        write_setup_code(sf, filename)

        # Write measurement
        with open(filename, "a") as f:
            # Start a TSC timer
            f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
                          ])

            # Add the implementation of the kernel.
            repeats = int(1e9 / sf.operations)
            f.write("  for(int i=0; i<{}; ++i)\n".format(repeats))
            f.write("  {\n")
            for line in knl.member.lines[1:]:
                f.write("    {}\n".format(line))
            f.write("  }\n")

            # Stop the TSC timer and write the result to a file
            f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
                          "  std::ofstream file;\n",
                          "  file.open(argv[1]);\n",
                          "  file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))),
                          "  file.close();\n",
                          "  accum += output[dis(rng)];\n",
                          "  std::cout << accum;\n",
                          "}\n",
                          ])


def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
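    """Generate a benchmark source file for a plain loopy kernel by filling
    in the placeholders of a benchmark template.

    Parameters
    ----------
    kernel: loopy.kernel.LoopKernel
    signature: list containing a single function signature string
    filename: str
    transformations: list of str, optional
        Recorded as comments in the generated source.
    """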
    # Turn off opcounting
    with option_context(opcounter=False):
        # Remove opcounter from signature
        p = re.compile('OpCounter::OpCounter<([^>]*)>')
        assert len(signature) == 1
        sig = signature[0]
        sig = p.sub(r'\1', sig)
        assert 'OpCounter' not in sig

        # Record which transformations were applied
        codegen_transformations = ''
        if transformations:
            for trafo in transformations:
                codegen_transformations += '// {}\n'.format(trafo)

        template = 'kernel_benchmark_template1.cc.in'
        use_datasets = True

        # Old benchmark template
        # template = 'kernel_benchmark_template0.cc.in'
        # use_datasets = False

        template_filename = pkg_resources.resource_filename(__name__, template)
        with open(template_filename, 'r') as f:
            benchmark = f.read()

        # Find function arguments and global arguments
        arguments = sig[sig.find('(') + 1:sig.find(')')].split(',')
        arguments = [a.split(' ')[-1] for a in arguments]
        global_args = [a for a in kernel.args if a.name not in arguments]
        buffer_arguments = [a for a in arguments if a.startswith('buff')]
        input_arguments = [a for a in arguments if a not in buffer_arguments]

        # Declare global arguments
        codegen_declare_global_arguments = ''
        target = DuneTarget()
        # The AST builder is also needed for the ValueArg handling further below
        ast_builder = target.get_device_ast_builder()
        for g in global_args:
            decl_info = g.decl_info(target, True, g.dtype)
            for idi in decl_info:
                arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name)
                arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape))
                arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl)
                codegen_declare_global_arguments += '{}\n'.format(arg_decl)
        codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, '  ')

        # Helper function for argument initialization
        def _initialize_arg(arg):
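            """Return C++ source lines that fill the given kernel argument with random values."""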
            if isinstance(arg, lp.ValueArg):
                return []
            real = type_floatingpoint()
            size = reduce(mul, arg.shape)
            fill_name = arg.name + '_fill'
            lines = ['  {0}* {1} = ({0} *) {2};'.format(real, fill_name, arg.name),
                     '  for (std::size_t i=0; i<{}; ++i){{'.format(size),
                     '    {}[i] = unif(re);'.format(fill_name),
                     '  }']
            return lines

        # Initialize global arguments
        codegen_initialize_global_arguments = ''
        for arg in global_args:
            lines = _initialize_arg(arg)
            codegen_initialize_global_arguments += '\n'.join(lines) + '\n'
        codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, '  ')

        # Function we want to benchmark
        codegen_benchmark_function = ''
        codegen_benchmark_function += sig[0:sig.find(')') + 1]
        codegen_benchmark_function += lp.generate_body(kernel)
        codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, '  ')

        # Declare function arguments
        codegen_declare_arguments = []
        codegen_declare_input = []
        function_arguments = [a for a in kernel.args if a.name in arguments]
        for arg in function_arguments:
            if 'buffer' in arg.name:
                byte_size = reduce(mul, arg.shape) * 8
                codegen_declare_arguments.append('  char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name,
                                                                                                          byte_size,
                                                                                                          arg.alignment),)
            elif isinstance(arg, lp.ValueArg):
                assert 'jacobian_offset' in arg.name
                decl = arg.get_arg_decl(ast_builder)
                decl = Initializer(decl, 'unif_int(re)')
                codegen_declare_arguments.append(('  {}\n'.format(decl)))
            else:
                assert 'fastdg' in arg.name
                size = reduce(mul, arg.shape)
                min_stride = min([tag.stride for tag in arg.dim_tags])
                size *= min_stride
                alignment = arg.dtype.itemsize
                real = type_floatingpoint()
                if use_datasets:
                    codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real,
                                                                                                                 arg.name,
                                                                                                                 size,
                                                                                                                 alignment)))
                else:
                    codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real,
                                                                                                       arg.name,
                                                                                                       size,
                                                                                                       alignment)))

        codegen_declare_arguments = ''.join(codegen_declare_arguments)
        codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, '  ')
        codegen_declare_input = ''.join(codegen_declare_input)
        codegen_declare_input = textwrap.indent(codegen_declare_input, '  ')

        # Initialize function arguments
        codegen_initialize_arguments = ''
        codegen_initialize_input = ''
        for arg in function_arguments:
            if 'fastdg' in arg.name:
                if use_datasets:
                    lines = _initialize_arg(arg)
                    lines = ['  ' + a for a in lines]
                    lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines]
                    lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){')
                    lines.append('}')
                    codegen_initialize_input += '\n'.join(lines) + '\n'
                else:
                    lines = _initialize_arg(arg)
                    codegen_initialize_arguments += '\n'.join(lines) + '\n'
            else:
                lines = _initialize_arg(arg)
                codegen_initialize_arguments += '\n'.join(lines) + '\n'
        codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, '  ')
        codegen_initialize_input = textwrap.indent(codegen_initialize_input, '  ')

        # Call the benchmark function
        if use_datasets:
            arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments]
            codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n'
            codegen_call_benchmark_function += '  ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n'
            for arg in input_arguments:
                codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg)
            codegen_call_benchmark_function += '}'
        else:
            codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n'
        codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, '    ')

        # Replace placeholders in benchmark template
        benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations)
        benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments)
        benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input)
        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments)
        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input)
        benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function)
        benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments)
        benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments)
        benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function)

        # Write benchmark source file
        with open(filename, 'w') as f:
            f.writelines(benchmark)


def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
    """Generate an microbenchmark, compile run and return time

    Parameters
    ----------
    sf: SumfactKernel or VectorizedSumfactKernel
    kernel: loopy.kernel.LoopKernel
    signature: str
    transformation: list of str
        Will be used to distinguish between autotune targets
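
    Example (a sketch; `sf` is a SumfactKernel instance produced by the
    surrounding code generation pipeline):

        measured_time = autotune_realization(sf=sf)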
    """
    if sf is None:
        assert kernel is not None
        assert signature is not None
    else:
        assert kernel is None
        assert signature is None

    logger = logging.getLogger(__name__)

    # Make sure that the benchmark directory exists
    dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    if not os.path.exists(dir):
        os.mkdir(dir)

    if sf is None:
        basename = "autotune_sumfact_{}".format(kernel.name)
    else:
        basename = "autotune_sumfact_{}".format(sf.function_name)
    if transformations:
        for trafo in transformations:
            basename = '{}_{}'.format(basename, trafo)
    basename = hashlib.sha256(basename.encode()).hexdigest()
    filename = os.path.join(dir, "{}.cc".format(basename))
    logname = os.path.join(dir, "{}.log".format(basename))
    lock = os.path.join(dir, "{}.lock".format(basename))
    executable = os.path.join(dir, basename)
    # Note: cache restoring is only necessary when generating from a SumfactKernel
    with cache_restoring():
        with filelock.FileLock(lock):
            if not os.path.isfile(logname):
                logger.debug('Generate autotune target in file {}'.format(filename))

                if sf is None:
                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
                elif get_option("autotune_google_benchmark"):
                    generate_standalone_code_google_benchmark(sf, filename)
                else:
                    generate_standalone_code(sf, filename)
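                # Check whether the user specified a compilation wrapper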
                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_COMPILATION_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                call.extend(compiler_invocation(executable, filename))
                devnull = open(os.devnull, 'w')
                os.environ['DUNE_CODEGEN_THREADS'] = '1'
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(call)))
                # File system synchronization!
                while not os.path.exists(executable):
                    time.sleep(0.01)

                # Check whether the user specified an execution wrapper
                call = []
                wrapper = get_cmake_cache_entry("DUNE_CODEGEN_BENCHMARK_EXECUTION_WRAPPER")
                if wrapper:
                    call.append(wrapper)

                # Run the benchmark program
                call.append(executable)
                if get_option("autotune_google_benchmark"):
                    call.append("--benchmark_out={}".format(logname))
                    call.append("--benchmark_repetitions=5")
                    # call.append("--benchmark_out_format=csv")
                else:
                    call.append(logname)
                ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
                if ret != 0:
                    raise CodegenAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call)))
                # File system synchronization!
                while not os.path.exists(logname):
                    time.sleep(0.01)

            # Extract the result from the log file
            if get_option("autotune_google_benchmark"):
                with open(logname) as json_file:
                    try:
                        data = json.load(json_file)
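                        # Take the smallest cpu_time over all repetitions and skip
                        # Google Benchmark's aggregate entries (*_mean, *_median, *_stddev)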
                        minimal_time = 1e80
                        for b in data['benchmarks']:
                            if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'):
                                pass
                            else:
                                if b['cpu_time'] < minimal_time:
                                    minimal_time = b['cpu_time']
                        assert minimal_time < 1e80
                        return minimal_time
                    except Exception as e:
                        print("Error while loading file {}".format(logname))
                        raise e
            else:
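                # The TSC-based executable writes a single timing value as the first line of its log file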
                return float(next(iter(open(logname, "r")))) / 1000000