autotune.py

""" Autotuning for sum factorization kernels """

from dune.perftool.generation import cache_restoring, delete_cache_items
from dune.perftool.loopy.target import DuneTarget
from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
from dune.perftool.options import get_option

import loopy as lp
from pytools import product

import os
import re
import subprocess


def get_cmake_cache_entry(entry):
    """ Look up the value of a variable in the CMakeCache.txt of the project

    The cache file is read from the directory given by the option
    "project_basedir". Lines of the form ``NAME:TYPE=VALUE`` are considered,
    where TYPE is one of INTERNAL, FILEPATH, BOOL, STRING, PATH or
    UNINITIALIZED.

    :arg entry: The name of the CMake cache variable
    :returns: The text following the ``=`` on the first matching line, or
        None if no line matches.
    """
    # The type tag is matched as an alternation of keywords. A character
    # class like [INTERNAL|FILEPATH|...] would instead accept any jumble of
    # these letters. The entry name is escaped so it is matched literally.
    pattern = re.compile(
        "{}:(?:INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED)=(.*)".format(re.escape(entry))
    )

    cache_file = os.path.join(get_option("project_basedir"), "CMakeCache.txt")
    # Use a context manager so the file is closed even on early return
    with open(cache_file, "r") as f:
        for line in f:
            match = pattern.match(line)
            if match:
                return match.group(1)

    return None


def compiler_invocation(name, filename):
    """ Assemble the command line that compiles and links a benchmark program

    The compiler is taken from the CMake cache, the compile flags from the
    flags.make file and the link flags from the link.txt file that CMake
    generated for the target ``_autotune_target`` in the ``python``
    subdirectory of the project base directory.

    :arg name: The name of the executable to produce (passed to ``-o``)
    :arg filename: The source file to compile
    :returns: A list of strings suitable for passing to subprocess
    """
    # Determine the CMake Generator in use
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    assert gen == "Unix Makefiles", "Autotuning requires the 'Unix Makefiles' generator, found: {}".format(gen)

    # Locations of the files generated by CMake for the autotune target
    python_dir = os.path.join(get_option("project_basedir"), "python")
    target_dir = os.path.join(python_dir, "CMakeFiles", "_autotune_target.dir")

    # Find compiler path
    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
    compile_flags = [compiler]

    # Parse compiler flags: Every 'KEY = VALUE' line contributes its value
    with open(os.path.join(target_dir, "flags.make"), "r") as f:
        for line in f:
            match = re.match("([^=]*)=(.*)", line)
            if match:
                compile_flags.extend(match.group(2).split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags: Everything after the target name on the link line
    with open(os.path.join(target_dir, "link.txt"), "r") as f:
        for line in f:
            match = re.match(".*_autotune_target (.*)", line)
            if match:
                for flag in match.group(1).split():
                    if flag.startswith("-") or os.path.isabs(flag):
                        compile_flags.append(flag)
                    else:
                        # Relative paths are interpreted relative to the
                        # python subdirectory of the project
                        compile_flags.append(os.path.join(python_dir, flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags


def generate_standalone_code(sf, filename, logname):
    """ Write a standalone C++ benchmark program for a sum factorization kernel

    The generated program sets up polynomials, buffers and the temporaries
    of the constructor kernel, runs the realized kernel 10000000 times in a
    loop surrounded by a TSC timer and writes the elapsed value into a log
    file.

    :arg sf: The sum factorization kernel object. This function reads its
        attributes matrix_sequence, vector_width, interface, within_inames
        and length.
    :arg filename: The path of the C++ source file to write
    :arg logname: The path of the log file that the generated program
        writes its timing result to
    """
    delete_cache_items("kernel_default")

    with open(filename, "w") as f:
        f.writelines(["#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/perftool/common/tsc.hh>\n",
                      "#include<dune/perftool/common/vectorclass.hh>\n",
                      "#include<dune/perftool/sumfact/onedquadrature.hh>\n",
                      "#include<dune/perftool/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "\n"
                      ])

        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

        # Setup a polynomial object (normally done in the LocalOperator members)
        from dune.perftool.loopy.target import type_floatingpoint
        real = type_floatingpoint()
        f.write("  using RF = {};\n".format(real))
        f.write("  using DF = {};\n".format(real))

        from dune.perftool.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
        for deg in set(degs):
            f.write("  Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        # Get kernels
        from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        # NOTE(review): size is a product of per-direction sizes and the
        # vector width, but is used as the length of a char array. Confirm
        # whether it should be scaled by the width of the floating point type.
        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
        f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                      "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                      ])

        # Setup fastdg inputs
        for arg in sf.interface.signature_args:
            if "jacobian" in arg:
                f.write("{} = 0;\n".format(arg))
            else:
                basis_size = product(m.basis_size for m in sf.matrix_sequence)
                if sf.within_inames:
                    basis_size = basis_size * basis_size
                # Use the (possibly squared) size computed above for the declaration
                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], basis_size))

        # Number of floating point entries that fit into one buffer. Integer
        # division is used throughout, so that the generated C++ code gets
        # integer literals regardless of the Python version.
        entries = size // (get_option("precision_bits") // 8)

        # Write stuff into the input buffer
        f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
                      "  {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      "  for(int i=0; i<{}; ++i)\n".format(entries),
                      "    input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        # Declare the temporaries of the constructor kernel
        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write("  {}\n".format(next(iter(decl.generate()))))

        # Copy the preambles of the constructor kernel, except those mentioning gfsu
        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write("  {}\n".format(line))

        # Add setup code for theta matrices. We add some lines not necessary,
        # but it would be more work to remove them than keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                f.write("  {}\n".format(line))

        # Introduces a variable that makes sure that the kernel cannot be optimized away.
        # It is initialized, because it is later updated with '+='.
        # NOTE(review): std::uniform_int_distribution samples from a closed
        # interval, so the upper bound itself may be drawn as an index into
        # output. Confirm whether the bound should be one smaller.
        f.writelines(["  {} accum = 0;\n".format(real),
                      "  std::mt19937 rng;\n",
                      "  rng.seed(42);\n",
                      "  std::uniform_int_distribution<> dis(0, {});\n".format(entries),
                      ])

        # Start a TSC timer
        f.writelines(["  auto start = Dune::PDELab::TSC::start();\n",
                      ])

        # Add the implementation of the kernel. The first line of the
        # realized kernel function is skipped.
        f.write("  for(int i=0; i<10000000; ++i)\n")
        f.write("  {\n")
        for line in knl.member.lines[1:]:
            f.write("    {}\n".format(line))
        f.write("  }\n")

        # Stop the TSC timer and write the result to a file
        f.writelines(["  auto stop = Dune::PDELab::TSC::stop();\n",
                      "  std::ofstream file;\n",
                      "  file.open(\"{}\");\n".format(logname),
                      "  file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
                      "  file.close();\n",
                      "  accum += output[dis(rng)];\n",
                      "  std::cout << accum;\n",
                      "}\n",
                      ])


def autotune_realization(sf):
    """ Benchmark a sum factorization kernel realization and return its cost

    A standalone benchmark program is generated, compiled and executed in
    the directory ``autotune-benchmarks`` below the project base directory.
    If a log file for this kernel already exists, its result is reused and
    nothing is regenerated or rerun.

    :arg sf: The sum factorization kernel object. Its function_name is used
        to name the benchmark source, executable and log file.
    :returns: The first line of the log file, converted to float and
        divided by 1000000.
    """
    # Make sure that the benchmark directory exists. The directory may be
    # created concurrently by another process, which is not an error.
    benchdir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    try:
        os.mkdir(benchdir)
    except OSError:
        if not os.path.isdir(benchdir):
            raise

    basename = "autotune_sumfact_{}".format(sf.function_name)
    name = os.path.join(benchdir, basename)
    filename = os.path.join(benchdir, "{}.cc".format(basename))
    logname = os.path.join(benchdir, "{}.log".format(basename))

    # If the log file already exists, we can reuse the benchmark results
    # and do not need to rerun it.
    if not os.path.isfile(logname):
        # Generate and compile a benchmark program
        with cache_restoring():
            generate_standalone_code(sf, filename, logname)

        ret = subprocess.call(compiler_invocation(name, filename))
        assert ret == 0, "Compilation of autotune benchmark {} failed".format(filename)

        # Check whether the user specified an execution wrapper
        call = []
        wrapper = get_cmake_cache_entry("DUNE_PERFTOOL_BENCHMARK_WRAPPER")
        if wrapper:
            call.append(wrapper)

        # Run the benchmark program, discarding its output
        call.append(name)
        with open(os.devnull, 'w') as devnull:
            ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)
        assert ret == 0, "Execution of autotune benchmark {} failed".format(name)

    # Extract the result from the log file
    with open(logname, "r") as log:
        return float(next(iter(log))) / 1000000