Skip to content
Snippets Groups Projects
Commit 4244bd29 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Add a first draft of JIT compiled autotuning

parent 467f2a3e
No related branches found
No related tags found
No related merge requests found
...@@ -40,5 +40,7 @@ add_subdirectory(test) ...@@ -40,5 +40,7 @@ add_subdirectory(test)
add_subdirectory(bin) add_subdirectory(bin)
add_subdirectory(applications) add_subdirectory(applications)
add_executable(autotune_target EXCLUDE_FROM_ALL autotune.cc)
# finalize the dune project, e.g. generating config.h etc. # finalize the dune project, e.g. generating config.h etc.
finalize_dune_project(GENERATE_CONFIG_H_CMAKE) finalize_dune_project(GENERATE_CONFIG_H_CMAKE)
// Intentionally empty program: it does no work and exits successfully.
int main()
{
  return 0;
}
...@@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): ...@@ -82,7 +82,7 @@ class PerftoolFormOptionsArray(ImmutableRecord):
sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)")
vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target") vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune")
vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize")
vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
......
...@@ -459,7 +459,7 @@ def generate_kernel(integrals): ...@@ -459,7 +459,7 @@ def generate_kernel(integrals):
# Delete the cache contents and do the real thing! # Delete the cache contents and do the real thing!
logger.debug('generate_kernel: visit_integrals (no dry run)') logger.debug('generate_kernel: visit_integrals (no dry run)')
from dune.perftool.generation import delete_cache_items from dune.perftool.generation import delete_cache_items
delete_cache_items("kernel_default") delete_cache_items("kernel_default or member")
for integral in integrals: for integral in integrals:
visit_integral(integral) visit_integral(integral)
......
""" Autotuning for sum factorization kernels """
from dune.perftool.generation import delete_cache_items
from dune.perftool.loopy.target import DuneTarget
from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
from dune.perftool.options import get_option
import loopy as lp
from pytools import product
import os
import re
import subprocess
def get_cmake_cache_entry(entry):
    """Look up the value of a variable in the project's CMakeCache.txt.

    The cache file is expected in the directory given by the
    ``project_basedir`` option and consists of lines of the form
    ``NAME:TYPE=VALUE``.

    :arg entry: Name of the cache variable, e.g. ``"CMAKE_CXX_COMPILER"``.
    :returns: The value of the first matching line as a string (without the
        trailing newline), or ``None`` if no line matches.
    """
    # The type tag is matched as a proper alternation. A character class such
    # as ``[BOOL|PATH]+`` would accept any jumble of those letters. The entry
    # name is escaped so that it is always matched literally.
    pattern = re.compile(
        r"{}:(?:INTERNAL|FILEPATH|BOOL|STRING|PATH|STATIC|UNINITIALIZED)=(.*)".format(re.escape(entry))
    )
    cache_file = os.path.join(get_option("project_basedir"), "CMakeCache.txt")
    # Use a context manager so the file handle is closed even on early return.
    with open(cache_file, "r") as cache:
        for line in cache:
            match = pattern.match(line)
            if match:
                return match.group(1)
    return None
def compiler_invocation(name, filename):
    """Assemble the command line that compiles and links a benchmark program.

    The compiler is read from the CMake cache; compile and link flags are
    scraped from the ``flags.make`` and ``link.txt`` files that the
    "Unix Makefiles" generator wrote for the ``_autotune_target`` target in
    the ``python`` subdirectory of the build tree.

    :arg name: Name of the executable to produce (passed to ``-o``).
    :arg filename: The source file to compile.
    :returns: A list of strings suitable for ``subprocess.call``.
    :raises AssertionError: If the CMake generator in use is not
        "Unix Makefiles".
    """
    # Determine the CMake Generator in use. This is raised explicitly (rather
    # than via an assert statement) so that the check survives ``python -O``.
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    if gen != "Unix Makefiles":
        raise AssertionError("Autotuning requires the 'Unix Makefiles' CMake generator, found: {}".format(gen))

    # Find compiler path
    compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER")
    compile_flags = [compiler]

    python_dir = os.path.join(get_option("project_basedir"), "python")
    target_dir = os.path.join(python_dir, "CMakeFiles", "_autotune_target.dir")

    # Parse compiler flags: take everything right of the first '=' of each
    # variable assignment in flags.make
    with open(os.path.join(target_dir, "flags.make"), "r") as flagfile:
        for line in flagfile:
            match = re.match("([^=]*)=(.*)", line)
            if match:
                compile_flags.extend(match.groups()[1].split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags: everything that follows the target name in link.txt
    with open(os.path.join(target_dir, "link.txt"), "r") as linkfile:
        for line in linkfile:
            match = re.match(".*_autotune_target (.*)", line)
            if match:
                for flag in match.groups()[0].split():
                    if flag.startswith("-") or os.path.isabs(flag):
                        compile_flags.append(flag)
                    else:
                        # Relative paths are taken relative to the directory
                        # the link command was generated for.
                        compile_flags.append(os.path.join(python_dir, flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags
def generate_standalone_code(sf, filename, logname):
    """Write a standalone C++ benchmark program for a sum factorization kernel.

    The generated program sets up the polynomials and the data produced by the
    operator's constructor kernel, runs the realized kernel 10000000 times
    between a TSC start/stop pair and writes the elapsed value to a log file.

    :arg sf: The sum factorization kernel object. Its ``matrix_sequence``,
        ``vector_width`` and ``length`` attributes are read here.
    :arg filename: Path of the C++ source file to write.
    :arg logname: File name the generated program writes its timing to.
    """
    delete_cache_items("kernel_default")

    with open(filename, "w") as f:
        # <iostream> is included explicitly because the program uses std::cout
        f.writelines(["#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/perftool/common/tsc.hh>\n",
                      "#include<dune/perftool/common/vectorclass.hh>\n",
                      "#include<dune/perftool/sumfact/onedquadrature.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      "#include<iostream>\n",
                      "\n"
                      ])

        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

        # Setup a polynomial object (normally done in the LocalOperator members)
        from dune.perftool.loopy.target import type_floatingpoint
        real = type_floatingpoint()
        f.write(" using RF = {};\n".format(real))
        f.write(" using DF = {};\n".format(real))

        from dune.perftool.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
        for deg in set(degs):
            f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        # Get kernels
        from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
        f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                      " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                      ])

        # Number of floating point entries that fit into one buffer. Floor
        # division keeps this an integer on Python 3 as well, so that no float
        # literal ends up in the generated C++ code.
        num_entries = size // (get_option("precision_bits") // 8)

        # Write stuff into the input buffer
        f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
                      " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      " for(int i=0; i<{}; ++i)\n".format(num_entries),
                      " input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write(" {}\n".format(next(iter(decl.generate()))))

        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write(" {}\n".format(line))

        # Add setup code for theta matrices. We add some lines not necessary,
        # but it would be more work to remove them than keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line:
                f.write(" {}\n".format(line))

        # Introduces a variable that makes sure that the kernel cannot be
        # optimized away. It is initialized because it is only ever updated
        # with +=. std::uniform_int_distribution draws from the closed interval
        # [a, b], so the upper bound must be the last valid index.
        f.writelines([" {} accum = 0;\n".format(real),
                      " std::mt19937 rng;\n",
                      " rng.seed(42);\n",
                      " std::uniform_int_distribution<> dis(0, {});\n".format(num_entries - 1),
                      ])

        # Start a TSC timer
        f.writelines([
            " auto start = Dune::PDELab::TSC::start();\n",
        ])

        # Add the implementation of the kernel.
        f.write(" for(int i=0; i<10000000; ++i)\n")
        f.write(" {\n")
        for line in knl.member.lines[1:]:
            f.write(" {}\n".format(line))
        f.write(" }\n")

        # Stop the TSC timer and write the result to a file
        f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
                      " std::ofstream file;\n",
                      " file.open(\"{}\");\n".format(logname),
                      " file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
                      " file.close();\n",
                      " accum += output[dis(rng)];\n",
                      " std::cout << accum;\n",
                      "}\n",
                      ])
def autotune_realization(sf):
    """Measure a sum factorization kernel by compiling and running a benchmark.

    A standalone C++ program for the kernel is generated, compiled with the
    project's compiler settings and executed in the current working
    directory. The program writes its timing to a log file.

    :arg sf: The sum factorization kernel object; its ``function_name`` is
        used to name the generated source, executable and log file.
    :returns: The first line of the log file as a float, divided by 1000000.
    :raises AssertionError: If compiling or running the benchmark fails.
    """
    name = "autotune_sumfact_{}".format(sf.function_name)
    filename = "{}.cc".format(name)
    logname = "{}.log".format(name)

    # Generate and compile a benchmark program. Failures are raised explicitly
    # (not via assert statements) so they are not ignored under ``python -O``.
    generate_standalone_code(sf, filename, logname)
    ret = subprocess.call(compiler_invocation(name, filename))
    if ret != 0:
        raise AssertionError("Compilation of autotune benchmark {} failed with exit code {}".format(filename, ret))

    # Run the benchmark program, discarding its console output
    with open(os.devnull, 'w') as devnull:
        ret = subprocess.call(["./{}".format(name)], stdout=devnull, stderr=subprocess.STDOUT)
    if ret != 0:
        raise AssertionError("Autotune benchmark {} failed with exit code {}".format(name, ret))

    # Extract the result from the log file
    with open(logname, "r") as log:
        return float(next(iter(log))) / 1000000
...@@ -362,13 +362,17 @@ def name_polynomials(degree): ...@@ -362,13 +362,17 @@ def name_polynomials(degree):
return name return name
@preamble(kernel="operator")
def sort_quadrature_points_weights(qp, qw, bound): def sort_quadrature_points_weights(qp, qw, bound):
range_field = lop_template_range_field() range_field = lop_template_range_field()
domain_field = name_domain_field() domain_field = name_domain_field()
include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile") include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile")
return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \ return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \
.format(range_field, domain_field, bound, qp, qw) .format(range_field, domain_field, bound, qp, qw),
assignees=frozenset({qp, qw}),
read_variables=frozenset({qp, qw}),
kernel="operator",
),
})
@iname(kernel="operator") @iname(kernel="operator")
...@@ -439,10 +443,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None): ...@@ -439,10 +443,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
args = [inames[1]] args = [inames[1]]
dep = frozenset()
if tabmat.face is None: if tabmat.face is None:
qp = name_oned_quadrature_points(bound) qp = name_oned_quadrature_points(bound)
qw = name_oned_quadrature_weights(bound) qw = name_oned_quadrature_weights(bound)
sort_quadrature_points_weights(qp, qw, bound) dep = sort_quadrature_points_weights(qp, qw, bound)
args.append(prim.Subscript(prim.Variable(qp), (inames[0],))) args.append(prim.Subscript(prim.Variable(qp), (inames[0],)))
else: else:
args.append(tabmat.face) args.append(tabmat.face)
...@@ -450,6 +455,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None): ...@@ -450,6 +455,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None):
instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices), instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices),
expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)), expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)),
kernel="operator", kernel="operator",
depends_on=dep,
) )
......
...@@ -110,6 +110,9 @@ def strategy_cost(strat_tuple): ...@@ -110,6 +110,9 @@ def strategy_cost(strat_tuple):
func = explicit_costfunction func = explicit_costfunction
elif s == "target": elif s == "target":
func = target_costfunction func = target_costfunction
elif s == "autotune":
from dune.perftool.sumfact.autotune import autotune_realization
func = autotune_realization
else: else:
raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s)) raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment