Newer
Older
""" Autotuning for sum factorization kernels """
from dune.perftool.generation import cache_restoring, delete_cache_items
from dune.perftool.loopy.target import DuneTarget
from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
from dune.perftool.options import get_option
import loopy as lp
from pytools import product
import os
import re
import subprocess
def get_cmake_cache_entry(entry):
    """Look up the value of a variable in the project's CMakeCache.txt.

    The cache file is read from the directory named by the
    ``project_basedir`` option. Cache lines have the form
    ``NAME:TYPE=VALUE``.

    :arg entry: Name of the cache variable to look up.
    :returns: The value (text after ``=``) of the first matching line,
        or ``None`` if no line matches.
    """
    # The type must be an alternation group: a bracketed character class
    # would accept any mix of these letters instead of the listed keywords.
    # The entry name is escaped so that it is always matched literally.
    pattern = re.compile(
        "{}:(?:INTERNAL|FILEPATH|BOOL|STRING|PATH|STATIC|UNINITIALIZED)=(.*)".format(re.escape(entry))
    )

    cache_file = os.path.join(get_option("project_basedir"), "CMakeCache.txt")
    with open(cache_file, "r") as f:
        for line in f:
            match = pattern.match(line)
            if match:
                # '.' does not match the newline, so the value comes
                # without the line terminator.
                return match.group(1)

    return None
def compiler_invocation(name, filename):
    """Assemble the command line that compiles and links one source file.

    The compiler, its flags and the link line are taken from the CMake
    build tree of the ``_autotune_target`` target below
    ``<project_basedir>/python``.

    :arg name: Path of the executable to produce (passed after ``-o``).
    :arg filename: Path of the source file to compile.
    :returns: The command as a list of strings, compiler first.
    :raises AssertionError: If the CMake generator is not
        ``Unix Makefiles``.
    """
    # The files parsed below (flags.make, link.txt) are only looked for in
    # the layout of this one generator.
    gen = get_cmake_cache_entry("CMAKE_GENERATOR")
    assert gen == "Unix Makefiles", "Autotuning requires the 'Unix Makefiles' generator, found {!r}".format(gen)

    # Find compiler path
    compile_flags = [get_cmake_cache_entry("CMAKE_CXX_COMPILER")]

    python_dir = os.path.join(get_option("project_basedir"), "python")
    target_dir = os.path.join(python_dir, "CMakeFiles", "_autotune_target.dir")

    # Parse compiler flags: every 'KEY = value' line contributes the
    # whitespace-separated tokens of its value.
    with open(os.path.join(target_dir, "flags.make"), "r") as f:
        for line in f:
            match = re.match("([^=]*)=(.*)", line)
            if match:
                compile_flags.extend(match.group(2).split())

    # Add the source file
    compile_flags.append(filename)

    # Parse linker flags: take everything following the target name.
    with open(os.path.join(target_dir, "link.txt"), "r") as f:
        for line in f:
            match = re.match(".*_autotune_target (.*)", line)
            if not match:
                continue
            for flag in match.group(1).split():
                if flag.startswith("-") or os.path.isabs(flag):
                    compile_flags.append(flag)
                else:
                    # Anything else is treated as a path relative to the
                    # 'python' subdirectory of the project.
                    compile_flags.append(os.path.join(python_dir, flag))

    # Set an output name
    compile_flags.append("-o")
    compile_flags.append(name)

    return compile_flags
def generate_standalone_code(sf, filename, logname):
    """Write a standalone C++ benchmark program for a sum factorization kernel.

    The generated ``main`` sets up the polynomial objects, buffers and
    setup code taken from the operator's constructor kernel, runs the
    kernel body in a loop between a TSC start/stop pair and writes the
    elapsed count to a log file.

    :arg sf: The sum factorization kernel object to benchmark.
    :arg filename: Path of the C++ source file to write.
    :arg logname: Path of the file the compiled program writes its
        timing result to.
    """
    # NOTE(review): presumably discards previously generated default kernels
    # so that the kernel is realized afresh below -- confirm.
    delete_cache_items("kernel_default")

    with open(filename, "w") as f:
        f.writelines(["#include \"config.h\"\n",
                      "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n",
                      "#include<dune/perftool/common/tsc.hh>\n",
                      "#include<dune/perftool/common/vectorclass.hh>\n",
                      "#include<dune/perftool/sumfact/onedquadrature.hh>\n",
                      "#include<dune/perftool/sumfact/horizontaladd.hh>\n",
                      "#include<random>\n",
                      "#include<fstream>\n",
                      # std::cout is used at the end of the generated main
                      "#include<iostream>\n",
                      "\n"
                      ])

        f.writelines(["int main(int argc, char** argv)\n",
                      "{\n",
                      ])

        # Setup a polynomial object (normally done in the LocalOperator members)
        from dune.perftool.loopy.target import type_floatingpoint
        real = type_floatingpoint()
        f.write(" using RF = {};\n".format(real))
        f.write(" using DF = {};\n".format(real))

        from dune.perftool.sumfact.tabulation import name_polynomials
        degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence)
        # One polynomial object per distinct degree
        for deg in set(degs):
            f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg)))

        # Get kernels
        from dune.perftool.pdelab.localoperator import extract_kernel_from_cache
        knl = realize_sumfact_kernel_function(sf)
        constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False)
        constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False))
        constructor_knl = lp.get_one_scheduled_kernel(constructor_knl)

        # Allocate buffers
        # NOTE(review): size counts values (times the vector width), yet the
        # buffers are declared as char arrays of that length -- confirm
        # whether scaling by the byte width of the floating point type is
        # intended here.
        size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
                   product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
        f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                      " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                      ])

        # Number of floating point values that fit into one buffer. Floor
        # division keeps this an integer, as it is written into C++ integer
        # contexts below.
        num_values = size // (get_option("precision_bits") // 8)

        # Setup fastdg inputs
        for arg in sf.interface.signature_args:
            if "jacobian" in arg:
                f.write("{} = 0;\n".format(arg))
            else:
                basis_size = product(m.basis_size for m in sf.matrix_sequence)
                if sf.within_inames:
                    basis_size = basis_size * basis_size
                # Use the size computed above, including the squared case
                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], basis_size))

        # Write stuff into the input buffer
        f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),
                      " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2),
                      " for(int i=0; i<{}; ++i)\n".format(num_values),
                      " input[i] = ({})(i+1);\n".format(real),
                      ])

        target = DuneTarget()
        from loopy.codegen import CodeGenerationState
        codegen_state = CodeGenerationState(kernel=constructor_knl,
                                            implemented_data_info=None,
                                            implemented_domain=None,
                                            implemented_predicates=frozenset(),
                                            seen_dtypes=frozenset(),
                                            seen_functions=frozenset(),
                                            seen_atomic_dtypes=frozenset(),
                                            var_subst_map={},
                                            allow_complex=False,
                                            is_generating_device_code=True,
                                            )

        # Declarations of the constructor kernel's temporaries
        for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0):
            f.write(" {}\n".format(next(iter(decl.generate()))))

        # Preamble lines of the constructor kernel, skipping those mentioning 'gfsu'
        for _, line in constructor_knl.preambles:
            if "gfsu" not in line:
                f.write(" {}\n".format(line))

        # Add setup code for theta matrices. We add some lines not necessary,
        # but it would be more work to remove them than keeping them.
        for line in lp.generate_body(constructor_knl).split("\n")[1:-1]:
            if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line:
                f.write(" {}\n".format(line))

        # Introduces a variable that makes sure that the kernel cannot be optimized away.
        # std::uniform_int_distribution draws from a closed interval, so the
        # upper bound has to be the last valid index of the output buffer.
        f.writelines([" {} accum;\n".format(real),
                      " std::mt19937 rng;\n",
                      " rng.seed(42);\n",
                      " std::uniform_int_distribution<> dis(0, {});\n".format(num_values - 1),
                      ])

        # Start a TSC timer; 'start' is consumed by the elapsed() call below
        f.writelines([" auto start = Dune::PDELab::TSC::start();\n",
                      ])

        # Add the implementation of the kernel.
        f.write(" for(int i=0; i<10000000; ++i)\n")
        f.write(" {\n")
        # The first line of the kernel member is skipped
        for line in knl.member.lines[1:]:
            f.write(" {}\n".format(line))
        f.write(" }\n")

        # Stop the TSC timer and write the result to a file
        f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n",
                      " std::ofstream file;\n",
                      " file.open(\"{}\");\n".format(logname),
                      " file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n",
                      " file.close();\n",
                      " accum += output[dis(rng)];\n",
                      " std::cout << accum;\n",
                      "}\n",
                      ])
def autotune_realization(sf):
    """Benchmark one sum factorization kernel realization.

    A standalone benchmark program is generated, compiled and executed in
    ``<project_basedir>/autotune-benchmarks``. If a log file for this
    kernel already exists there, its result is reused and nothing is
    regenerated or rerun.

    :arg sf: The sum factorization kernel object; its ``function_name``
        determines the names of the benchmark files.
    :returns: The number on the first line of the log file, divided by
        1000000, as a float.
    :raises AssertionError: If compiling the benchmark program fails.
    """
    # Make sure that the benchmark directory exists
    benchmark_dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks")
    if not os.path.exists(benchmark_dir):
        os.mkdir(benchmark_dir)

    basename = "autotune_sumfact_{}".format(sf.function_name)
    name = os.path.join(benchmark_dir, basename)
    filename = os.path.join(benchmark_dir, "{}.cc".format(basename))
    logname = os.path.join(benchmark_dir, "{}.log".format(basename))

    # If the log file already exists, we can reuse the benchmark results
    # and do not need to rerun it.
    if not os.path.isfile(logname):
        # Generate and compile a benchmark program
        with cache_restoring():
            generate_standalone_code(sf, filename, logname)

        ret = subprocess.call(compiler_invocation(name, filename))
        assert ret == 0, "Compilation of {} failed".format(filename)

        # Check whether the user specified an execution wrapper
        call = []
        wrapper = get_cmake_cache_entry("DUNE_PERFTOOL_BENCHMARK_WRAPPER")
        if wrapper:
            call.append(wrapper)
        call.append(name)

        # Run the benchmark with its output discarded. The exit status is
        # not checked: a run that produced no log file surfaces as an
        # error when the log is opened below.
        with open(os.devnull, "w") as devnull:
            subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT)

    # Extract the result from the log file
    with open(logname, "r") as log:
        return float(next(iter(log))) / 1000000