diff --git a/bin/donkey_benchmark_wrapper.py b/bin/donkey_benchmark_wrapper.py new file mode 100755 index 0000000000000000000000000000000000000000..7951b8b06bfdeb217bd3328c9b3b88f229bbc606 --- /dev/null +++ b/bin/donkey_benchmark_wrapper.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python + +import time +import sys +import subprocess +import os + +# Run the actual command +command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split() +command.extend(sys.argv[1:]) +ret = subprocess.call(command) + +# If that failed - fail! +if ret != 0: + sys.exit(ret) + +# If that was succesful, wait for the output file to be available on the filesystem +# This step is necessary because the NFS synchronization is too slow for our workflow. +while not os.path.isfile(sys.argv[2]): + time.sleep(0.1) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8b6d3318bc46f180eb7ef4a388a20cbcc65fbbe5..a4c8a3702eccd689568cd9def3f24462e09d3f4f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,3 +20,7 @@ dune_python_add_test(NAME pep8-ourcode ) add_subdirectory(test) + +# Add a dummy target to extract compiler flags for the autotune tool chain +add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc) +target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing) diff --git a/python/_autotune.cc b/python/_autotune.cc new file mode 100644 index 0000000000000000000000000000000000000000..0c87b37802d394fb8dafc0ca6ae150797e81199a --- /dev/null +++ b/python/_autotune.cc @@ -0,0 +1,2 @@ +int main() +{} diff --git a/python/dune/perftool/error.py b/python/dune/perftool/error.py index 4d428b41b516845c3eb18803539edbf293f44cf1..fb77655aef3689b27b2764d76a3e20c158cbf982 100644 --- a/python/dune/perftool/error.py +++ b/python/dune/perftool/error.py @@ -19,3 +19,7 @@ class PerftoolLoopyError(PerftoolError): class PerftoolVectorizationError(PerftoolCodegenError): pass + + +class PerftoolAutotuneError(PerftoolVectorizationError): + pass diff --git a/python/dune/perftool/generation/__init__.py b/python/dune/perftool/generation/__init__.py index fc9a6ce42f58f2eecae533a6b7add45dbce450c2..607b4055850d33f2fb78f2d03ee4be771caec904 100644 --- a/python/dune/perftool/generation/__init__.py +++ b/python/dune/perftool/generation/__init__.py @@ -54,6 +54,7 @@ from dune.perftool.generation.loopy import (barrier, valuearg, ) -from dune.perftool.generation.context import (global_context, +from dune.perftool.generation.context import (cache_restoring, + global_context, get_global_context_value, ) diff --git a/python/dune/perftool/generation/cache.py b/python/dune/perftool/generation/cache.py index b4ae54f28479d607907ac59c3c6078283535bc85..31b1b85e0da2797a158b4d49ed10bdf89d7dfe0b 100644 --- a/python/dune/perftool/generation/cache.py +++ b/python/dune/perftool/generation/cache.py @@ -268,7 +268,7 @@ def delete_cache_items(condition=True, keep=False): gen._memoize_cache = _filter_cache_items(gen, condition) -def retrieve_cache_functions(condition=True): +def retrieve_cache_functions(condition="True"): return [g.func for g in _generators if eval(condition, _ConditionDict(g.item_tags))] diff --git a/python/dune/perftool/generation/context.py b/python/dune/perftool/generation/context.py index 55feda422efbabe48c7663e87fea289585253cde..4be1125a2deb0a695f1815f82293d29dc81880f2 100644 --- a/python/dune/perftool/generation/context.py +++ b/python/dune/perftool/generation/context.py @@ -31,3 +31,23 @@ def global_context(**kwargs): def get_global_context_value(key, default=None): return _global_context_cache.get(key, default) + + +class _CacheRestoringContext(object): + def __enter__(self): + from dune.perftool.generation.cache import _generators as g + self.cache = {} + for i in g: + self.cache[i] = {} + for k, v in i._memoize_cache.items(): + self.cache[i][k] = v + + def __exit__(self, exc_type, exc_value, traceback): + for i, c in self.cache.items(): + i._memoize_cache = {} + for k, v in c.items(): + i._memoize_cache[k] = v + + +def cache_restoring(): + return _CacheRestoringContext() diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index cc21196fa1679ad109510e3a65155b0394c60d3d..9c551db9a6788b5628a3eb8c5eba287846b5a281 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -44,7 +44,6 @@ def dtype_floatingpoint(): raise NotImplementedError("{}bit floating point type".format(bits)) -@pt.memoize def numpy_to_cpp_dtype(key): _registry = {'float32': 'float', 'int32': 'int', diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index c0f4900b8fff2f77b7cbefd593214cfc8d3af87d..a62de9950cf6d3dff0a84b3bc52b2e1e33ed5299 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -83,7 +83,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") sumfact_regular_jacobians = PerftoolOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") - vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target") + vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune") vectorization_not_fully_vectorized_error = PerftoolOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index b9541a0f13e802c1f97a5f5ce5e33980327895cb..3bf16b24dd2e3fd5bd330319be2418770b6d3f93 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -243,6 +243,11 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): else: return () + @property + def fastdg_interface_object_size(self): + size = sum(_local_sizes(self.trial_element)) if self.trial_element else 1 + return size * sum(_local_sizes(self.test_element)) + def _local_sizes(element): from ufl import FiniteElement, MixedElement @@ -381,6 +386,10 @@ def generate_accumulation_instruction(expr, visitor): test_info = visitor.test_info trial_info = visitor.trial_info + # Count flops on the expression for the vectorization decision making algorithm + from dune.perftool.sumfact.vectorization import count_quadrature_point_operations + count_quadrature_point_operations(expr) + # Number of basis functions per direction leaf_element = test_info.element from ufl import MixedElement diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py new file mode 100644 index 0000000000000000000000000000000000000000..244873c24b0b64cd61f8b04a6c66500599b15670 --- /dev/null +++ b/python/dune/perftool/sumfact/autotune.py @@ -0,0 +1,225 @@ +""" Autotuning for sum factorization kernels """ + +from dune.perftool.generation import cache_restoring, delete_cache_items +from dune.perftool.loopy.target import DuneTarget +from dune.perftool.sumfact.realization import realize_sumfact_kernel_function +from dune.perftool.options import get_option, set_option +from dune.perftool.error import PerftoolAutotuneError + +import loopy as lp +from pytools import product + +import os +import re +import subprocess +import filelock + + +def get_cmake_cache_entry(entry): + for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): + match = re.match("{}:[INTERNAL|FILEPATH|BOOL|STRING|PATH|UNINITIALIZED|STATIC]+=(.*)".format(entry), line) + if match: + return match.groups()[0] + + +def get_perftool_dir(): + if get_cmake_cache_entry("CMAKE_PROJECT_NAME") == "dune-perftool": + return get_option("project_basedir") + else: + return get_cmake_cache_entry("dune-perftool_DIR") + + +def compiler_invocation(name, filename): + # Determine the CMake Generator in use + gen = get_cmake_cache_entry("CMAKE_GENERATOR") + assert(gen == "Unix Makefiles") + + # Find compiler path + compiler = get_cmake_cache_entry("CMAKE_CXX_COMPILER") + compile_flags = [compiler] + + # Parse compiler flags + for line in open(os.path.join(get_perftool_dir(), "python", "CMakeFiles", "_autotune_target.dir", "flags.make"), "r"): + match = re.match("([^=]*)=(.*)", line) + if match: + compile_flags.extend(match.groups()[1].split()) + + # Add the source file + compile_flags.append(filename) + + # Parse linker flags + for line in open(os.path.join(get_perftool_dir(), "python", "CMakeFiles", "_autotune_target.dir", "link.txt"), "r"): + match = re.match(".*_autotune_target (.*)", line) + if match: + for flag in match.groups()[0].split(): + if flag.startswith("-") or os.path.isabs(flag): + compile_flags.append(flag) + else: + compile_flags.append(os.path.join(get_perftool_dir(), "python", flag)) + + # Set an output name + compile_flags.append("-o") + compile_flags.append(name) + + return compile_flags + + +def generate_standalone_code(sf, filename): + delete_cache_items("kernel_default") + + with open(filename, "w") as f: + f.writelines(["#include \"config.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/perftool/common/tsc.hh>\n", + "#include<dune/perftool/common/vectorclass.hh>\n", + "#include<dune/perftool/sumfact/onedquadrature.hh>\n", + "#include<dune/perftool/sumfact/horizontaladd.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "#include<iostream>\n", + "\n" + ]) + + f.writelines(["int main(int argc, char** argv)\n", + "{\n", + ]) + + # Setup a polynomial object (normally done in the LocalOperator members) + opcounting = get_option("opcounter") + set_option("opcounter", False) + from dune.perftool.loopy.target import type_floatingpoint + real = type_floatingpoint() + f.write(" using RF = {};\n".format(real)) + f.write(" using DF = {};\n".format(real)) + + from dune.perftool.sumfact.tabulation import name_polynomials + degs = tuple(m.basis_size - 1 for m in sf.matrix_sequence) + for deg in set(degs): + f.write(" Dune::QkStuff::EquidistantLagrangePolynomials<DF, RF, {}> {};\n".format(deg, name_polynomials(deg))) + + # Get kernels + from dune.perftool.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) + constructor_knl = extract_kernel_from_cache("operator", "constructor_kernel", None, wrap_in_cgen=False, add_timings=False) + constructor_knl = constructor_knl.copy(target=DuneTarget(declare_temporaries=False)) + constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) + + # Allocate buffers + size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width, + product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width) + size = int(size * (get_option("precision_bits") / 8)) + f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size), + " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), + ]) + + # Setup fastdg inputs + for arg in sf.interface.signature_args: + if "jacobian" in arg: + f.write("{} = 0;\n".format(arg)) + else: + size = sf.interface.fastdg_interface_object_size + f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size)) + + # Write stuff into the input buffer + f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real), + " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2), + " for(int i=0; i<{}; ++i)\n".format(size / (get_option("precision_bits") / 8)), + " input[i] = ({})(i+1);\n".format(real), + ]) + + target = DuneTarget() + from loopy.codegen import CodeGenerationState + codegen_state = CodeGenerationState(kernel=constructor_knl, + implemented_data_info=None, + implemented_domain=None, + implemented_predicates=frozenset(), + seen_dtypes=frozenset(), + seen_functions=frozenset(), + seen_atomic_dtypes=frozenset(), + var_subst_map={}, + allow_complex=False, + is_generating_device_code=True, + ) + + for decl in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0): + f.write(" {}\n".format(next(iter(decl.generate())))) + + for _, line in constructor_knl.preambles: + if "gfsu" not in line: + f.write(" {}\n".format(line)) + + # Add setup code for theta matrices. We add some lines not necessary, + # but it would be more work to remove them than keeping them. + for line in lp.generate_body(constructor_knl).split("\n")[1:-1]: + if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line: + f.write(" {}\n".format(line)) + + # INtroduces a variable that makes sure that the kernel cannot be optimized away + f.writelines([" {} accum;\n".format(real), + " std::mt19937 rng;\n", + " rng.seed(42);\n", + " std::uniform_int_distribution<> dis(0, {});\n".format(size / (get_option("precision_bits") / 8)), + ]) + + # Start a TSC timer + f.writelines([" auto start = Dune::PDELab::TSC::start();\n", + ]) + + # Add the implementation of the kernel. + f.write(" for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations))) + f.write(" {\n") + for line in knl.member.lines[1:]: + f.write(" {}\n".format(line)) + f.write(" }\n") + + # Stop the TSC timer and write the result to a file + f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", + " std::ofstream file;\n", + " file.open(argv[1]);\n", + " file << Dune::PDELab::TSC::elapsed(start, stop) << std::endl;\n", + " file.close();\n", + " accum += output[dis(rng)];\n", + " std::cout << accum;\n", + "}\n", + ]) + set_option("opcounter", opcounting) + + +def autotune_realization(sf): + # Make sure that the benchmark directory exists + dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks") + if not os.path.exists(dir): + os.mkdir(dir) + + basename = "autotune_sumfact_{}".format(sf.function_name) + name = os.path.join(dir, "autotune_sumfact_{}".format(sf.function_name)) + filename = os.path.join(dir, "{}.cc".format(basename)) + logname = os.path.join(dir, "{}.log".format(basename)) + lock = "{}.lock".format(name) + + # Generate and compile a benchmark program + with cache_restoring(): + with filelock.FileLock(lock): + if not os.path.isfile(logname): + generate_standalone_code(sf, filename) + + ret = subprocess.call(compiler_invocation(name, filename)) + if ret != 0: + raise PerftoolAutotuneError("Compilation of autotune executable failed. Invocation: {}".format(" ".join(compiler_invocation(name, filename)))) + + # Check whether the user specified an execution wrapper + call = [] + wrapper = get_cmake_cache_entry("DUNE_PERFTOOL_BENCHMARK_WRAPPER") + if wrapper: + call.append(wrapper) + + # Run the benchmark program + call.append(name) + call.append(logname) + devnull = open(os.devnull, 'w') + ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT) + if ret != 0: + raise PerftoolAutotuneError("Execution of autotune benchmark failed. Invocation: {}".format(" ".join(call))) + + # Extract the result form the log file + return float(next(iter(open(logname, "r")))) / 1000000 diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index b4795478e990e5f82090f8a6346ee73168ede6e9..d846024d54e9f6b62785862803ab4e03498ac6ac 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -138,6 +138,11 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): else: return () + @property + def fastdg_interface_object_size(self): + from dune.perftool.sumfact.accumulation import _local_sizes + return sum(_local_sizes(self.element)) + def _basis_functions_per_direction(element): """Number of basis functions per direction """ diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 435975e41af8c29c0aaf078450f0046dd721137a..f1ef75074baa956e7a8fd3cb354b233495899501 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -124,6 +124,10 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): def function_name_suffix(self): return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces)) + @property + def fastdg_interface_object_size(self): + return self.interfaces[0].fastdg_interface_object_size + class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): def __init__(self, interfaces): @@ -212,6 +216,10 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): def function_name_suffix(self): return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces)) + @property + def fastdg_interface_object_size(self): + return self.interfaces[0].fastdg_interface_object_size + class SumfactKernelBase(object): pass diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index e913ed6243ff50f1e028bcff41dc2d2547e0c690..ef1c23da8c93d8db09fce419242892de68151d4f 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -373,13 +373,17 @@ def name_polynomials(degree): return name -@preamble(kernel="operator") def sort_quadrature_points_weights(qp, qw, bound): range_field = lop_template_range_field() domain_field = name_domain_field() include_file("dune/perftool/sumfact/onedquadrature.hh", filetag="operatorfile") - return "onedQuadraturePointsWeights<{}, {}, {}>({}, {});" \ - .format(range_field, domain_field, bound, qp, qw) + return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});" + .format(range_field, domain_field, bound, qp, qw), + assignees=frozenset({qp, qw}), + read_variables=frozenset({qp, qw}), + kernel="operator", + ), + }) @iname(kernel="operator") @@ -450,10 +454,11 @@ def define_theta(name, tabmat, additional_indices=(), width=None): args = [inames[1]] + dep = frozenset() if tabmat.face is None: qp = name_oned_quadrature_points(bound) qw = name_oned_quadrature_weights(bound) - sort_quadrature_points_weights(qp, qw, bound) + dep = sort_quadrature_points_weights(qp, qw, bound) args.append(prim.Subscript(prim.Variable(qp), (inames[0],))) else: args.append(tabmat.face) @@ -461,6 +466,7 @@ def define_theta(name, tabmat, additional_indices=(), width=None): instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices), expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)), kernel="operator", + depends_on=dep, ) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 27585679b456dd27a3470b999798fab02823eb3b..7d779eff11d82c4acf2fcd810223790537c8e7d6 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -12,6 +12,7 @@ from dune.perftool.generation import (backend, get_backend, get_counted_variable, get_global_context_value, + kernel_cached, ) from dune.perftool.pdelab.restriction import (Restriction, restricted_name, @@ -23,6 +24,7 @@ from dune.perftool.error import PerftoolVectorizationError from dune.perftool.options import get_form_option, get_option, set_form_option from dune.perftool.tools import add_to_frozendict, round_to_multiple, list_diff +from pymbolic.mapper.flop_counter import FlopCounter from pytools import product from frozendict import frozendict import itertools as it @@ -98,6 +100,20 @@ def target_costfunction(sf): return abs(realcost - ratio * target) +def accumulate_for_strategy(strategy, func, reduce=lambda x, y: x + y, start=0): + """ Accumulate a quantity over a given vectorization strategy """ + accum = start + + # Make sure to count each implementation exactly once + keys = set(sf.cache_key for sf in strategy.values()) + for sf in strategy.values(): + if sf.cache_key in keys: + accum = reduce(accum, func(sf)) + keys.discard(sf.cache_key) + + return accum + + def strategy_cost(strat_tuple): qp, strategy = strat_tuple @@ -109,20 +125,62 @@ def strategy_cost(strat_tuple): func = explicit_costfunction elif s == "target": func = target_costfunction + elif s == "autotune": + from dune.perftool.sumfact.autotune import autotune_realization + func = autotune_realization else: raise NotImplementedError("Vectorization strategy '{}' unknown!".format(s)) keys = set(sf.cache_key for sf in strategy.values()) set_quadrature_points(qp) - # Sum over all the sum factorization kernels in the realization - score = 0.0 - for sf in strategy.values(): - if sf.cache_key in keys: - score = score + float(func(sf)) - keys.discard(sf.cache_key) + return accumulate_for_strategy(strategy, lambda sf: float(func(sf))) + + +class PrimitiveApproximateOpcounter(FlopCounter): + def map_sumfact_kernel(self, expr): + return 0 + + def map_tagged_variable(self, expr): + return self.map_variable(expr) + + +@kernel_cached +def store_operation_count(expr, count): + return count + + +def count_quadrature_point_operations(expr): + counter = PrimitiveApproximateOpcounter() + store_operation_count(expr, counter(expr)) + + +def quadrature_penalized_strategy_cost(strat_tuple): + """ Implements a penalization of the cost function that accounts for + the increase in flops that occur in the quadrature loop. This needs to + somehow get a guess of how much work is done in the quadrature loop relative + to the sum factorization kernels. + """ + qp, strategy = strat_tuple + + # Evaluate the original cost function. This result will be scaled by this function. + cost = strategy_cost(strat_tuple) + + # Get the total number of Flops done in sum factorization kernels + sf_flops = accumulate_for_strategy(strategy, lambda sf: sf.operations) + + # Get the minimal possible number of quadrature points and the actual quadrature points + num_qp_new = product(qp) + set_quadrature_points(None) + num_qp_old = product(quadrature_points_per_direction()) + set_quadrature_points(qp) + + # Determine the number of floating point operations per quadrature point. + # This flop counting is a very crude approximation, but that is totally sufficient here. + ops_per_qp = sum(i.value for i in store_operation_count._memoize_cache.values()) - return score + # Do the actual scaling. + return float((sf_flops + ops_per_qp * num_qp_new) / (sf_flops + ops_per_qp * num_qp_old)) * cost def fixedqp_strategy_costfunction(qp): @@ -296,7 +354,7 @@ def level1_optimal_vectorization_strategy(sumfacts, width): with open("mapping.csv", 'a') as f: f.write(" ".join((identifier, str(cost), short_stringify_vectorization_strategy((qp, optimal_strategies[qp])))) + "\n") else: - qp = min(optimal_strategies, key=lambda qp: strategy_cost((qp, optimal_strategies[qp]))) + qp = min(optimal_strategies, key=lambda qp: quadrature_penalized_strategy_cost((qp, optimal_strategies[qp]))) return qp, optimal_strategies[qp] diff --git a/python/setup.py b/python/setup.py index 476a9f6f7b5399c04837af8eb5e627f4182156f2..e4517d2f7f6c1b57b7f57f69c90b20a5f4a05f3f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ setup(name='dune.perftool', 'dune.perftool.ufl', 'dune.perftool.ufl.transformations', ], - install_requires=['dune.testtools', 'sympy', 'frozendict', 'pytest', 'pytest-pep8'], + install_requires=['dune.testtools', 'sympy', 'frozendict', 'pytest', 'pytest-pep8', 'filelock'], cmdclass={'test': PyTest}, entry_points = { "console_scripts": [ diff --git a/test/sumfact/poisson/poisson_3d.mini b/test/sumfact/poisson/poisson_3d.mini index e3e6da7d29475cedd70db44a2b667d6171ae5c80..bf7b0f3bfdc7823e970694ecead48e6ecb4d7f6a 100644 --- a/test/sumfact/poisson/poisson_3d.mini +++ b/test/sumfact/poisson/poisson_3d.mini @@ -4,7 +4,7 @@ __exec_suffix = {deg_suffix}_{diff_suffix}_{quadvec_suffix}_{gradvec_suffix} deg_suffix = deg{formcompiler.ufl_variants.degree} diff_suffix = numdiff, symdiff | expand num quadvec_suffix = quadvec, nonquadvec | expand quad -gradvec_suffix = gradvec, nongradvec | expand grad +gradvec_suffix = gradvec, autotunevec, nongradvec | expand grad cells = 8 8 8 extension = 1. 1. 1. @@ -21,7 +21,7 @@ compare_l2errorsquared = 1e-4, 1e-8 | expand deg numerical_jacobian = 1, 0 | expand num sumfact = 1 vectorization_quadloop = 1, 0 | expand quad -vectorization_strategy = explicit, none | expand grad +vectorization_strategy = explicit, autotune, none | expand grad [formcompiler.ufl_variants] degree = 1, 2 | expand deg