diff --git a/CMakeLists.txt b/CMakeLists.txt index a496109044e05590d8458a73f2c2a5e7c2d78074..20e7d8a8b1ee12d80245e296a726d6eceffa5040 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.1.3) project(dune-codegen CXX) if(NOT (dune-common_DIR OR dune-common_ROOT OR @@ -21,6 +21,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDUNE_ISTL_SUPPORT_OLD_CATEGORY_INTERFA # start a dune project with information from dune.module dune_project() +dune_python_force_version(3) + dune_add_library(dunecodegen dune/codegen/common/tsc.cc) dune_target_enable_all_packages(dunecodegen) diff --git a/README.md b/README.md index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,25 @@ ctest Note that this takes quite a while. +## Building and Running dune-codegen in an offline environment + +dune-codegen relies on installing Python packages into self-contained environments +during its configuration and build process. In order to do this in an offline +environment, we recommend using the tool `devpi`. One of its use cases is to provide +a local mirror for the Python package index. A quickstart tutorial for this use case +is available [5]. It boils down to the following: + +* Installing the `devpi-server` package through your favorite method +* Setting up a local server with `devpi-server --init` +* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service. +* Have the environment variable `PIP_INDEX_URL` to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration): +``` +export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/ +``` + +At first installation, the locally mirrored package index will access PyPI. +Later on, it will install packages from its local cache. + ## Links [0]: https://git-lfs.github.com/ @@ -112,3 +131,4 @@ Note that this takes quite a while. [2]: https://gitlab.dune-project.org/quality/dune-testtools [3]: http://isl.gforge.inria.fr/ [4]: https://www.dune-project.org/doc/installation/ +[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake index da3225866785c75a8cf73e6aa78b6e3e0eea42f9..cff09c5ec49c0675680b5f49c1f24f9008c93cc5 100644 --- a/cmake/modules/DuneCodegenMacros.cmake +++ b/cmake/modules/DuneCodegenMacros.cmake @@ -116,6 +116,11 @@ function(dune_add_generated_executable) message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. This usually indicates a typo.") endif() + set(MPI_OPTION "0") + if(MPI_FOUND) + set(MPI_OPTION "1") + endif() + # Apply defaults and enforce requirements if(NOT GEN_TARGET) message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable") @@ -139,6 +144,7 @@ function(dune_add_generated_executable) --target-name ${GEN_TARGET} --driver-file ${GEN_SOURCE} --project-basedir ${CMAKE_BINARY_DIR} + --with-mpi ${MPI_OPTION} ${GEN_FORM_COMPILER_ARGS} DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} COMMENT "Generating driver for the target ${GEN_TARGET}" @@ -172,10 +178,8 @@ function(dune_add_generated_executable) endif() # Parse a mapping of operators to build and their respective filenames - dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} - OUTPUT_VARIABLE depdata - ) - parse_python_data(PREFIX depdata INPUT ${depdata}) + dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log) + parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) if(DUNE_CODEGEN_PROFILING) # This is a bit silly, but cProfile only finds entry point scripts @@ -199,6 +203,7 @@ function(dune_add_generated_executable) --ini-file ${GEN_INIFILE} --target-name ${GEN_TARGET} --operator-to-build ${op} + --with-mpi ${MPI_OPTION} ${ANALYZE_GRID_OPTION} DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE} COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}" diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake index 87af7161c8b9c8196dbddc3ae42e55bcffd3bd81..2a1907950a7f5bd68e85a5c536465a73634828cf 100644 --- a/cmake/modules/GeneratedSystemtests.cmake +++ b/cmake/modules/GeneratedSystemtests.cmake @@ -119,9 +119,14 @@ function(dune_add_formcompiler_system_test) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} ${CMAKE_CURRENT_BINARY_DIR}/tmp_${SYSTEMTEST_INIFILE}) # expand the given meta ini file into the build tree - execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler - OUTPUT_VARIABLE output) - parse_python_data(PREFIX INIINFO INPUT "${output}") + execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py + --cmake + --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} + --dir ${CMAKE_CURRENT_BINARY_DIR} + --section formcompiler + --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log + ) + parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) foreach(inifile ${INIINFO_names}) if(${INIINFO_${inifile}_suffix} STREQUAL "__empty") @@ -147,10 +152,11 @@ function(dune_add_formcompiler_system_test) # just the way that dune-testtools does. dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py --ini ${inifile} + --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} OUTPUT_VARIABLE output ERROR_MESSAGE "Error extracting static info from ${inifile}") - parse_python_data(PREFIX STAT INPUT "${output}") + parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) foreach(config ${STAT___CONFIGS}) foreach(cd ${STAT___STATIC_DATA}) diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py index 9cb5d7d42cbfc712e18e37798dffc2d553416f8c..7b0afac3c7c5f216e9710cef394d175c0c6c9ee9 100755 --- a/cmake/modules/deplist.py +++ b/cmake/modules/deplist.py @@ -22,5 +22,5 @@ def get_filename(operator): result = {"__{}".format(o): get_filename(o) for o in operators} result["__operators"] = ";".join(operators) -printForCMake(result) +printForCMake(result, sys.argv[3]) sys.exit(0) diff --git a/dune/codegen/sumfact/horizontaladd.hh b/dune/codegen/sumfact/horizontaladd.hh index fc62dc47dab77330a125bb5f91805c89808480b0..7dd122af4c877b7206e99d71ba8aa05fc2fc448d 100644 --- a/dune/codegen/sumfact/horizontaladd.hh +++ b/dune/codegen/sumfact/horizontaladd.hh @@ -4,6 +4,50 @@ #include<dune/codegen/common/vectorclass.hh> +// Only use our custom implementations if we have AVX2 or later! +#if INSTRSET >= 8 + +/** Implement a variant of horizontal_add(Vec2d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add (const Vec2d & a) +{ + return _mm_cvtsd_f64(_mm_add_pd(_mm_permute_pd(a,1),a)); +} + +/** Implement a variant of horizontal_add(Vec4d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add (const Vec4d& a) +{ + __m128d valupper = _mm256_extractf128_pd(a, 1); + __m128d vallower = _mm256_castpd256_pd128(a); + __m128d valval = _mm_add_pd(valupper, vallower); + __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval); + return _mm_cvtsd_f64(res); +} + +#if MAX_VECTOR_SIZE >= 512 + +/** Implement a variant of horizontal_add(Vec8d) that avoids the haddpd + * instruction and instead uses the shuffle port. + */ +static inline double permuting_horizontal_add(const Vec8d& a) +{ + return permuting_horizontal_add(a.get_low() + a.get_high()); +} + +#endif + +#else +template<typename V> +static inline double permuting_horizontal_add (const V& a) +{ + return horizontal_add(a); +} + +#endif + template<class V> typename base_floatingpoint<V>::value horizontal_add_lower(const V& x) { @@ -16,4 +60,16 @@ typename base_floatingpoint<V>::value horizontal_add_upper(const V& x) return horizontal_add(x.get_high()); } +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x) +{ + return permuting_horizontal_add(x.get_low()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x) +{ + return permuting_horizontal_add(x.get_high()); +} + #endif diff --git a/dune/codegen/sumfact/oc_horizontaladd.hh b/dune/codegen/sumfact/oc_horizontaladd.hh new file mode 100644 index 0000000000000000000000000000000000000000..d136aaf5ffc226e8e2463c5c40b3d64c8a476fe7 --- /dev/null +++ b/dune/codegen/sumfact/oc_horizontaladd.hh @@ -0,0 +1,25 @@ +#ifndef DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH +#define DUNE_CODEGEN_SUMFACT_OCHORIZONTALADD_HH + +#include<immintrin.h> +#include<dune/codegen/common/simdtraits.hh> + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_lower(const V& x) +{ + return horizontal_add(x.get_low()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add_upper(const V& x) +{ + return horizontal_add(x.get_high()); +} + +template<class V> +typename base_floatingpoint<V>::value permuting_horizontal_add(const V& x) +{ + return horizontal_add(x); +} + +#endif diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh index 7d1d45112d392a13a3dec4881ba43222cf60054b..2d0cdc6f543f2cb43d3a28bf55f562db0e64f13a 100755 --- a/patches/apply_patches.sh +++ b/patches/apply_patches.sh @@ -5,10 +5,6 @@ git apply ../../patches/loopy/Current.patch git apply ../../patches/loopy/0001-Disable-a-logging-statement-that-breaks.patch popd -pushd dune/codegen/vectorclass -git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch -popd - pushd python/ufl git apply ../../patches/ufl/0001-Remove-special-case-for-variable-in-ufl2dot.patch popd diff --git a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch b/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch deleted file mode 100644 index c5ca6dc30e2135ab30a28c7373b94da344b8a7ac..0000000000000000000000000000000000000000 --- a/patches/vectorclass/0001-Alternative-implementation-of-horizontal_add-on-AVX5.patch +++ /dev/null @@ -1,44 +0,0 @@ -From a324181d74fd8cd81fb945a4f66e4502ffbc68a0 Mon Sep 17 00:00:00 2001 -From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> -Date: Thu, 30 Nov 2017 18:51:49 +0100 -Subject: [PATCH] Alternative implementation of horizontal_add on AVX512 - ---- - vectorf512.h | 19 +++++++++++++------ - 1 file changed, 13 insertions(+), 6 deletions(-) - -diff --git a/vectorf512.h b/vectorf512.h -index 0845d12..6a15ac2 100644 ---- a/vectorf512.h -+++ b/vectorf512.h -@@ -1339,14 +1339,21 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b) - - - // General arithmetic functions, etc. -+#if __GNUC__ < 7 -+extern __inline double -+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) -+_mm512_cvtsd_f64 (__m512d __A) -+{ -+ return __A[0]; -+} -+#endif - - // Horizontal add: Calculates the sum of all vector elements. --static inline double horizontal_add (Vec8d const & a) { --#if defined(__INTEL_COMPILER) -- return _mm512_reduce_add_pd(a); --#else -- return horizontal_add(a.get_low() + a.get_high()); --#endif -+static inline double horizontal_add (Vec8d const & x) { -+ __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1))); -+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 2))); -+ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castpd_si512(intermediate), 4))); -+ return _mm512_cvtsd_f64(intermediate); - } - - // function max: a > b ? a : b --- -2.1.4 - diff --git a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch b/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch deleted file mode 100644 index fee83d7ad7cedbacc588c530ad9581b49cfa3b54..0000000000000000000000000000000000000000 --- a/patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch +++ /dev/null @@ -1,32 +0,0 @@ -From 69f4ea4dcd018eb74c39a076a60fc27c0496e1dd Mon Sep 17 00:00:00 2001 -From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> -Date: Mon, 19 Jun 2017 13:07:22 +0200 -Subject: [PATCH] Better implementation of horizontal_add - ---- - vectorf256.h | 9 +++++---- - 1 file changed, 5 insertions(+), 4 deletions(-) - -diff --git a/vectorf256.h b/vectorf256.h -index db509f8..2bbd9de 100644 ---- a/vectorf256.h -+++ b/vectorf256.h -@@ -1692,10 +1692,11 @@ static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) - - // Horizontal add: Calculates the sum of all vector elements. - static inline double horizontal_add (Vec4d const & a) { -- __m256d t1 = _mm256_hadd_pd(a,a); -- __m128d t2 = _mm256_extractf128_pd(t1,1); -- __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2); -- return _mm_cvtsd_f64(t3); -+ const __m128d valupper = _mm256_extractf128_pd(a, 1); -+ const __m128d vallower = _mm256_castpd256_pd128(a); -+ const __m128d valval = _mm_add_pd(valupper, vallower); -+ const __m128d res = _mm_add_pd(_mm_permute_pd(valval,1), valval); -+ return _mm_cvtsd_f64(res); - } - - // function max: a > b ? a : b --- -2.1.4 - diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8881504f934e7b1d755580ffc46d47f2a1467346..67694937376b1b6cdaa66d44db07289652f62bcc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,6 +24,7 @@ add_subdirectory(test) add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc) target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing) -if(benchmark_FOUND) - target_link_libraries(_autotune_target benchmark) +find_package(Threads) +if(benchmark_FOUND AND Threads_FOUND) + target_link_libraries(_autotune_target benchmark Threads::Threads) endif() diff --git a/python/dune/codegen/blockstructured/accumulation.py b/python/dune/codegen/blockstructured/accumulation.py index 9af9759d5e3426df12a10db39f94048e48006867..ec2e3d9917998a7a024796d48aaf6be6488a271c 100644 --- a/python/dune/codegen/blockstructured/accumulation.py +++ b/python/dune/codegen/blockstructured/accumulation.py @@ -1,12 +1,12 @@ -from dune.codegen.blockstructured.tools import sub_element_inames -from dune.codegen.generation import accumulation_mixin, instruction +from dune.codegen.blockstructured.tools import sub_element_inames, name_accumulation_alias +from dune.codegen.generation import accumulation_mixin, instruction, get_global_context_value from dune.codegen.loopy.target import dtype_floatingpoint from dune.codegen.options import get_form_option from dune.codegen.pdelab.geometry import world_dimension, name_intersection_geometry_wrapper from dune.codegen.pdelab.localoperator import determine_accumulation_space, GenericAccumulationMixin from dune.codegen.pdelab.argument import name_accumulation_variable from dune.codegen.pdelab.localoperator import boundary_predicates -from dune.codegen.generation.loopy import function_mangler, globalarg, temporary_variable +from dune.codegen.generation.loopy import function_mangler, temporary_variable import loopy as lp import pymbolic.primitives as prim @@ -16,32 +16,10 @@ from loopy.match import Writes @accumulation_mixin("blockstructured") class BlockStructuredAccumulationMixin(GenericAccumulationMixin): def generate_accumulation_instruction(self, expr): - if get_form_option('vectorization_blockstructured'): - return generate_accumulation_instruction_vectorized(expr, self) - else: + if get_global_context_value("form_type") == "jacobian": return generate_accumulation_instruction(expr, self) - - -def name_accumulation_alias(container, accumspace): - name = container + "_" + accumspace.lfs.name + "_alias" - name_tail = container + "_" + accumspace.lfs.name + "_alias_tail" - k = get_form_option("number_of_blocks") - p = accumspace.element.degree() - - def _add_alias_insn(name): - dim = world_dimension() - element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) - index_stride = tuple((p * k + 1)**i for i in range(0, dim)) - globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) - code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name) - instruction(within_inames=frozenset(), - code=code, - read_variables=frozenset({container}), - assignees=frozenset({name})) - - _add_alias_insn(name) - _add_alias_insn(name_tail) - return name + else: + return generate_accumulation_instruction_vectorized(expr, self) @function_mangler diff --git a/python/dune/codegen/blockstructured/argument.py b/python/dune/codegen/blockstructured/argument.py index 420773e85bea93ee55cb310255da7fa60d55d9de..deff1b8415d246d0ed197692df46b61db2dbd5cc 100644 --- a/python/dune/codegen/blockstructured/argument.py +++ b/python/dune/codegen/blockstructured/argument.py @@ -1,29 +1,11 @@ -from dune.codegen.generation import (kernel_cached, - valuearg, instruction, globalarg) +from dune.codegen.generation import kernel_cached, valuearg from dune.codegen.options import get_form_option from dune.codegen.pdelab.argument import CoefficientAccess -from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames -from dune.codegen.pdelab.geometry import world_dimension +from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames, name_container_alias from loopy.types import NumpyType import pymbolic.primitives as prim -def name_alias(container, lfs, element): - name = container + "_" + lfs.name + "_alias" - k = get_form_option("number_of_blocks") - p = element.degree() - dim = world_dimension() - element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) - index_stride = tuple((p * k + 1)**i for i in range(0, dim)) - globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) - code = "const auto {} = &{}({},0);".format(name, container, lfs.name) - instruction(within_inames=frozenset(), - code=code, - read_variables=frozenset({container}), - assignees=frozenset({name})) - return name - - # TODO remove the need for element @kernel_cached def pymbolic_coefficient(container, lfs, element, index): @@ -36,9 +18,6 @@ def pymbolic_coefficient(container, lfs, element, index): lfs = prim.Variable(lfs) # use higher order FEM index instead of Q1 index - if get_form_option("vectorization_blockstructured"): - subelem_inames = sub_element_inames() - coeff_alias = name_alias(container, lfs, element) - return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index)) - else: - return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) + subelem_inames = sub_element_inames() + coeff_alias = name_container_alias(container, lfs, element) + return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index)) diff --git a/python/dune/codegen/blockstructured/basis.py b/python/dune/codegen/blockstructured/basis.py index 5af8bb48f2bb625a0b336c5cae6d42863f0063c5..3781ac2a35855b03ed76c6fdb1c1b3db8c73559e 100644 --- a/python/dune/codegen/blockstructured/basis.py +++ b/python/dune/codegen/blockstructured/basis.py @@ -7,7 +7,7 @@ from dune.codegen.generation import (basis_mixin, globalarg, class_member, initializer_list, - include_file,) + include_file, preamble) from dune.codegen.tools import get_pymbolic_basename, get_pymbolic_indices from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.pdelab.basis import (GenericBasisMixin, @@ -22,7 +22,7 @@ from dune.codegen.pdelab.geometry import world_dimension, component_iname from dune.codegen.pdelab.spaces import type_leaf_gfs, name_lfs from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.blockstructured.spaces import lfs_inames -from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, sub_element_inames +from dune.codegen.blockstructured.tools import tensor_index_to_sequential_index, remove_sub_element_inames from ufl import MixedElement @@ -38,11 +38,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin): assert not isinstance(element, MixedElement) name = "phi_{}".format(FEM_name_mangling(element)) name = restricted_name(name, restriction) + self.init_basis_cache(element, restriction) self.evaluate_basis(element, name, restriction) inames = self.lfs_inames(element, restriction, number, context=context) return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0)) + @preamble(kernel='operator') + def init_basis_cache(self, element, restriction): + if not restriction: + cache = name_localbasis_cache(element) + localbasis = name_localbasis(element) + qp_name = get_pymbolic_basename(self.quadrature_position_in_micro()) + return ["for (int i=0; i < {}.size(); ++i)".format(qp_name), + "{", + " {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis), + "}"] + else: + return [] + @kernel_cached def evaluate_basis(self, element, name, restriction): temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1), @@ -50,7 +64,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin): cache = name_localbasis_cache(element) qp = self.to_cell(self.quadrature_position_in_micro()) localbasis = name_localbasis(element) - instruction(inames=self.quadrature_inames(), + instruction(inames=remove_sub_element_inames(self.quadrature_inames()), code='{} = {}.evaluateFunction({}, {});'.format(name, cache, str(qp), localbasis), assignees=frozenset({name}), read_variables=frozenset({get_pymbolic_basename(qp)}), @@ -60,11 +74,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin): assert not isinstance(element, MixedElement) name = "js_{}".format(FEM_name_mangling(element)) name = restricted_name(name, restriction) + self.init_gradient_cache(element, restriction) self.evaluate_reference_gradient(element, name, restriction) inames = self.lfs_inames(element, restriction, number, context=context) return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0)) + @preamble(kernel='operator') + def init_gradient_cache(self, element, restriction): + if not restriction: + cache = name_localbasis_cache(element) + localbasis = name_localbasis(element) + qp_name = get_pymbolic_basename(self.quadrature_position_in_micro()) + return ["for (int i=0; i < {}.size(); ++i)".format(qp_name), + "{", + " {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis), + "}"] + else: + return [] + @kernel_cached def evaluate_reference_gradient(self, element, name, restriction): temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()), @@ -72,7 +100,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin): cache = name_localbasis_cache(element) qp = self.to_cell(self.quadrature_position_in_micro()) localbasis = name_localbasis(element) - instruction(inames=self.quadrature_inames(), + instruction(inames=remove_sub_element_inames(self.quadrature_inames()), code='{} = {}.evaluateJacobian({}, {});'.format(name, cache, str(qp), localbasis), assignees=frozenset({name}), read_variables=frozenset({get_pymbolic_basename(qp)}), @@ -104,7 +132,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin): instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True), assignee=assignee, - forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,) + sub_element_inames()), + forced_iname_deps=frozenset(self.quadrature_inames() + (dimindex,)), forced_iname_deps_is_final=True, ) @@ -131,7 +159,7 @@ class BlockStructuredBasisMixin(GenericBasisMixin): instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True), assignee=assignee, - forced_iname_deps=frozenset(self.quadrature_inames() + sub_element_inames()), + forced_iname_deps=frozenset(self.quadrature_inames()), forced_iname_deps_is_final=True, ) diff --git a/python/dune/codegen/blockstructured/geometry.py b/python/dune/codegen/blockstructured/geometry.py index 3ad3a9026d8a5287e54e3950aef0aa199d08cc4e..975345eb572cd3adad443c879cff29220c6d7f65 100644 --- a/python/dune/codegen/blockstructured/geometry.py +++ b/python/dune/codegen/blockstructured/geometry.py @@ -1,9 +1,13 @@ +import pymbolic.primitives as prim +from loopy.match import Writes + +from dune.codegen.blockstructured.tools import name_point_in_macro from dune.codegen.generation import (geometry_mixin, temporary_variable, instruction, get_global_context_value, domain) -from dune.codegen.tools import get_pymbolic_basename +from dune.codegen.loopy.symbolic import FusedMultiplyAdd as FMA from dune.codegen.options import get_form_option from dune.codegen.pdelab.geometry import (AxiparallelGeometryMixin, EquidistantGeometryMixin, @@ -15,12 +19,9 @@ from dune.codegen.pdelab.geometry import (AxiparallelGeometryMixin, restricted_name, name_cell_geometry ) -from dune.codegen.blockstructured.tools import (sub_element_inames, - name_point_in_macro, - ) +from dune.codegen.pdelab.tensors import name_matrix_inverse, name_determinant +from dune.codegen.tools import get_pymbolic_basename from dune.codegen.ufl.modified_terminals import Restriction -import pymbolic.primitives as prim -from loopy.match import Writes @geometry_mixin("blockstructured_multilinear") @@ -49,43 +50,25 @@ class BlockStructuredGeometryMixin(GenericPDELabGeometryMixin): prim.Power(get_form_option("number_of_blocks"), local_dimension())) def jacobian_determinant(self, o): - name = 'detjac' - self.define_jacobian_determinant(name) - return prim.Quotient(prim.Variable(name), - prim.Power(get_form_option("number_of_blocks"), local_dimension())) - - def define_jacobian_determinant(self, name): - temporary_variable(name, shape=(), managed=True) - - determinant_signed = name_jacobian_determinant_signed(self) + jacobian = name_jacobian_matrix(self) + name = name_determinant(jacobian, (world_dimension(), world_dimension()), self) - return instruction(expression=prim.Call(prim.Variable("abs"), (prim.Variable(determinant_signed),)), - assignee=prim.Variable(name), - within_inames=frozenset(sub_element_inames() + self.quadrature_inames()), - depends_on=frozenset({Writes(determinant_signed)}) - ) + return prim.Quotient(prim.Call(prim.Variable("abs"), (prim.Variable(name),)), + prim.Power(get_form_option("number_of_blocks"), local_dimension())) def jacobian_inverse(self, o): - restriction = enforce_boundary_restriction(self) - assert(len(self.indices) == 2) i, j = self.indices self.indices = None - name = restricted_name("jit", restriction) - self.define_jacobian_inverse_transposed(name, restriction) + restriction = enforce_boundary_restriction(self) + + jacobian = restricted_name(name_jacobian_matrix(self), restriction) + name = name_matrix_inverse(jacobian, (world_dimension(), world_dimension()), self) return prim.Product((prim.Subscript(prim.Variable(name), (j, i)), get_form_option("number_of_blocks"))) - def define_jacobian_inverse_transposed(self, name, restriction): - temporary_variable(name, shape=(world_dimension(), world_dimension()), managed=True) - - jacobian = name_jacobian_matrix(self) - det_inv = name_jacobian_determinant_inverse(self) - - compute_inverse_transposed(name, det_inv, jacobian, self) - @geometry_mixin("blockstructured_axiparallel") class AxiparallelBlockStructuredGeometryMixin(AxiparallelGeometryMixin, BlockStructuredGeometryMixin): @@ -167,12 +150,12 @@ def compute_jacobian(name, visitor): a, b, c = coefficients expr_jac = [None, None] - expr_jac[0] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (1,)), - prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))), - prim.Subscript(prim.Variable(b), (prim.Variable(jac_iname),)))) - expr_jac[1] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (0,)), - prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))), - prim.Subscript(prim.Variable(c), (prim.Variable(jac_iname),)))) + expr_jac[0] = FMA(prim.Subscript(pymbolic_pos, (1,)), + prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)), + prim.Subscript(prim.Variable(b), (prim.Variable(jac_iname),))) + expr_jac[1] = FMA(prim.Subscript(pymbolic_pos, (0,)), + prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)), + prim.Subscript(prim.Variable(c), (prim.Variable(jac_iname),))) elif world_dimension() == 3: a, b, c, d, e, f, g = coefficients @@ -183,21 +166,20 @@ def compute_jacobian(name, visitor): # with k, l in {0,1,2} != i and k<l and vj = terms[i][j] for i in range(3): k, l = sorted(set(range(3)) - {i}) - expr_jac[i] = prim.Sum((prim.Product((prim.Subscript(pymbolic_pos, (k,)), prim.Subscript(pymbolic_pos, (l,)), - prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)))), - prim.Product((prim.Subscript(pymbolic_pos, (k,)), - prim.Subscript(prim.Variable(terms[i][0]), (prim.Variable(jac_iname),)))), - prim.Product((prim.Subscript(pymbolic_pos, (l,)), - prim.Subscript(prim.Variable(terms[i][1]), (prim.Variable(jac_iname),)))), - prim.Subscript(prim.Variable(terms[i][2]), (prim.Variable(jac_iname),)) - )) + expr_jac[i] = FMA(prim.Subscript(prim.Variable(a), (prim.Variable(jac_iname),)), + prim.Subscript(pymbolic_pos, (k,)) * prim.Subscript(pymbolic_pos, (l,)), + FMA(prim.Subscript(prim.Variable(terms[i][0]), (prim.Variable(jac_iname),)), + prim.Subscript(pymbolic_pos, (k,)), + FMA(prim.Subscript(prim.Variable(terms[i][1]), (prim.Variable(jac_iname),)), + prim.Subscript(pymbolic_pos, (l,)), + prim.Subscript(prim.Variable(terms[i][2]), (prim.Variable(jac_iname),))))) else: raise NotImplementedError() for i, expr in enumerate(expr_jac): instruction(expression=expr, assignee=prim.Subscript(prim.Variable(name), (prim.Variable(jac_iname), i)), - within_inames=frozenset((jac_iname, ) + sub_element_inames() + visitor.quadrature_inames()), + within_inames=frozenset((jac_iname, ) + visitor.quadrature_inames()), depends_on=frozenset({Writes(get_pymbolic_basename(pymbolic_pos))} | {Writes(cd) for cd in coefficients}) ) @@ -213,121 +195,6 @@ def name_jacobian_matrix(visitor): return name -def compute_determinant(name, matrix, visitor): - dim = world_dimension() - matrix_entry = [[prim.Subscript(prim.Variable(matrix), (i, j)) for j in range(dim)] for i in range(dim)] - if dim == 2: - expr_determinant = prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1])), - -1 * prim.Product((matrix_entry[1][0], matrix_entry[0][1])))) - elif dim == 3: - expr_determinant = prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1], matrix_entry[2][2])), - prim.Product((matrix_entry[0][1], matrix_entry[1][2], matrix_entry[2][0])), - prim.Product((matrix_entry[0][2], matrix_entry[1][0], matrix_entry[2][1])), - -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][1], matrix_entry[2][0])), - -1 * prim.Product((matrix_entry[0][0], matrix_entry[1][2], matrix_entry[2][1])), - -1 * prim.Product((matrix_entry[0][1], matrix_entry[1][0], matrix_entry[2][2])) - )) - else: - raise NotImplementedError() - instruction(expression=expr_determinant, - assignee=prim.Variable(name), - within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()), - depends_on=frozenset({Writes(matrix)}) - ) - - -def define_jacobian_determinant(name, visitor): - temporary_variable(name, shape=(), managed=True) - jacobian = name_jacobian_matrix(visitor) - compute_determinant(name, jacobian, visitor) - - -def define_jacobian_determinant_inverse(name, visitor): - temporary_variable(name, shape=(), managed=True) - determinant = name_jacobian_determinant_signed(visitor) - return instruction(expression=prim.Quotient(1., prim.Variable(determinant)), - assignee=prim.Variable(name), - within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()), - depends_on=frozenset({Writes(determinant)}) - ) - - -def name_jacobian_determinant_signed(visitor): - name = "detjac_signed" - define_jacobian_determinant(name, visitor) - return name - - -def name_jacobian_determinant_inverse(visitor): - name = "detjac_inverse" - define_jacobian_determinant_inverse(name, visitor) - return name - - -def compute_inverse_transposed(name, det_inv, matrix, visitor): - dim = world_dimension() - matrix_entry = [[prim.Subscript(prim.Variable(matrix), (i, j)) for j in range(dim)] for i in range(dim)] - assignee = [[prim.Subscript(prim.Variable(name), (i, j)) for j in range(dim)] for i in range(dim)] - exprs = [[None for _ in range(dim)] for _ in range(dim)] - - if dim == 2: - for i in range(2): - for j in range(2): - sign = 1. if i == j else -1. - exprs[i][j] = prim.Product((sign, prim.Variable(det_inv), matrix_entry[1 - i][1 - j])) - elif dim == 3: - exprs[0][0] = prim.Product((1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[1][1], matrix_entry[2][2])), - -1 * prim.Product((matrix_entry[1][2], matrix_entry[2][1])))))) - exprs[1][0] = prim.Product((-1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][1], matrix_entry[2][2])), - -1 * prim.Product((matrix_entry[0][2], matrix_entry[2][1])))))) - exprs[2][0] = prim.Product((1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][1], matrix_entry[1][2])), - -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][1])))))) - - exprs[0][1] = prim.Product((-1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[1][0], matrix_entry[2][2])), - -1 * prim.Product((matrix_entry[1][2], matrix_entry[2][0])))))) - exprs[1][1] = prim.Product((1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[2][2])), - -1 * prim.Product((matrix_entry[0][2], matrix_entry[2][0])))))) - exprs[2][1] = prim.Product((-1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][2])), - -1 * prim.Product((matrix_entry[0][2], matrix_entry[1][0])))))) - - exprs[0][2] = prim.Product((1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[1][0], matrix_entry[2][1])), - -1 * prim.Product((matrix_entry[1][1], matrix_entry[2][0])))))) - exprs[1][2] = prim.Product((-1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[2][1])), - -1 * prim.Product((matrix_entry[0][1], matrix_entry[2][0])))))) - exprs[2][2] = prim.Product((1., prim.Variable(det_inv), - prim.Sum((prim.Product((matrix_entry[0][0], matrix_entry[1][1])), - -1 * prim.Product((matrix_entry[0][1], matrix_entry[1][0])))))) - else: - raise NotImplementedError - for j in range(dim): - for i in range(dim): - instruction(expression=exprs[i][j], - assignee=assignee[i][j], - within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames()), - depends_on=frozenset({Writes(matrix), Writes(det_inv)})) - - -def define_jacobian_inverse_transposed(name, visitor): - temporary_variable(name, shape=(world_dimension(), world_dimension()), managed=True) - jacobian = name_jacobian_matrix(visitor) - det_inv = name_jacobian_determinant_inverse(visitor) - compute_inverse_transposed(name, det_inv, jacobian, visitor) - - -def name_jacobian_inverse_transposed(restriction, visitor): - name = restricted_name("jit", restriction) - define_jacobian_inverse_transposed(name, visitor) - return name - - def compute_multilinear_to_global_transformation(name, local, visitor): dim = world_dimension() temporary_variable(name, shape=(dim,), managed=True) @@ -347,19 +214,22 @@ def compute_multilinear_to_global_transformation(name, local, visitor): # global[d] = T(local)[d] if dim == 2: a_pym, b_pym, c_pym = coeffs_pym - expr = a_pym * local_pym[0] * local_pym[1] + b_pym * local_pym[0] + c_pym * local_pym[1] + corner_0_pym + expr = FMA(a_pym, local_pym[0] * local_pym[1], FMA(b_pym, local_pym[0], FMA(c_pym, local_pym[1], corner_0_pym))) elif dim == 3: a_pym, b_pym, c_pym, d_pym, e_pym, f_pym, g_pym = coeffs_pym - expr = (a_pym * local_pym[0] * local_pym[1] * local_pym[2] + b_pym * local_pym[0] * local_pym[1] + - c_pym * local_pym[0] * local_pym[2] + d_pym * local_pym[1] * local_pym[2] + - e_pym * local_pym[0] + f_pym * local_pym[1] + g_pym * local_pym[2] + corner_0_pym) + expr = FMA(a_pym * local_pym[0], local_pym[1] * local_pym[2], + FMA(b_pym, local_pym[0] * local_pym[1], + FMA(c_pym, local_pym[0] * local_pym[2], + FMA(d_pym, local_pym[1] * local_pym[2], + FMA(e_pym, local_pym[0], + FMA(f_pym, local_pym[1], FMA(g_pym, local_pym[2], corner_0_pym))))))) else: raise NotImplementedError assignee = prim.Subscript(prim.Variable(name), (dim_pym,)) instruction(assignee=assignee, expression=expr, - within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames() + (dim_pym.name,)), + within_inames=frozenset(visitor.quadrature_inames() + (dim_pym.name,)), within_inames_is_final=True, depends_on=frozenset({Writes(get_pymbolic_basename(local)), Writes(corners)}) ) @@ -376,13 +246,14 @@ def compute_axiparallel_to_global_transformation(name, local, visitor): dim_pym = prim.Variable(component_iname('to_global')) # global[d] = lower_left[d] + local[d] * (upper_right[d] - lower_left[d]) - expr = (prim.Subscript(prim.Variable(corners), (0, dim_pym)) + - prim.Subscript(local, (dim_pym,)) * (prim.Subscript(prim.Variable(corners), (2**dim - 1, dim_pym)) - - prim.Subscript(prim.Variable(corners), (0, dim_pym)))) + expr = FMA(prim.Subscript(prim.Variable(corners), (2**dim - 1, dim_pym)) - + prim.Subscript(prim.Variable(corners), (0, dim_pym)), + prim.Subscript(local, (dim_pym,)), prim.Subscript(prim.Variable(corners), (0, dim_pym))) + assignee = prim.Subscript(prim.Variable(name), (dim_pym,)) instruction(assignee=assignee, expression=expr, - within_inames=frozenset(sub_element_inames() + visitor.quadrature_inames() + (dim_pym.name,)), + within_inames=frozenset(visitor.quadrature_inames() + (dim_pym.name,)), within_inames_is_final=True, depends_on=frozenset({Writes(get_pymbolic_basename(local)), Writes(corners)}) ) diff --git a/python/dune/codegen/blockstructured/quadrature.py b/python/dune/codegen/blockstructured/quadrature.py index d3ba4950e8034232ebd41258f571b887caf198a4..5028bd35646fc82d9e529ba8305e67cf5ee7add8 100644 --- a/python/dune/codegen/blockstructured/quadrature.py +++ b/python/dune/codegen/blockstructured/quadrature.py @@ -1,14 +1,18 @@ from dune.codegen.error import CodegenError from dune.codegen.generation import quadrature_mixin from dune.codegen.pdelab.quadrature import GenericQuadratureMixin -from dune.codegen.blockstructured.tools import name_point_in_macro +from dune.codegen.blockstructured.tools import name_point_in_macro, sub_element_inames, remove_sub_element_inames import pymbolic.primitives as prim @quadrature_mixin("blockstructured") class BlockstructuredQuadratureMixin(GenericQuadratureMixin): + @staticmethod + def _subscript_without_sub_element_inames(s): + return prim.Subscript(s.aggregate, remove_sub_element_inames(s.index_tuple)) + def quadrature_position_in_micro(self, index=None): - return GenericQuadratureMixin.quadrature_position(self, index) + return self._subscript_without_sub_element_inames(super().quadrature_position(index)) def quadrature_position_in_macro(self, index=None): original = self.quadrature_position_in_micro(index) @@ -20,3 +24,9 @@ class BlockstructuredQuadratureMixin(GenericQuadratureMixin): def quadrature_position(self, index=None): raise CodegenError('Decide if the quadrature point is in the macro or micro element') + + def quadrature_inames(self): + return super().quadrature_inames() + sub_element_inames() + + def quadrature_weight(self, o): + return self._subscript_without_sub_element_inames(super().quadrature_weight(o)) diff --git a/python/dune/codegen/blockstructured/tools.py b/python/dune/codegen/blockstructured/tools.py index 802b819f23754aa0862e122f4127a3db18593117..c929aa1e447960ce831af54c188bdcf645cc05a7 100644 --- a/python/dune/codegen/blockstructured/tools.py +++ b/python/dune/codegen/blockstructured/tools.py @@ -2,7 +2,7 @@ from dune.codegen.tools import get_pymbolic_basename from dune.codegen.generation import (iname, domain, temporary_variable, - instruction) + instruction, globalarg, preamble) from dune.codegen.pdelab.geometry import world_dimension from dune.codegen.options import get_form_option import pymbolic.primitives as prim @@ -22,6 +22,11 @@ def sub_element_inames(): return inames +def remove_sub_element_inames(indices): + assert isinstance(indices, tuple) + return tuple(set(indices) - set(sub_element_inames()) - set(prim.Variable(i) for i in sub_element_inames())) + + # compute sequential index for given tensor index, the same as index in base-k to base-10 def tensor_index_to_sequential_index(indices, k): return prim.Sum(tuple(prim.Variable(index) * k ** i for i, index in enumerate(indices))) @@ -64,7 +69,7 @@ def define_point_in_macro(name, point_in_micro, visitor): # TODO relax within inames instruction(assignee=prim.Subscript(prim.Variable(name), (i,)), expression=expr, - within_inames=frozenset(subelem_inames + visitor.quadrature_inames()), + within_inames=frozenset(visitor.quadrature_inames()), tags=frozenset({subelem_inames[i]}) ) @@ -74,3 +79,33 @@ def name_point_in_macro(point_in_micro, visitor): name = get_pymbolic_basename(point_in_micro) + "_macro" define_point_in_macro(name, point_in_micro, visitor) return name + + +@preamble +def define_container_alias(name, container, lfs, element, is_const): + k = get_form_option("number_of_blocks") + p = element.degree() + dim = world_dimension() + element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) + index_stride = tuple((p * k + 1)**i for i in range(0, dim)) + globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) + if is_const: + return "const auto {} = &{}({},0);".format(name, container, lfs.name) + else: + return "auto {} = &{}.container()({},0);".format(name, container, lfs.name) + + +def name_accumulation_alias(container, accumspace): + name = container + "_" + accumspace.lfs.name + "_alias" + name_tail = container + "_" + accumspace.lfs.name + "_alias_tail" + + define_container_alias(name, container, accumspace.lfs, accumspace.element, is_const=False) + define_container_alias(name_tail, container, accumspace.lfs, accumspace.element, is_const=False) + return name + + +def name_container_alias(container, lfs, element): + name = container + "_" + lfs.name + "_alias" + + define_container_alias(name, container, lfs, element, is_const=True) + return name diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index d5224e6ffdd19ae88ccbdc6a798a8b416a6fd36d..94bc4875f262c72d695007be04c0b4107208f2f5 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -26,20 +26,22 @@ def add_vcl_temporaries(knl, vcl_size): init_iname = 'init_vec{}'.format(vcl_size) from islpy import BasicSet init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint()))) + + silenced_warnings = [] + for alias in vector_alias: vector_name = alias.replace('alias', 'vec{}'.format(vcl_size)) new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(vcl_size,), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) - # write once to the vector such that loopy won't complain - new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)), - expression=0, within_inames=frozenset({init_iname}), - id='init_{}'.format(vector_name))) + # silence warning such that loopy won't complain + silenced_warnings.append("read_no_write({})".format(vector_name)) from loopy.kernel.data import VectorizeTag return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain], temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries), - iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()})) + iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()}), + silenced_warnings=knl.silenced_warnings + silenced_warnings) def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level): @@ -248,6 +250,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): load_insns.append(lp.CallInstruction(assignees=(), expression=call_load, id=load_id, within_inames=insn.within_inames | insn.reduction_inames(), depends_on=insn.depends_on | write_ids, + depends_on_is_final=True, tags=frozenset({'vectorized_{}'.format(level)}))) read_dependencies.setdefault(id, set()) read_dependencies[id].add(load_id) @@ -277,6 +280,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): id=store_id, within_inames=insn.within_inames, depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] | write_ids), + depends_on_is_final=True, tags=frozenset({'vectorized_{}'.format(level)}))) # replace alias with vcl vector, except for accumulation assignee @@ -341,10 +345,12 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): new_insns.append(insn.copy(assignee=assignee_vec, depends_on=(insn.depends_on | read_dependencies[insn.id] | write_ids), + depends_on_is_final=True, tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) else: new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] | write_ids), + depends_on_is_final=True, tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) return knl.copy(instructions=new_insns + load_insns + store_insns) diff --git a/python/dune/codegen/error.py b/python/dune/codegen/error.py index 04484f25a9874fe1c753b9a0adb40e6501dfe5ab..047e8becc306cfda369c787175a45503aabcdd97 100644 --- a/python/dune/codegen/error.py +++ b/python/dune/codegen/error.py @@ -23,3 +23,7 @@ class CodegenVectorizationError(CodegenCodegenError): class CodegenAutotuneError(CodegenVectorizationError): pass + + +class CodegenUnsupportedFiniteElementError(CodegenUFLError): + pass diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py index bed0256407b7259bab61b6e932c4a17761097e75..97090e18852359b10d1a2d3f74a268a3abac60f1 100644 --- a/python/dune/codegen/generation/__init__.py +++ b/python/dune/codegen/generation/__init__.py @@ -24,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class, preamble, post_include, template_parameter, + dump_ssc_marks ) from dune.codegen.generation.hooks import (hook, diff --git a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py index b918291067f45c5f988bc8fdcea55651d538a9db..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644 --- a/python/dune/codegen/generation/cpp.py +++ b/python/dune/codegen/generation/cpp.py @@ -55,3 +55,10 @@ def dump_accumulate_timer(name): @generator_factory(item_tags=("register_likwid_timers",)) def register_liwkid_timer(name): return "LIKWID_MARKER_REGISTER(\"{}\");".format(name) + + +@generator_factory(item_tags=("register_ssc_marks",)) +def dump_ssc_marks(name): + from dune.codegen.pdelab.driver.timings import get_region_marks + return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name, + *get_region_marks(name, driver=False)) diff --git a/python/dune/codegen/loopy/vcl.py b/python/dune/codegen/loopy/vcl.py index e0943a69ac35136abfe59a7f641e0f076681643c..2431275a5f18a3bc87272711ab5ed71c038ced0b 100644 --- a/python/dune/codegen/loopy/vcl.py +++ b/python/dune/codegen/loopy/vcl.py @@ -119,9 +119,17 @@ def vcl_function_mangler(knl, func, arg_dtypes): return lp.CallMangleInfo("select", (vcl,), (vcl, vcl, vcl)) if func in ("horizontal_add", "horizontal_add_lower", "horizontal_add_upper"): + if get_option("permuting_horizontal_add"): + func = "permuting_{}".format(func) + dtype = arg_dtypes[0] vcl = lp.types.NumpyType(get_vcl_type(dtype)) - include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile") + + if get_option("opcounter"): + include_file("dune/codegen/sumfact/oc_horizontaladd.hh", filetag="operatorfile") + else: + include_file("dune/codegen/sumfact/horizontaladd.hh", filetag="operatorfile") + return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,)) if isinstance(func, VCLPermute): diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index c8c02a47e982ed64c9a0504ec8204e40183a7587..724f80df6554a9d977bf21210fd044daf9644274 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -8,6 +8,7 @@ import yaml import pkg_resources from six.moves import configparser from six import StringIO +from contextlib import contextmanager class CodegenOptionsValidator(cerberus.Validator): @@ -44,6 +45,41 @@ class CodegenGlobalOptionsArray(ImmutableRecord): opts.update(**kwargs) ImmutableRecord.__init__(self, **opts) + # Arguments that are to be set from the outside + uflfile = CodegenOption(helpstr="the UFL file to compile") + debug_cache_with_stack = CodegenOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.") + driver_file = CodegenOption(helpstr="The filename for the generated driver header") + explicit_time_stepping = CodegenOption(default=False, helpstr="use explicit time stepping") + time_stepping_order = CodegenOption(default=1, helpstr="Order of the time stepping method") + exact_solution_expression = CodegenOption(helpstr="name of the exact solution expression in the ufl file") + compare_l2errorsquared = CodegenOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)") + grid_info = CodegenOption(default=None, helpstr="Path to file with information about facedir and facemod variations. This can be used to limit the generation of skeleton kernels.") + l2error_tree_path = CodegenOption(default=None, helpstr="Tree pathes that should be considered for l2 error calculation. Default None means we take all of them into account.") + ini_file = CodegenOption(helpstr="An inifile to use. A generated driver will be hard-coded to it, a [formcompiler] section will be used as default values to form compiler arguments (use snake case)") + opcounter = CodegenOption(default=False, helpstr="Count operations. Note: In this case only operator applications are generated since solving and operator counting does not work. You probably want to set instrumentation level>0.") + performance_measuring = CodegenOption(default=False, helpstr="Generate opcounter codepath, but only measure times!") + instrumentation_level = CodegenOption(default=0, helpstr="Control time/opcounter measurements. 0-do nothing, 1-measure program as a whole, 2-operator applications, 3-measure kernel (eg. alpha-volume, ...), 4-parts of kernel (eg. stage 1-3 of SF)") + project_basedir = CodegenOption(helpstr="The base (build) directory of the dune-codegen project") + architecture = CodegenOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl|skylake") + yaspgrid_offset = CodegenOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.") + grid_unstructured = CodegenOption(default=False, helpstr="Set to true if you want to use an unstructured grid.") + grid_consistent = CodegenOption(default=False, helpstr="The used grid is already consistent.") + precision_bits = CodegenOption(default=64, helpstr="The number of bits for the floating point type") + overlapping = CodegenOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.") + operators = CodegenOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section") + target_name = CodegenOption(default=None, helpstr="The target name from CMake") + operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. CMake sets this one!!!") + debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).") + use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.") + use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.") + autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).") + with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi") + permuting_horizontal_add = CodegenOption(default=True, helpstr="Whether SIMD horizontal_add should use a permuting implementation.") + + # Arguments that are mainly to be set by logic depending on other options + max_vector_width = CodegenOption(default=256, helpstr=None) + parallel = CodegenOption(default=False, helpstr="Mark that this program should be run in parallel. If set to true the c++ code will check that there are more than 1 MPI-ranks involved and the error computation will use communication.") + class CodegenFormOptionsArray(ImmutableRecord): """ A collection of form-specific form compiler arguments """ @@ -54,12 +90,80 @@ class CodegenFormOptionsArray(ImmutableRecord): opts.update(**kwargs) ImmutableRecord.__init__(self, **opts) + # Form specific options + form = CodegenOption(default=None, helpstr="The name of the UFL object representing the form in the UFL file") + filename = CodegenOption(default=None, helpstr="The filename to use for this LocalOperator") + classname = CodegenOption(default=None, helpstr="The name of the C++ class to generate") + numerical_jacobian = CodegenOption(default=False, helpstr="use numerical jacobians (only makes sense, if uflpdelab for some reason fails to generate analytic jacobians)") + matrix_free = CodegenOption(default=False, helpstr="Generate jacobian_apply_* methods for matrix free solvers") + print_transformations = CodegenOption(default=False, helpstr="print out dot files after ufl tree transformations") + print_transformations_dir = CodegenOption(default=".", helpstr="place where to put dot files (can be omitted)") + quadrature_order = CodegenOption(_type=int, helpstr="Quadrature order used for all integrals.") + diagonal_transformation_matrix = CodegenOption(default=False, helpstr="set option if the jacobian of the transformation is diagonal (axiparallel grids)") + constant_transformation_matrix = CodegenOption(default=False, helpstr="set option if the jacobian of the transformation is constant on a cell") + fastdg = CodegenOption(default=False, helpstr="Use FastDGGridOperator from PDELab.") + sumfact = CodegenOption(default=False, helpstr="Use sumfactorization") + sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") + sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...") + sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sumf factorization function using autotuning.") + sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.") + sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero determines test case to run.") + vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization") + vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model. Possible values: none|explicit|model|target|autotune") + vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") + vectorization_horizontal = CodegenOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy") + vectorization_vertical = CodegenOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy") + vectorization_padding = CodegenOption(default=None, helpstr="an explicit value for the allowed padding in vectorization") + vectorization_allow_quadrature_changes = CodegenOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers") + vectorization_list_index = CodegenOption(default=None, helpstr="Which vectorization to pick from a list (only valid with vectorization_strategy=fromlist).") + vectorization_jacobians = CodegenOption(default=True, helpstr="Whether to attempt to vectorize jacobians (takes time, often not needed)") + vectorization_target = CodegenOption(_type=float, helpstr="The cost function target for the 'target' cost model. Only needed to verify the cost model itself, do not use light-heartedly!!!") + simplify = CodegenOption(default=False, helpstr="Whether to simplify expressions using sympy") + generate_jacobians = CodegenOption(default=True, helpstr="Whether jacobian_* methods should be generated. This is set to false automatically, when numerical_jacobian is set to true.") + generate_jacobian_apply = CodegenOption(default=False, helpstr="Wether jacobian_allpy_* methods should be generated.") + generate_residuals = CodegenOption(default=True, helpstr="Whether alpha_* methods should be generated.") + unroll_dimension_loops = CodegenOption(default=False, helpstr="whether loops over the geometric dimension should be unrolled") + blockstructured = CodegenOption(default=False, helpstr="Use block structure") + number_of_blocks = CodegenOption(default=1, helpstr="Number of sub blocks in one direction") + vectorization_blockstructured = CodegenOption(default=False, helpstr="Vectorize block structuring") + vectorization_blockstructured_tail = CodegenOption(default=True, helpstr="Try to fully vectorize block structuring even when 'nunmber_of_blocks' is not divisible by vector length") + vectorization_blockstructured_tail_ordering = CodegenOption(default='consecutive', helpstr="Ordering of the tail w.r.t the vectorized loop. Possible values: consecutive|blocked") + adjoint = CodegenOption(default=False, helpstr="Generate adjoint operator") + control = CodegenOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable") + objective_function = CodegenOption(default=None, helpstr="Name of form representing the objective function in UFL file") + control_variable = CodegenOption(default=None, helpstr="Name of control variable in UFL file") + block_preconditioner_diagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner") + block_preconditioner_offdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner") + block_preconditioner_pointdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the point diagonal part of a block preconditioner") + geometry_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for geometries. Currently implemented mixins: generic, axiparallel, equidistant, sumfact_multilinear, sumfact_axiparallel, sumfact_equidistant") + quadrature_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for quadrature. Currently implemented: generic, sumfact") + basis_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for basis function evaluation. Currently implemented: generic, sumfact") + accumulation_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for accumulation. Currently implemented: generic, sumfact, control, blockstructured") + enable_volume = CodegenOption(default=True, helpstr="Whether to assemble volume integrals") + enable_skeleton = CodegenOption(default=True, helpstr="Whether to assemble skeleton integrals") + enable_boundary = CodegenOption(default=True, helpstr="Whether to assemble boundary integrals") + # Until more sophisticated logic is needed, we keep the actual option data in this module _global_options = CodegenGlobalOptionsArray() _form_options = {} +def show_options(): + # TODO: This needs to be adjusted to options-validation + def subopt(arr): + for k, v in arr.__dict__.items(): + if isinstance(v, CodegenOption) and v.helpstr is not None: + print("{}\n {}".format(k, v.helpstr)) + + print("This is a summary of options available for the code generation process:\n") + print("The following options can be given in the [formcompiler] section:") + subopt(CodegenGlobalOptionsArray) + + print("\nThefollowing options can be given in a form-specific subsection of [formcompiler]:") + subopt(CodegenFormOptionsArray) + + def initialize_options(): """ Initialize the options from the command line """ global _global_options @@ -192,10 +296,16 @@ def process_form_options(opt, form): if opt.filename is None: opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname)) + if opt.block_preconditioner_pointdiagonal: + opt = opt.copy(generate_jacobians=False, + basis_mixins="sumfact_pointdiagonal", + accumulation_mixins="sumfact_pointdiagonal", + ) + if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal: assert opt.numerical_jacobian is False opt = opt.copy(generate_residuals=False, - generate_jacobians=False, + generate_jacobians=True, matrix_free=True, ) @@ -251,21 +361,39 @@ def get_form_option(key, form=None): return getattr(processed_form_opts, key) -def option_switch(opt): - def _switch(): - if isinstance(opt, tuple): - opts = opt - else: - assert isinstance(opt, str) - opts = (opt,) - try: - for o in opts: - if get_option(o): - return o - return "default" - except AttributeError: - for o in opts: - if get_form_option(o): - return o - return "default" - return _switch +@contextmanager +def option_context(conditional=True, **opts): + """ A context manager that sets a given option and restores it on exit. """ + # Backup old values and set to new ones + if conditional: + backup = {} + for k, v in opts.items(): + backup[k] = get_option(k) + set_option(k, v) + + yield + + if conditional: + # Restore old values + for k in opts.keys(): + set_option(k, backup[k]) + + +@contextmanager +def form_option_context(conditional=True, **opts): + """ A context manager that sets a given form option and restores it on exit """ + if conditional: + form = opts.pop("form", None) + + # Backup old values and set to new ones + backup = {} + for k, v in opts.items(): + backup[k] = get_form_option(k, form=form) + set_form_option(k, v, form=form) + + yield + + # Restore old values + if conditional: + for k in opts.keys(): + set_form_option(k, backup[k], form=form) diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py index c3cc48298c1b2efb70c6c7716b29b221486044e1..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644 --- a/python/dune/codegen/pdelab/argument.py +++ b/python/dune/codegen/pdelab/argument.py @@ -19,6 +19,7 @@ from dune.codegen.pdelab.spaces import (lfs_iname, ) from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.ufl.modified_terminals import Restriction +from dune.codegen.options import get_form_option from pymbolic.primitives import Call, Subscript, Variable @@ -116,6 +117,10 @@ def type_coefficientcontainer(): return "X" +def type_linearizationpointcontainer(): + return "Z" + + def name_jacobian(restriction1, restriction2): # Restrictions may only differ if NONE if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE): @@ -144,6 +149,8 @@ def name_accumulation_variable(restrictions=None): restrictions = (Restriction.NONE,) else: restrictions = (Restriction.POSITIVE,) + if get_form_option("block_preconditioner_pointdiagonal"): + restrictions = restrictions[:1] return name_residual(*restrictions) if ft == 'jacobian': if restrictions is None: diff --git a/python/dune/codegen/pdelab/basis.py b/python/dune/codegen/pdelab/basis.py index 03c069ac518f01efaec8304356fddcf34915668f..885194af5817bd30d6cc339603ab84f00bc7b990 100644 --- a/python/dune/codegen/pdelab/basis.py +++ b/python/dune/codegen/pdelab/basis.py @@ -8,9 +8,7 @@ from dune.codegen.generation import (basis_mixin, preamble, temporary_variable, ) -from dune.codegen.options import (option_switch, - get_form_option, - ) +from dune.codegen.options import get_form_option from dune.codegen.pdelab.argument import (name_applycontainer, name_coefficientcontainer, ) @@ -39,6 +37,7 @@ from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.pdelab.driver import (isPk, isQk, isDG) +from dune.codegen.ufl.modified_terminals import Restriction from pymbolic.primitives import Product, Subscript, Variable import pymbolic.primitives as prim @@ -81,7 +80,11 @@ class BasisMixinBase(object): @basis_mixin("generic") class GenericBasisMixin(BasisMixinBase): def initialize_function_spaces(self, expr): - return initialize_function_spaces(expr, self) + restriction = self.restriction + if self.measure == 'exterior_facet': + restriction = Restriction.POSITIVE + + return initialize_function_spaces(expr, restriction, self.indices) def lfs_inames(self, element, restriction, number, context=""): return (lfs_iname(element, restriction, number, context),) diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py index b60544c1c78242f1490c76c46d4be6a2c4448501..12b5af6d9da9344b99064ee6fc071c52ac35dcd2 100644 --- a/python/dune/codegen/pdelab/driver/__init__.py +++ b/python/dune/codegen/pdelab/driver/__init__.py @@ -22,6 +22,8 @@ from dune.codegen.generation import (generator_factory, from dune.codegen.options import (get_form_option, get_option, ) +from ufl import TensorProductCell + # # The following functions are not doing anything useful, but providing easy access @@ -39,7 +41,10 @@ def get_form_ident(): def get_form(): data = get_global_context_value("data") - return data.object_by_name[get_form_option("form", get_form_ident())] + form = get_form_option("form") + if form is None: + form = get_form_ident() + return data.object_by_name[form] def get_dimension(): @@ -81,6 +86,9 @@ def isLagrange(fem): def isSimplical(cell): + if isinstance(cell, TensorProductCell): + return False + # Cells can be identified through strings *or* ufl objects if not isinstance(cell, str): cell = cell.cellname() @@ -88,6 +96,9 @@ def isSimplical(cell): def isQuadrilateral(cell): + if isinstance(cell, TensorProductCell): + return all(tuple(isSimplical(c) for c in cell.sub_cells())) + # Cells can be identified through strings *or* ufl objects if not isinstance(cell, str): cell = cell.cellname() @@ -120,20 +131,10 @@ def FEM_name_mangling(fem): name = name + FEM_name_mangling(elem) return name if isinstance(fem, FiniteElement): - if isPk(fem): - return "P" + str(fem._degree) - if isQk(fem): - return "Q" + str(fem._degree) - if isDG(fem): - return "DG" + str(fem._degree) + return "{}{}".format(fem._short_name, fem.degree()) if isinstance(fem, TensorProductElement): assert(len(set(subel._short_name for subel in fem.sub_elements())) == 1) - - if isLagrange(fem.sub_elements()[0]): - return "TensorQ" + '_'.join(map(str, fem._degree)) - if isDG(fem.sub_elements()[0]): - return "TensorDG" + '_'.join(map(str, fem._degree)) - raise NotImplementedError("fem name mangling") + return "TP_{}".format("_".join(FEM_name_mangling(subel) for subel in fem.sub_elements())) raise NotImplementedError("FEM NAME MANGLING") @@ -212,7 +213,10 @@ def name_initree(): @preamble(section="init") def define_mpihelper(name): include_file("dune/common/parallel/mpihelper.hh", filetag="driver") - return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name) + if get_option("with_mpi"): + return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name) + else: + return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name) def name_mpihelper(): @@ -282,6 +286,13 @@ def generate_driver(): contents = [] + # Assert that this program was called with ini file + contents += ['if (argc != 2){', + ' std::cerr << "This program needs to be called with an ini file" << std::endl;', + ' return 1;', + '}', + ''] + def add_section(tag, comment): tagcontents = [i for i in retrieve_cache_items("preamble and {}".format(tag), make_generable=True)] if tagcontents: diff --git a/python/dune/codegen/pdelab/driver/gridfunctionspace.py b/python/dune/codegen/pdelab/driver/gridfunctionspace.py index 377b304b03499aa3c109390cf72f4af0f6327e9b..49789b794d1cad5e84b69da898fc13c7e823a9fd 100644 --- a/python/dune/codegen/pdelab/driver/gridfunctionspace.py +++ b/python/dune/codegen/pdelab/driver/gridfunctionspace.py @@ -1,3 +1,4 @@ +from dune.codegen.error import CodegenUnsupportedFiniteElementError from dune.codegen.generation import (include_file, preamble, ) @@ -9,10 +10,6 @@ from dune.codegen.pdelab.driver import (FEM_name_mangling, get_dimension, get_test_element, get_trial_element, - isLagrange, - isDG, - isPk, - isQk, isQuadrilateral, isSimplical, name_initree, @@ -20,7 +17,7 @@ from dune.codegen.pdelab.driver import (FEM_name_mangling, ) from dune.codegen.loopy.target import type_floatingpoint -from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement +from ufl import FiniteElement, MixedElement, TensorElement, VectorElement, TensorProductElement, TensorProductCell @preamble(section="grid") @@ -82,7 +79,8 @@ def define_grid(name): _type = type_grid() # TODO: In principle this is only necessary if we use sum factorization in # one of the operators. So this could be turned off if that is not the case. - if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured"): + if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured") and not \ + get_option("grid_consistent"): include_file("dune/consistent-edge-orientation/createconsistentgrid.hh", filetag="driver") return ["IniGridFactory<{}> factory({});".format(_type, ini), "std::shared_ptr<{}> grid_nonconsistent = factory.getGrid();".format(_type), @@ -122,61 +120,98 @@ def name_leafview(): return name +def get_short_name(element): + if isinstance(element, TensorProductElement): + assert len(set(subel._short_name for subel in element.sub_elements())) == 1 + return get_short_name(element.sub_elements()[0]) + + return element._short_name + + @preamble(section="fem") def typedef_fem(element, name): gv = type_leafview() df = type_domainfield() r = type_range() dim = get_dimension() + cell = element.cell() + degree = element.degree() + short = get_short_name(element) + + # We currently only support TensorProductElement from UFL if it aliases another finite element + # available from UFL. Here, we check this condition and recover the aliases element + if isinstance(element, TensorProductElement): + subels = set(subel._short_name for subel in element.sub_elements()) + if len(subels) != 1 or len(set(element.degree())) != 1: + raise CodegenUnsupportedFiniteElementError(element) + + degree = element.degree()[0] + cell = TensorProductCell(*tuple(subel.cell() for subel in element.sub_elements())) + # The blockstructured code branch has its own handling of finite element selection if get_form_option("blockstructured"): include_file("dune/codegen/blockstructured/blockstructuredqkfem.hh", filetag="driver") - degree = element.degree() * get_form_option("number_of_blocks") + degree = degree * get_form_option("number_of_blocks") return "using {} = Dune::PDELab::BlockstructuredQkLocalFiniteElementMap<{}, {}, {}, {}>;" \ .format(name, gv, df, r, degree) - if isinstance(element, TensorProductElement): - # Only allow TensorProductElements where all subelements are - # of the same type ('CG' or 'DG') - assert(len(set(subel._short_name for subel in element.sub_elements())) == 1) - - # Anisotropic degree is not yet supported in Dune - degrees = element.degree() - for deg in degrees: - assert (deg == degrees[0]) - - # TensorProductElements have Qk structure -> no Pk - if isLagrange(element.sub_elements()[0]): - include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver") - return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \ - .format(name, gv, df, r, degrees[0]) - elif isDG(element.sub_elements()[0]): - include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver") - # TODO allow switching the basis here! - return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \ - .format(name, df, r, degrees[0], dim) - raise NotImplementedError("FEM not implemented in dune-codegen") - elif isQk(element): - include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver") - return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \ - .format(name, gv, df, r, element.degree()) - elif isPk(element): - include_file("dune/pdelab/finiteelementmap/pkfem.hh", filetag="driver") - return "using {} = Dune::PDELab::PkLocalFiniteElementMap<{}, {}, {}, {}>;" \ - .format(name, gv, df, r, element.degree()) - elif isDG(element): - if isQuadrilateral(element.cell()): - include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver") - # TODO allow switching the basis here! - return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \ - .format(name, df, r, element.degree(), dim) - if isSimplical(element.cell()): - include_file("dune/pdelab/finiteelementmap/opbfem.hh", filetag="driver") - return "using {} = Dune::PDELab::OPBLocalFiniteElementMap<{}, {}, {}, {}, Dune::GeometryType::simplex>;" \ - .format(name, df, r, element.degree(), dim) - raise NotImplementedError("Geometry type not known in code generation") - - raise NotImplementedError("FEM not implemented in dune-codegen") + # This is a backward-compatibility hack: So far we silently used OPBFem for DG with simplices: + if short == "DG" and isSimplical(cell): + short = "OPB" + + # Choose the correct finite element implementation + if short == "CG": + if isSimplical(cell): + if dim in (1, 2, 3): + include_file("dune/pdelab/finiteelementmap/pkfem.hh", filetag="driver") + return "using {} = Dune::PDELab::PkLocalFiniteElementMap<{}, {}, {}, {}>;" \ + .format(name, gv, df, r, degree) + else: + raise CodegenUnsupportedFiniteElementError(element) + elif isQuadrilateral(cell): + if dim in (2, 3) and degree < 3: + include_file("dune/pdelab/finiteelementmap/qkfem.hh", filetag="driver") + return "using {} = Dune::PDELab::QkLocalFiniteElementMap<{}, {}, {}, {}>;" \ + .format(name, gv, df, r, degree) + else: + raise CodegenUnsupportedFiniteElementError(element) + else: + raise CodegenUnsupportedFiniteElementError(element) + elif short == "DG": + if isQuadrilateral(cell): + if dim < 4: + include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="driver") + return "using {} = Dune::PDELab::QkDGLocalFiniteElementMap<{}, {}, {}, {}>;" \ + .format(name, df, r, degree, dim) + else: + raise CodegenUnsupportedFiniteElementError(element) + else: + raise CodegenUnsupportedFiniteElementError(element) + elif short == "GL": + raise NotImplementedError("Gauss-Legendre polynomials not implemented") + elif short == "DGLL": + raise NotImplementedError("Discontinuous Gauss-Lobatto-Legendre polynomials not implemented") + elif short == "OPB": + if isQuadrilateral(cell): + gt = "Dune::GeometryType::cube" + elif isSimplical(cell): + gt = "Dune::GeometryType::simplex" + else: + raise CodegenUnsupportedFiniteElementError(element) + + include_file("dune/pdelab/finiteelementmap/opbfem.hh", filetag="driver") + return "using {} = Dune::PDELab::OPBLocalFiniteElementMap<{}, {}, {}, {}, {}>;" \ + .format(name, df, r, degree, dim, gt) + elif short == "Monom": + raise NotImplementedError("Monomials basis DG not implemented") + elif short == "RaTu": + raise NotImplementedError("Rannacher-Turek elements not implemented") + elif short == "RT": + raise NotImplementedError("Raviart-Thomas elements not implemented") + elif short == "BDM": + raise NotImplementedError("Brezzi-Douglas-Marini elements not implemented") + else: + raise CodegenUnsupportedFiniteElementError(element) def type_fem(element): @@ -188,23 +223,13 @@ def type_fem(element): @preamble(section="fem") def define_fem(element, name): femtype = type_fem(element) - from dune.codegen.pdelab.driver import isDG - if isinstance(element, TensorProductElement): - # Only allow TensorProductElements where all subelements are - # of the same type ('CG' or 'DG') - assert(len(set(subel._short_name for subel in element.sub_elements())) == 1) - if isDG(element.sub_elements()[0]): - return "{} {};".format(femtype, name) - else: - assert(isLagrange(element.sub_elements()[0])) - gv = name_leafview() - return "{} {}({});".format(femtype, name, gv) - elif isDG(element): - return "{} {};".format(femtype, name) - else: - assert(isLagrange(element)) + + # Determine whether the FEM is grid-dependent - currently on the Lagrange elements are + if get_short_name(element) == "CG": gv = name_leafview() return "{} {}({});".format(femtype, name, gv) + else: + return "{} {};".format(femtype, name) def name_fem(element): diff --git a/python/dune/codegen/pdelab/driver/gridoperator.py b/python/dune/codegen/pdelab/driver/gridoperator.py index abc878f0715dbcb4a0501d56247c4f03c98f20df..bdf8b2f6245acb63b2a67c518ab98fef597672af 100644 --- a/python/dune/codegen/pdelab/driver/gridoperator.py +++ b/python/dune/codegen/pdelab/driver/gridoperator.py @@ -77,8 +77,7 @@ def typedef_localoperator(name, form_ident): filename = get_form_option("filename", form_ident) include_file(filename, filetag="driver") lopname = localoperator_basename(form_ident) - range_type = type_range() - return "using {} = {}<{}, {}, {}>;".format(name, lopname, ugfs, vgfs, range_type) + return "using {} = {}<{}, {}>;".format(name, lopname, ugfs, vgfs) def type_localoperator(form_ident): diff --git a/python/dune/codegen/pdelab/driver/instationary.py b/python/dune/codegen/pdelab/driver/instationary.py index 5cf7170f1dca6f4274db4bf77b4b1627466e20cb..355fd0e743f6a59716319241018d3c5794c784bd 100644 --- a/python/dune/codegen/pdelab/driver/instationary.py +++ b/python/dune/codegen/pdelab/driver/instationary.py @@ -132,10 +132,25 @@ def name_time(): def typedef_timesteppingmethod(name): r_type = type_range() explicit = get_option('explicit_time_stepping') + order = get_option('time_stepping_order') if explicit: - return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type) + if order == 1: + return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type) + elif order == 2: + return "using {} = Dune::PDELab::HeunParameter<{}>;".format(name, r_type) + elif order == 3: + return "using {} = Dune::PDELab::Shu3Parameter<{}>;".format(name, r_type) + elif order == 4: + return "using {} = Dune::PDELab::RK4Parameter<{}>;".format(name, r_type) + else: + raise NotImplementedError("Time stepping method not supported") else: - return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type) + if order == 1: + return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type) + elif order == 2: + return "using {} = Dune::PDELab::Alexander2Parameter<{}>;".format(name, r_type) + elif order == 3: + return "using {} = Dune::PDELab::Alexander3Parameter<{}>;".format(name, r_type) def type_timesteppingmethod(): @@ -150,8 +165,12 @@ def define_timesteppingmethod(name): if explicit: return "{} {};".format(tsm_type, name) else: - ini = name_initree() - return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini) + order = get_option('time_stepping_order') + if order == 1: + ini = name_initree() + return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini) + else: + return "{} {};".format(tsm_type, name) def name_timesteppingmethod(): diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py index aeca64d46c73f2327b48c22f07dca7a85a044104..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644 --- a/python/dune/codegen/pdelab/driver/timings.py +++ b/python/dune/codegen/pdelab/driver/timings.py @@ -4,7 +4,7 @@ from dune.codegen.generation import (cached, include_file, pre_include, preamble, - ) + post_include) from dune.codegen.options import get_option from dune.codegen.pdelab.driver import (get_form_ident, is_linear, @@ -24,6 +24,9 @@ from dune.codegen.pdelab.driver.solve import (name_vector, ) +_sde_marks = {} + + @preamble(section="timings") def define_timing_identifier(name): ini = name_initree() @@ -125,6 +128,17 @@ def local_operator_likwid(): return "{}.register_likwid_timers();".format(lop_name) +@preamble(section="timings") +def local_operator_ssc_marks(): + lop_name = name_localoperator(get_form_ident()) + return "{}.dump_ssc_marks();".format(lop_name) + + +def ssc_macro(): + return '#define __SSC_MARK(x) do{ __asm__ __volatile__' \ + '("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)' + + @cached def setup_timer(): # TODO check that we are using YASP? @@ -138,6 +152,10 @@ def setup_timer(): logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably") local_operator_likwid() finalize_likwid() + elif get_option("use_sde"): + post_include(ssc_macro(), filetag='driver') + if get_option('instrumentation_level') >= 3: + local_operator_ssc_marks() else: from dune.codegen.loopy.target import type_floatingpoint pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver") @@ -156,14 +174,26 @@ def init_region_timer(region): setup_timer() if get_option("use_likwid"): init_likwid_timer(region) + elif get_option("use_sde"): + pass else: from dune.codegen.generation import post_include post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver") +def get_region_marks(region, driver): + if driver: + return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 11, (2 * (len(_sde_marks) + 1) + 1) * 11)) + else: + return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 1, (2 * (len(_sde_marks) + 1) + 1) * 1)) + + def start_region_timer(region): if get_option("use_likwid"): return ["LIKWID_MARKER_START(\"{}\");".format(region)] + elif get_option("use_sde"): + marks = get_region_marks(region, driver=True) + return ["__SSC_MARK(0x{});".format(marks[0])] else: return ["HP_TIMER_START({});".format(region)] @@ -171,6 +201,10 @@ def start_region_timer(region): def stop_region_timer(region): if get_option("use_likwid"): return ["LIKWID_MARKER_STOP(\"{}\");".format(region)] + elif get_option("use_sde"): + marks = get_region_marks(region, driver=True) + return ["__SSC_MARK(0x{});".format(marks[1]), + "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, *marks)] else: timestream = name_timing_stream() return ["HP_TIMER_STOP({});".format(region), @@ -207,7 +241,7 @@ def timed_region(region, actions): init_region_timer(region) - if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'): + if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")): timestream = name_timing_stream() lop_name = name_localoperator(get_form_ident()) print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) diff --git a/python/dune/codegen/pdelab/geometry.py b/python/dune/codegen/pdelab/geometry.py index b9d5b0d03e9c137c30fa49bfefee18d835e7f003..df6b91548a5ccf217b3e36a7d176be828a35f374 100644 --- a/python/dune/codegen/pdelab/geometry.py +++ b/python/dune/codegen/pdelab/geometry.py @@ -12,9 +12,7 @@ from dune.codegen.generation import (class_member, temporary_variable, valuearg, ) -from dune.codegen.options import (get_form_option, - option_switch, - ) +from dune.codegen.options import get_form_option from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint from dune.codegen.pdelab.quadrature import (quadrature_preamble, ) @@ -89,6 +87,10 @@ class GenericPDELabGeometryMixin(GeometryMixinBase): if restriction == Restriction.NONE: return local + return self._to_cell(local, restriction) + + @kernel_cached + def _to_cell(self, local, restriction): basename = get_pymbolic_basename(local) name = "{}_in_{}side".format(basename, "in" if restriction is Restriction.POSITIVE else "out") temporary_variable(name, shape=(world_dimension(),), shape_impl=("fv",)) @@ -269,9 +271,9 @@ class EquidistantGeometryMixin(AxiparallelGeometryMixin): @preamble(kernel="operator") def _define_jacobian_determinant_eval(self, name): - from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param, lop_template_range_field + from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param gfs = name_ansatz_gfs_constructor_param() - rft = lop_template_range_field() + rft = type_floatingpoint() return "{} = {}.gridView().template begin<0>()->geometry().integrationElement(Dune::FieldVector<{}, {}>());".format(name, gfs, rft, world_dimension()) def define_jacobian_inverse_transposed(self, name, restriction): @@ -289,9 +291,9 @@ class EquidistantGeometryMixin(AxiparallelGeometryMixin): @preamble(kernel="operator") def _define_jacobian_inverse_transposed_eval(self, name): - from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param, lop_template_range_field + from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param gfs = name_ansatz_gfs_constructor_param() - rft = lop_template_range_field() + rft = type_floatingpoint() return "{} = {}.gridView().template begin<0>()->geometry().jacobianInverseTransposed(Dune::FieldVector<{}, {}>());".format(name, gfs, rft, world_dimension()) diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py index dfc553fbefee7fa8c2e6aa1d92ae7ea047acdd38..835fe44e24df9f0849391e1f50422512df43c044 100644 --- a/python/dune/codegen/pdelab/localoperator.py +++ b/python/dune/codegen/pdelab/localoperator.py @@ -7,8 +7,8 @@ import numpy as np from dune.codegen.options import (get_form_option, get_option, - option_switch, - set_form_option) + form_option_context, + ) from dune.codegen.generation import (accumulation_mixin, base_class, class_basename, @@ -26,18 +26,22 @@ from dune.codegen.generation import (accumulation_mixin, iname, include_file, initializer_list, + kernel_cached, post_include, retrieve_cache_functions, retrieve_cache_items, ReturnArg, run_hook, template_parameter, + dump_ssc_marks ) from dune.codegen.cgen.clazz import (AccessModifier, BaseClass, ClassMember, ) +from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.ufl.modified_terminals import Restriction +from frozendict import frozendict import dune.codegen.loopy.mangler @@ -72,11 +76,6 @@ def name_test_gfs_constructor_param(): return "gfsv" -@template_parameter(classtag="operator") -def lop_template_range_field(): - return "RF" - - @class_member(classtag="operator") def lop_domain_field(name): # TODO: Rethink for not Galerkin Method @@ -153,6 +152,11 @@ def enum_alpha(): return _enum_alpha(ufl_measure_to_pdelab_measure(integral_type)) +@class_member(classtag="operator") +def enum_skeleton_twosided(): + return "enum { doSkeletonTwoSided = true };" + + def name_initree_member(): define_initree("_iniParams") return "_iniParams" @@ -279,7 +283,7 @@ def determine_accumulation_space(info, number): return AccumulationSpace(lfs=lfs, index=lfsi, restriction=info.restriction, - element=element + element=subel ) @@ -344,7 +348,11 @@ class AccumulationMixinBase(object): @accumulation_mixin("generic") class GenericAccumulationMixin(AccumulationMixinBase): def get_accumulation_info(self, expr): - return get_accumulation_info(expr, self) + restriction = self.restriction + if self.measure == 'exterior_facet': + restriction = Restriction.POSITIVE + + return get_accumulation_info(expr, restriction, self.indices, self) def list_accumulation_infos(self, expr): return list_accumulation_infos(expr, self) @@ -402,19 +410,16 @@ def list_accumulation_infos(expr, visitor): return itertools.product(testgen, trialgen) -def get_accumulation_info(expr, visitor): +@kernel_cached +def get_accumulation_info(expr, restriction, indices, visitor): element = expr.ufl_element() leaf_element = element element_index = 0 from ufl import MixedElement if isinstance(expr.ufl_element(), MixedElement): - element_index = visitor.indices[0] + element_index = indices[0] leaf_element = element.extract_component(element_index)[1] - restriction = visitor.restriction - if visitor.measure == 'exterior_facet': - restriction = Restriction.POSITIVE - inames = visitor.lfs_inames(leaf_element, restriction, expr.number() @@ -448,7 +453,6 @@ def generate_accumulation_instruction(expr, visitor): ) from dune.codegen.generation import instruction - from dune.codegen.options import option_switch quad_inames = visitor.quadrature_inames() lfs_inames = frozenset(visitor.test_info.inames) if visitor.trial_info: @@ -495,7 +499,8 @@ def visit_integral(integral): # Start the visiting process! visitor = get_visitor(measure, subdomain_id) - visitor.accumulate(integrand) + with global_context(visitor=visitor): + visitor.accumulate(integrand) run_hook(name="after_visit", args=(visitor,), @@ -505,24 +510,35 @@ def visit_integral(integral): def generate_kernel(integrals): logger = logging.getLogger(__name__) - # Visit all integrals once to collect information (dry-run)! - logger.debug('generate_kernel: visit_integrals (dry run)') - with global_context(dry_run=True): + # Assert that metadata for a given measure type agrees. This is a limitation + # of our current approach that is hard to overcome. + def remove_nonuser_metadata(d): + return frozendict({k: v for k, v in d.items() if k != "estimated_polynomial_degree"}) + + meta_dicts = [remove_nonuser_metadata(i.metadata()) for i in integrals] + if len(set(meta_dicts)) > 1: + measure = get_global_context_value("measure") + raise CodegenUFLError("Measure {} used with varying metadata! dune-codegen does not currently support this.") + + with form_option_context(**meta_dicts[0]): + # Visit all integrals once to collect information (dry-run)! + logger.debug('generate_kernel: visit_integrals (dry run)') + with global_context(dry_run=True): + for integral in integrals: + visit_integral(integral) + + # Now perform some checks on what should be done + from dune.codegen.sumfact.vectorization import decide_vectorization_strategy + logger.debug('generate_kernel: decide_vectorization_strategy') + decide_vectorization_strategy() + + # Delete the cache contents and do the real thing! + logger.debug('generate_kernel: visit_integrals (no dry run)') + from dune.codegen.generation import delete_cache_items + delete_cache_items("kernel_default") for integral in integrals: visit_integral(integral) - # Now perform some checks on what should be done - from dune.codegen.sumfact.vectorization import decide_vectorization_strategy - logger.debug('generate_kernel: decide_vectorization_strategy') - decide_vectorization_strategy() - - # Delete the cache contents and do the real thing! - logger.debug('generate_kernel: visit_integrals (no dry run)') - from dune.codegen.generation import delete_cache_items - delete_cache_items("kernel_default") - for integral in integrals: - visit_integral(integral) - from dune.codegen.pdelab.signatures import kernel_name, assembly_routine_signature name = kernel_name() signature = assembly_routine_signature() @@ -691,6 +707,19 @@ class RegisterLikwidMethod(ClassMember): ClassMember.__init__(self, content) +class RegisterSSCMarksMethod(ClassMember): + def __init__(self): + knl = name_example_kernel() + assert(knl is not None) + + content = ["void dump_ssc_marks()" + "{"] + register_liwkid_timers = [i for i in retrieve_cache_items(condition='register_ssc_marks')] + content.extend(map(lambda x: ' ' + x, register_liwkid_timers)) + content += ["}"] + ClassMember.__init__(self, content) + + class LoopyKernelMethod(ClassMember): def __init__(self, signature, kernel, add_timings=True, initializer_list=[]): from loopy import generate_body @@ -718,6 +747,12 @@ class LoopyKernelMethod(ClassMember): init_likwid_timer(timer_name) content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name)) register_liwkid_timer(timer_name) + elif get_option('use_sde'): + from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro + post_include(ssc_macro(), filetag='operatorfile') + marks = get_region_marks(timer_name, driver=False) + content.append(' ' + '__SSC_MARK(0x{});'.format(marks[0])) + dump_ssc_marks(timer_name) else: post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') content.append(' ' + 'HP_TIMER_START({});'.format(timer_name)) @@ -730,6 +765,11 @@ class LoopyKernelMethod(ClassMember): init_likwid_timer(setuptimer) content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer)) register_liwkid_timer(setuptimer) + elif get_option('use_sde'): + from dune.codegen.pdelab.driver.timings import get_region_marks + setup_marks = get_region_marks(setuptimer, driver=False) + content.append(' ' + '__SSC_MARK(0x{});'.format(setup_marks[0])) + dump_ssc_marks(setuptimer) else: post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile') content.append(' HP_TIMER_START({});'.format(setuptimer)) @@ -742,6 +782,8 @@ class LoopyKernelMethod(ClassMember): if add_timings and get_option('instrumentation_level') >= 4: if get_option('use_likwid'): content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer)) + elif get_option('use_sde'): + content.append(' ' + '__SSC_MARK(0x{});'.format(setup_marks[1])) else: content.append(' ' + 'HP_TIMER_STOP({});'.format(setuptimer)) @@ -752,6 +794,8 @@ class LoopyKernelMethod(ClassMember): if add_timings and get_option('instrumentation_level') >= 3: if get_option('use_likwid'): content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name)) + elif get_option('use_sde'): + content.append(' ' + '__SSC_MARK(0x{});'.format(marks[1])) else: content.append(' ' + 'HP_TIMER_STOP({});'.format(timer_name)) @@ -823,7 +867,6 @@ def local_operator_default_settings(operator, form): localoperator_basename(operator) lop_template_ansatz_gfs() lop_template_test_gfs() - lop_template_range_field() # Make sure there is always the same constructor arguments, even if some of them are # not strictly needed. Also ensure the order. @@ -837,10 +880,18 @@ def local_operator_default_settings(operator, form): base_class('Dune::PDELab::LocalOperatorDefaultFlags', classtag="operator") from dune.codegen.pdelab.driver import is_stationary if not is_stationary(): - rf = lop_template_range_field() + rf = type_floatingpoint() base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>' .format(rf), classtag="operator") + # *always* add the volume pattern, PDELab cannot handle matrices without diagonal blocks + with global_context(integral_type="cell"): + enum_pattern() + pattern_baseclass() + + if get_form_option("block_preconditioner_diagonal") or get_form_option("block_preconditioner_pointdiagonal"): + enum_skeleton_twosided() + def measure_is_enabled(measure): option_dict = {"cell": "enable_volume", @@ -855,6 +906,16 @@ def generate_residual_kernels(form, original_form): if not get_form_option("generate_residuals"): return {} + if get_form_option("block_preconditioner_pointdiagonal"): + from ufl import derivative + jacform = derivative(original_form, original_form.coefficients()[0]) + + from dune.codegen.ufl.preprocess import preprocess_form + jacform = preprocess_form(jacform).preprocessed_form + + from dune.codegen.ufl.transformations.blockpreconditioner import diagonal_block_jacobian + form = diagonal_block_jacobian(jacform) + logger = logging.getLogger(__name__) with global_context(form_type='residual'): operator_kernels = {} @@ -957,6 +1018,12 @@ def generate_jacobian_kernels(form, original_form): from dune.codegen.pdelab.signatures import assembler_routine_name with global_context(kernel=assembler_routine_name()): kernel = [k for k in generate_kernels_per_integral(jac_apply_form.integrals_by_type(measure))] + + if kernel: + enum_pattern() + pattern_baseclass() + enum_alpha() + operator_kernels[(measure, 'jacobian_apply')] = kernel # Generate dummy functions for those kernels, that vanished in the differentiation process @@ -970,13 +1037,9 @@ def generate_jacobian_kernels(form, original_form): operator_kernels[(it, 'jacobian_apply')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)] if get_form_option("generate_jacobians"): with global_context(form_type="jacobian"): - if get_form_option("generate_jacobians"): - if get_form_option("sumfact"): - was_sumfact = True - if get_form_option("sumfact_regular_jacobians"): - old_geometry_mixins = get_form_option("geometry_mixins") - set_form_option("geometry_mixins", "generic") - set_form_option("sumfact", False) + with form_option_context(conditional=get_form_option("sumfact") and get_form_option("sumfact_regular_jacobians"), + geometry_mixins="generic", + sumfact=False): for measure in set(i.integral_type() for i in jacform.integrals()): if not measure_is_enabled(measure): continue @@ -986,6 +1049,12 @@ def generate_jacobian_kernels(form, original_form): from dune.codegen.pdelab.signatures import assembler_routine_name with global_context(kernel=assembler_routine_name()): kernel = [k for k in generate_kernels_per_integral(jacform.integrals_by_type(measure))] + + if kernel: + enum_pattern() + pattern_baseclass() + enum_alpha() + operator_kernels[(measure, 'jacobian')] = kernel # Generate dummy functions for those kernels, that vanished in the differentiation process @@ -997,10 +1066,6 @@ def generate_jacobian_kernels(form, original_form): with global_context(integral_type=it): from dune.codegen.pdelab.signatures import assembly_routine_signature operator_kernels[(it, 'jacobian')] = [LoopyKernelMethod(assembly_routine_signature(), kernel=None)] - if get_form_option("sumfact_regular_jacobians"): - if was_sumfact: - set_form_option("sumfact", True) - set_form_option("geometry_mixins", old_geometry_mixins) return operator_kernels @@ -1184,6 +1249,8 @@ def generate_localoperator_file(kernels, filename): include_file('dune/codegen/common/timer.hh', filetag='operatorfile') if get_option('use_likwid'): operator_methods.append(RegisterLikwidMethod()) + elif get_option('use_sde'): + operator_methods.append(RegisterSSCMarksMethod()) else: operator_methods.append(TimerMethod()) elif get_option('opcounter'): diff --git a/python/dune/codegen/pdelab/quadrature.py b/python/dune/codegen/pdelab/quadrature.py index ae6a7e2db7212b254e3857bc86813ea420ed0251..ca44056ce82037124cc3ea2a9781e6eb1feb9692 100644 --- a/python/dune/codegen/pdelab/quadrature.py +++ b/python/dune/codegen/pdelab/quadrature.py @@ -7,12 +7,14 @@ from dune.codegen.generation import (class_member, iname, include_file, instruction, + kernel_cached, preamble, quadrature_mixin, temporary_variable, valuearg, ) -from dune.codegen.pdelab.localoperator import lop_template_range_field, name_ansatz_gfs_constructor_param +from dune.codegen.loopy.target import type_floatingpoint +from dune.codegen.pdelab.localoperator import name_ansatz_gfs_constructor_param from dune.codegen.options import get_form_option from pymbolic.primitives import Variable, Subscript @@ -51,7 +53,7 @@ class GenericQuadratureMixin(QuadratureMixinBase): @class_member(classtag="operator") def define_quadrature_weights(self, name): - rf = lop_template_range_field() + rf = type_floatingpoint() from dune.codegen.pdelab.geometry import local_dimension dim = local_dimension() self.eval_quadrature_weights(name) @@ -70,6 +72,7 @@ class GenericQuadratureMixin(QuadratureMixinBase): def quadrature_inames(self): return (quadrature_iname(),) + @kernel_cached def quadrature_position(self, index=None): from dune.codegen.pdelab.geometry import local_dimension dim = local_dimension() @@ -89,7 +92,7 @@ class GenericQuadratureMixin(QuadratureMixinBase): @class_member(classtag="operator") def define_quadrature_points(self, name): - rf = lop_template_range_field() + rf = type_floatingpoint() from dune.codegen.pdelab.geometry import local_dimension dim = local_dimension() self.eval_quadrature_points(name) @@ -200,7 +203,7 @@ def quadrature_order(): possible to use a different quadrature_order per direction. """ if get_form_option("quadrature_order"): - quadrature_order = tuple(map(int, get_form_option("quadrature_order").split(','))) + quadrature_order = tuple(map(int, str(get_form_option("quadrature_order")).split(','))) else: quadrature_order = _estimate_quadrature_order() diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644 --- a/python/dune/codegen/pdelab/signatures.py +++ b/python/dune/codegen/pdelab/signatures.py @@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable, name_coefficientcontainer, type_coefficientcontainer, name_applycontainer, + type_linearizationpointcontainer, ) from dune.codegen.pdelab.spaces import (name_testfunctionspace, type_testfunctionspace, @@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, avt) + return (geot, lfsut, cct, lpt, lfsvt, avt) def nonlinear_jacobian_apply_volume_args(): @@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, avt) + return (geot, lfsut, cct, lpt, lfsvt, avt) def nonlinear_jacobian_apply_boundary_args(): @@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt) + return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt) def nonlinear_jacobian_apply_skeleton_args(): diff --git a/python/dune/codegen/pdelab/spaces.py b/python/dune/codegen/pdelab/spaces.py index d8c3c5684ce95e5a80baca6b8da11753f997607c..19de9bc7e7d0fa8bf14327eea39ff559d3450da7 100644 --- a/python/dune/codegen/pdelab/spaces.py +++ b/python/dune/codegen/pdelab/spaces.py @@ -5,6 +5,7 @@ from dune.codegen.generation import (class_member, function_mangler, generator_factory, include_file, + kernel_cached, preamble, valuearg, ) @@ -122,15 +123,12 @@ name_lfs = partial(_function_space_traversal, defaultname=available_lfs_names, r type_gfs = partial(_function_space_traversal, defaultname=available_gfs_names, recfunc=_type_gfs) -def initialize_function_spaces(expr, visitor): - restriction = visitor.restriction - if visitor.measure == 'exterior_facet': - restriction = Restriction.POSITIVE - +@kernel_cached +def initialize_function_spaces(expr, restriction, indices): index = None from ufl import MixedElement if isinstance(expr.ufl_element(), MixedElement): - index = visitor.indices[0] + index = indices[0] from ufl.classes import Argument, Coefficient if isinstance(expr, Argument) and expr.number() == 0: diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py index a924a39ae1bba19a94c47e5542567d8717cedf57..7a86ba52eff5f255c3d040d31951043a65266890 100644 --- a/python/dune/codegen/pdelab/tensors.py +++ b/python/dune/codegen/pdelab/tensors.py @@ -1,12 +1,12 @@ """ Code generation for explicitly specified tensors """ from dune.codegen.generation import (get_counted_variable, - domain, kernel_cached, - iname, instruction, temporary_variable, ) +from dune.codegen.loopy.symbolic import FusedMultiplyAdd as FMA +from loopy.match import Writes import pymbolic.primitives as prim import numpy as np @@ -14,6 +14,116 @@ import loopy as lp import itertools as it +def define_determinant(name, matrix, shape, visitor): + temporary_variable(name, managed=True) + + assert len(shape) == 2 and shape[0] == shape[1] + dim = shape[0] + + matrix_entry = [[prim.Subscript(prim.Variable(matrix), (i, j)) for j in range(dim)] for i in range(dim)] + if dim == 2: + expr_determinant = FMA(matrix_entry[0][0], matrix_entry[1][1], -1 * matrix_entry[1][0] * matrix_entry[0][1]) + + elif dim == 3: + fma_A = FMA(matrix_entry[1][1], matrix_entry[2][2], -1 * matrix_entry[1][2] * matrix_entry[2][1]) + fma_B = FMA(matrix_entry[1][0], matrix_entry[2][2], -1 * matrix_entry[1][2] * matrix_entry[2][0]) + fma_C = FMA(matrix_entry[1][0], matrix_entry[2][1], -1 * matrix_entry[1][1] * matrix_entry[2][0]) + + expr_determinant = FMA(matrix_entry[0][2], fma_C, + FMA(matrix_entry[0][0], fma_A, -1 * matrix_entry[0][1] * fma_B)) + else: + raise NotImplementedError() + instruction(expression=expr_determinant, + assignee=prim.Variable(name), + within_inames=frozenset(visitor.quadrature_inames()), + depends_on=frozenset({Writes(matrix)}) + ) + + +def define_determinant_inverse(name, matrix, shape, visitor): + det = name_determinant(matrix, shape, visitor) + + temporary_variable(name, managed=True) + + instruction(expression=prim.Quotient(1, prim.Variable(det)), + assignee=prim.Variable(name), + within_inames=frozenset(visitor.quadrature_inames()), + depends_on=frozenset({Writes(matrix), Writes(det)}) + ) + + +def define_matrix_inverse(name, name_inv, shape, visitor): + temporary_variable(name_inv, shape=shape, managed=True) + + det_inv = name_determinant_inverse(name, shape, visitor) + + assert len(shape) == 2 and shape[0] == shape[1] + dim = shape[0] + + matrix_entry = [[prim.Subscript(prim.Variable(name), (i, j)) for j in range(dim)] for i in range(dim)] + assignee = [[prim.Subscript(prim.Variable(name_inv), (i, j)) for j in range(dim)] for i in range(dim)] + exprs = [[None for _ in range(dim)] for _ in range(dim)] + + if dim == 2: + for i in range(2): + for j in range(2): + sign = 1. if i == j else -1. + exprs[i][j] = prim.Product((sign, prim.Variable(det_inv), matrix_entry[1 - i][1 - j])) + elif dim == 3: + exprs[0][0] = prim.Variable(det_inv) * FMA(matrix_entry[1][1], matrix_entry[2][2], + -1 * matrix_entry[1][2] * matrix_entry[2][1]) + exprs[1][0] = prim.Variable(det_inv) * FMA(matrix_entry[0][1], matrix_entry[2][2], + -1 * matrix_entry[0][2] * matrix_entry[2][1]) * -1 + exprs[2][0] = prim.Variable(det_inv) * FMA(matrix_entry[0][1], matrix_entry[1][2], + -1 * matrix_entry[0][2] * matrix_entry[1][1]) + + exprs[0][1] = prim.Variable(det_inv) * FMA(matrix_entry[1][0], matrix_entry[2][2], + -1 * matrix_entry[1][2] * matrix_entry[2][0]) * -1 + exprs[1][1] = prim.Variable(det_inv) * FMA(matrix_entry[0][0], matrix_entry[2][2], + -1 * matrix_entry[0][2] * matrix_entry[2][0]) + exprs[2][1] = prim.Variable(det_inv) * FMA(matrix_entry[0][0], matrix_entry[1][2], + -1 * matrix_entry[0][2] * matrix_entry[1][0]) * -1 + + exprs[0][2] = prim.Variable(det_inv) * FMA(matrix_entry[1][0], matrix_entry[2][1], + -1 * matrix_entry[1][1] * matrix_entry[2][0]) + exprs[1][2] = prim.Variable(det_inv) * FMA(matrix_entry[0][0], matrix_entry[2][1], + -1 * matrix_entry[0][1] * matrix_entry[2][0]) * -1 + exprs[2][2] = prim.Variable(det_inv) * FMA(matrix_entry[0][0], matrix_entry[1][1], + -1 * matrix_entry[0][1] * matrix_entry[1][0]) + else: + raise NotImplementedError + for j in range(dim): + for i in range(dim): + instruction(expression=exprs[i][j], + assignee=assignee[i][j], + within_inames=frozenset(visitor.quadrature_inames()), + depends_on=frozenset({Writes(name), Writes(det_inv)})) + + +def name_determinant(matrix, shape, visitor): + name = matrix + "_det" + + define_determinant(name, matrix, shape, visitor) + + return name + + +def name_determinant_inverse(matrix, shape, visitor): + name = matrix + "_det_inv" + + define_determinant_inverse(name, matrix, shape, visitor) + + return name + + +def name_matrix_inverse(name, shape, visitor): + name_inv = name + "_inv" + + define_matrix_inverse(name, name_inv, shape, visitor) + + return name_inv + + def define_assembled_tensor(name, expr, visitor): temporary_variable(name, shape=expr.ufl_shape, @@ -22,7 +132,7 @@ def define_assembled_tensor(name, expr, visitor): visitor.indices = indices instruction(assignee=prim.Subscript(prim.Variable(name), indices), expression=visitor.call(expr), - forced_iname_deps=frozenset(visitor.interface.quadrature_inames()), + forced_iname_deps=frozenset(visitor.quadrature_inames()), depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}), tags=frozenset({"quad"}), ) @@ -36,18 +146,47 @@ def name_assembled_tensor(o, visitor): @kernel_cached +def code_generation_time_inversion(expr, visitor): + mat = np.ndarray(expr.ufl_shape) + for indices in it.product(*tuple(range(i) for i in expr.ufl_shape)): + visitor.indices = indices + val = visitor.call(expr.ufl_operands[0]) + if not isinstance(val, (float, int)): + visitor.indices = None + return None + + mat[indices] = val + + visitor.indices = None + return np.linalg.inv(mat) + + def pymbolic_matrix_inverse(o, visitor): + # Try to evaluate the matrix at code generation time. + # If this works (it does e.g. for Maxwell on structured grids) + # we can invert the matrix at code generation time!!! indices = visitor.indices visitor.indices = None - name = name_assembled_tensor(o.ufl_operands[0], visitor) - - instruction(code="{}.invert();".format(name), - within_inames=frozenset(visitor.interface.quadrature_inames()), - depends_on=frozenset({lp.match.Writes(name), - lp.match.Tagged("sumfact_stage1"), - }), - tags=frozenset({"quad"}), - ) + + mat = code_generation_time_inversion(o, visitor) + if mat is not None: + return mat[indices] + + # If code generation time inversion failed, we assemble it in C++ + # and invert it there. + expr = o.ufl_operands[0] + name = name_assembled_tensor(expr, visitor) + + if expr.shape[0] <= 3: + name = name_matrix_inverse(name, expr.ufl_shape, visitor) + else: + instruction(code="{}.invert();".format(name), + within_inames=frozenset(visitor.quadrature_inames()), + depends_on=frozenset({lp.match.Writes(name), + lp.match.Tagged("sumfact_stage1"), + }), + tags=frozenset({name}), + ) visitor.indices = indices return prim.Variable(name) diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py index c1ccd13a5552980db0be3788651065dc8a39654c..25be6f7eb80b365d8eead39c3cbb93c0e36c52bc 100644 --- a/python/dune/codegen/sumfact/accumulation.py +++ b/python/dune/codegen/sumfact/accumulation.py @@ -39,9 +39,6 @@ from dune.codegen.sumfact.permutation import (permute_backward, from dune.codegen.sumfact.tabulation import (basis_functions_per_direction, construct_basis_matrix_sequence, ) -from dune.codegen.sumfact.switch import (get_facedir, - get_facemod, - ) from dune.codegen.sumfact.symbolic import SumfactKernel, SumfactKernelInterfaceBase from dune.codegen.ufl.modified_terminals import extract_modified_arguments from dune.codegen.tools import get_pymbolic_basename, get_leaf, ImmutableCuttingRecord @@ -55,6 +52,7 @@ import pymbolic.primitives as prim from loopy.symbolic import WalkMapper import ufl.classes as uc from ufl import FiniteElement, MixedElement, TensorProductElement +import itertools basis_sf_kernels = generator_factory(item_tags=("basis_sf_kernels",), context_tags='kernel', no_deco=True) @@ -161,8 +159,10 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): if self.trial_element is None: return () else: - from dune.codegen.sumfact.basis import SumfactBasisMixin - return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction) + mixin = get_form_option("basis_mixins") + from dune.codegen.generation import construct_from_mixins + MixinType = construct_from_mixins(mixins=[mixin], mixintype="basis", name="MixinType") + return MixinType.lfs_inames(MixinType(), get_leaf(self.trial_element, self.trial_element_index), self.restriction) def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # The result of stage 2 has the correct quadrature permutation but no @@ -351,11 +351,84 @@ class SumfactAccumulationMixin(AccumulationMixinBase): return get_accumulation_info(expr, self) def list_accumulation_infos(self, expr): - return list_accumulation_infos(expr, self) + return itertools.product(_gradsplitting_generator(expr, self), + _trial_generator(expr, self), + ) def generate_accumulation_instruction(self, expr): return generate_accumulation_instruction(expr, self) + def get_facedir(self, restriction): + from dune.codegen.pdelab.restriction import Restriction + if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet": + return get_global_context_value("facedir_s") + if restriction == Restriction.NEGATIVE: + return get_global_context_value("facedir_n") + return None + + def get_facemod(self, restriction): + from dune.codegen.pdelab.restriction import Restriction + if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet": + return get_global_context_value("facemod_s") + if restriction == Restriction.NEGATIVE: + return get_global_context_value("facemod_n") + return None + + def additional_matrix_sequence(self): + return None + + @property + def prohibit_jacobian(self): + return False + + +@accumulation_mixin("sumfact_pointdiagonal") +class SumfactPointDiagonalAccumulationMixin(SumfactAccumulationMixin): + def additional_matrix_sequence(self): + info = self.current_info[1] + return construct_basis_matrix_sequence(transpose=True, + derivative=info.grad_index, + facedir=self.get_facedir(info.restriction), + facemod=self.get_facemod(info.restriction), + basis_size=get_basis_size(info), + ) + + def get_accumulation_info(self, expr): + element = expr.ufl_element() + leaf_element = element + element_index = 0 + from ufl import MixedElement + if isinstance(expr.ufl_element(), MixedElement): + element_index = self.indices[0] + leaf_element = element.extract_component(element_index)[1] + + restriction = self.restriction + if self.measure == 'exterior_facet': + from dune.codegen.pdelab.restriction import Restriction + restriction = Restriction.POSITIVE + + grad_index = None + if self.reference_grad: + if isinstance(expr.ufl_element(), MixedElement): + grad_index = self.indices[1] + else: + grad_index = self.indices[0] + + return SumfactAccumulationInfo(element=expr.ufl_element(), + element_index=element_index, + restriction=restriction, + grad_index=grad_index, + ) + + def list_accumulation_infos(self, expr): + return itertools.product(_gradsplitting_generator(expr, self, number=0), + _gradsplitting_generator(expr, self, number=1), + ) + + @property + def prohibit_jacobian(self): + return True + class SumfactAccumulationInfo(ImmutableRecord): def __init__(self, @@ -422,9 +495,9 @@ def _get_childs(element): yield (i, element.extract_component(i)[1]) -def _test_generator(expr, visitor): +def _gradsplitting_generator(expr, visitor, number=0): from dune.codegen.ufl.modified_terminals import extract_modified_arguments - ma = extract_modified_arguments(expr, argnumber=0) + ma = extract_modified_arguments(expr, argnumber=number) if len(ma) == 0: return element = ma[0].argexpr.ufl_element() @@ -440,7 +513,8 @@ def _test_generator(expr, visitor): for res in restrictions: for ei, e in _get_childs(element): for grad in (None,) + tuple(range(dim)): - yield SumfactAccumulationInfo(element_index=ei, + yield SumfactAccumulationInfo(element=element, + element_index=ei, restriction=res, grad_index=grad) @@ -465,13 +539,20 @@ def _trial_generator(expr, visitor): yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e) -def list_accumulation_infos(expr, visitor): - import itertools - return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor)) +def get_basis_size(info): + leaf_element = info.element + element_index = info.element_index + dim = world_dimension() + from ufl import MixedElement + if isinstance(leaf_element, MixedElement): + leaf_element = leaf_element.extract_component(element_index)[1] + degree = leaf_element._degree + if isinstance(degree, int): + degree = (degree,) * dim + return tuple(deg + 1 for deg in degree) def generate_accumulation_instruction(expr, visitor): - dim = world_dimension() test_info = visitor.test_info trial_info = visitor.trial_info @@ -480,14 +561,7 @@ def generate_accumulation_instruction(expr, visitor): count_quadrature_point_operations(expr) # Number of basis functions per direction - leaf_element = test_info.element - from ufl import MixedElement - if isinstance(leaf_element, MixedElement): - leaf_element = leaf_element.extract_component(test_info.element_index)[1] - degree = leaf_element._degree - if isinstance(degree, int): - degree = (degree,) * dim - basis_size = tuple(deg + 1 for deg in degree) + basis_size = get_basis_size(test_info) # Anisotropic finite elements are not (yet) supported by Dune assert(size == basis_size[0] for size in basis_size) @@ -521,22 +595,29 @@ def generate_accumulation_instruction(expr, visitor): matrix_sequence = construct_basis_matrix_sequence( transpose=True, derivative=test_info.grad_index, - facedir=get_facedir(test_info.restriction), - facemod=get_facemod(test_info.restriction), - basis_size=basis_size) + facedir=visitor.get_facedir(test_info.restriction), + facemod=visitor.get_facemod(test_info.restriction), + basis_size=basis_size, + additional_sequence=visitor.additional_matrix_sequence()) jacobian_inames = trial_info.inames priority = test_info.grad_index if priority is None: priority = 3 + trial_element = trial_info.element + trial_element_index = trial_info.element_index + if visitor.prohibit_jacobian: + trial_element = None + trial_element_index = 0 + output = AccumulationOutput(matrix_sequence=matrix_sequence, accumvar=accumvar, restriction=(test_info.restriction, trial_info.restriction), test_element=test_info.element, test_element_index=test_info.element_index, - trial_element=trial_info.element, - trial_element_index=trial_info.element_index, + trial_element=trial_element, + trial_element_index=trial_element_index, ) sf = SumfactKernel(matrix_sequence=matrix_sequence, diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index 1e801f4550d17d89ef8972fe3ffa867d4a44ea25..7908ab3515eb43c1f667216481821024076bcf88 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -20,7 +20,7 @@ from cgen import ArrayOf, AlignedAttribute, Initializer from dune.codegen.generation import cache_restoring, delete_cache_items from dune.codegen.loopy.target import DuneTarget, type_floatingpoint from dune.codegen.sumfact.realization import realize_sumfact_kernel_function -from dune.codegen.options import get_option, set_option +from dune.codegen.options import get_option, option_context from dune.codegen.error import CodegenAutotuneError @@ -185,296 +185,281 @@ def generate_standalone_code_google_benchmark(sf, filename): delete_cache_items("kernel_default") # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Extract sum factorization kernel - from dune.codegen.pdelab.localoperator import extract_kernel_from_cache - knl = realize_sumfact_kernel_function(sf) - - # Add the implementation of the kernel. - # TODO: This can probably done in a safer way? - first_line = knl.member.lines[0] - arguments = first_line[first_line.find("(") + 1:first_line.find(")")] - - with open(filename, "w") as f: - f.writelines(["// {}".format(first_line), - "\n", - "#include \"config.h\"\n", - "#include \"benchmark/benchmark.h\"\n", - "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", - "#include<dune/codegen/common/vectorclass.hh>\n", - "#include<dune/codegen/sumfact/onedquadrature.hh>\n", - "#include<dune/codegen/sumfact/horizontaladd.hh>\n", - "#include<random>\n", - "#include<fstream>\n", - "#include<iostream>\n", - "\n" - ]) - - write_global_data(sf, filename) - - with open(filename, "a") as f: - arguments = ', '.join(sf.interface.signature_args) - if len(arguments) > 0: - arguments = ', ' + arguments - arguments = 'const char* buffer0, const char* buffer1' + arguments - f.write("void sumfact_kernel({})\n".format(arguments)) - for line in knl.member.lines[1:]: - f.write("{}\n".format(line)) - - f.write("\n\n") - f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") - - write_setup_code(sf, filename, define_thetas=False) - - additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] - additional_arguments = ', '.join(additional_arguments) - if len(additional_arguments) > 0: - additional_arguments = ', ' + additional_arguments - with open(filename, "a") as f: - f.writelines([" for (auto _ : state){\n", - " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments), - " }\n", - "}\n", - "BENCHMARK(BM_sumfact_kernel);\n", - "\n", - "BENCHMARK_MAIN();" - ]) + with option_context(opcounter=False): + # Extract sum factorization kernel + from dune.codegen.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + # Add the implementation of the kernel. + # TODO: This can probably done in a safer way? + first_line = knl.member.lines[0] + arguments = first_line[first_line.find("(") + 1:first_line.find(")")] + + with open(filename, "w") as f: + f.writelines(["// {}".format(first_line), + "\n", + "#include \"config.h\"\n", + "#include \"benchmark/benchmark.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/codegen/common/vectorclass.hh>\n", + "#include<dune/codegen/sumfact/onedquadrature.hh>\n", + "#include<dune/codegen/sumfact/horizontaladd.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "#include<iostream>\n", + "\n" + ]) + + write_global_data(sf, filename) + + with open(filename, "a") as f: + arguments = ', '.join(sf.interface.signature_args) + if len(arguments) > 0: + arguments = ', ' + arguments + arguments = 'const char* buffer0, const char* buffer1' + arguments + f.write("void sumfact_kernel({})\n".format(arguments)) + for line in knl.member.lines[1:]: + f.write("{}\n".format(line)) + + f.write("\n\n") + f.write("static void BM_sumfact_kernel(benchmark::State& state){\n") + + write_setup_code(sf, filename, define_thetas=False) + + additional_arguments = [i.split()[-1] for i in sf.interface.signature_args] + additional_arguments = ', '.join(additional_arguments) + if len(additional_arguments) > 0: + additional_arguments = ', ' + additional_arguments + with open(filename, "a") as f: + f.writelines([" for (auto _ : state){\n", + " sumfact_kernel(buffer0, buffer1{});\n".format(additional_arguments), + " }\n", + "}\n", + "BENCHMARK(BM_sumfact_kernel);\n", + "\n", + "BENCHMARK_MAIN();" + ]) def generate_standalone_code(sf, filename): delete_cache_items("kernel_default") # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Extract sum factorization kernel - from dune.codegen.pdelab.localoperator import extract_kernel_from_cache - knl = realize_sumfact_kernel_function(sf) - first_line = knl.member.lines[0] - - with open(filename, "w") as f: - f.writelines(["// {}".format(first_line), - "\n", - "#include \"config.h\"\n", - "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", - "#include<dune/codegen/common/tsc.hh>\n", - "#include<dune/codegen/common/vectorclass.hh>\n", - "#include<dune/codegen/sumfact/onedquadrature.hh>\n", - "#include<dune/codegen/sumfact/horizontaladd.hh>\n", - "#include<random>\n", - "#include<fstream>\n", - "#include<iostream>\n", - "\n" - ]) - - f.writelines(["int main(int argc, char** argv)\n", - "{\n", - ]) - - write_setup_code(sf, filename) - - # Write measurement - with open(filename, "a") as f: - # Start a TSC timer - f.writelines([" auto start = Dune::PDELab::TSC::start();\n", - ]) - - # Add the implementation of the kernel. - repeats = int(1e9 / sf.operations) - f.write(" for(int i=0; i<{}; ++i)\n".format(repeats)) - f.write(" {\n") - for line in knl.member.lines[1:]: - f.write(" {}\n".format(line)) - f.write(" }\n") - - # Stop the TSC timer and write the result to a file - f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", - " std::ofstream file;\n", - " file.open(argv[1]);\n", - " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))), - " file.close();\n", - " accum += output[dis(rng)];\n", - " std::cout << accum;\n", - "}\n", - ]) - - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + with option_context(opcounter=False): + # Extract sum factorization kernel + from dune.codegen.pdelab.localoperator import extract_kernel_from_cache + knl = realize_sumfact_kernel_function(sf) + first_line = knl.member.lines[0] + + with open(filename, "w") as f: + f.writelines(["// {}".format(first_line), + "\n", + "#include \"config.h\"\n", + "#include<dune/pdelab/finiteelementmap/qkdg.hh>\n", + "#include<dune/codegen/common/tsc.hh>\n", + "#include<dune/codegen/common/vectorclass.hh>\n", + "#include<dune/codegen/sumfact/onedquadrature.hh>\n", + "#include<dune/codegen/sumfact/horizontaladd.hh>\n", + "#include<random>\n", + "#include<fstream>\n", + "#include<iostream>\n", + "\n" + ]) + + f.writelines(["int main(int argc, char** argv)\n", + "{\n", + ]) + + write_setup_code(sf, filename) + + # Write measurement + with open(filename, "a") as f: + # Start a TSC timer + f.writelines([" auto start = Dune::PDELab::TSC::start();\n", + ]) + + # Add the implementation of the kernel. + repeats = int(1e9 / sf.operations) + f.write(" for(int i=0; i<{}; ++i)\n".format(repeats)) + f.write(" {\n") + for line in knl.member.lines[1:]: + f.write(" {}\n".format(line)) + f.write(" }\n") + + # Stop the TSC timer and write the result to a file + f.writelines([" auto stop = Dune::PDELab::TSC::stop();\n", + " std::ofstream file;\n", + " file.open(argv[1]);\n", + " file << Dune::PDELab::TSC::elapsed(start, stop) / {} << std::endl;\n".format(str(float(repeats))), + " file.close();\n", + " accum += output[dis(rng)];\n", + " std::cout << accum;\n", + "}\n", + ]) def generate_standalone_kernel_code(kernel, signature, filename, transformations=None): # Turn off opcounting - opcounting = get_option("opcounter") - set_option("opcounter", False) - - # Remove opcounter from signature - p = re.compile('OpCounter::OpCounter<([^>]*)>') - assert len(signature) == 1 - sig = signature[0] - sig = p.sub(r'\1', sig) - assert 'OpCounter' not in signature - - # Which transformations were applied - codegen_transformations = '' - if transformations: + with option_context(opcounter=False): + # Remove opcounter from signature + p = re.compile('OpCounter::OpCounter<([^>]*)>') + assert len(signature) == 1 + sig = signature[0] + sig = p.sub(r'\1', sig) + assert 'OpCounter' not in signature + + # Which transformations were applied codegen_transformations = '' - for trafo in transformations: - codegen_transformations += '// {}\n'.format(trafo) - - template = 'kernel_benchmark_template1.cc.in' - use_datasets = True - - # Old benchmark template - # template = 'kernel_benchmark_template0.cc.in' - # use_datasets = False - - template_filename = pkg_resources.resource_filename(__name__, template) - with open(template_filename, 'r') as f: - benchmark = f.read() - - # Find function arguments and global arguments - arguments = sig[sig.find('(') + 1:sig.find(')')].split(',') - arguments = [a.split(' ')[-1] for a in arguments] - global_args = [a for a in kernel.args if a.name not in arguments] - buffer_arguments = [a for a in arguments if a.startswith('buff')] - input_arguments = [a for a in arguments if a not in buffer_arguments] - - # Declare global arguments - codegen_declare_global_arguments = '' - target = DuneTarget() - for g in global_args: - decl_info = g.decl_info(target, True, g.dtype) - for idi in decl_info: - ast_builder = target.get_device_ast_builder() - arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) - arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) - arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) - codegen_declare_global_arguments += '{}\n'.format(arg_decl) - codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ') - - # Helper function for argument initialization - def _initialize_arg(arg): - if isinstance(arg, lp.ValueArg): - return [] - real = type_floatingpoint() - size = reduce(mul, arg.shape) - fill_name = arg.name + '_fill' - lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), - ' for (std::size_t i=0; i<{}; ++i){{'.format(size), - ' {}[i] = unif(re);'.format(fill_name), - ' }'] - return lines - - # Initialize global arguments - codegen_initialize_global_arguments = '' - for arg in global_args: - lines = _initialize_arg(arg) - codegen_initialize_global_arguments += '\n'.join(lines) + '\n' - codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ') - - codegen_initialize_input = '' - - # Function we want to benchmark - codegen_benchmark_function = '' - codegen_benchmark_function += sig[0:sig.find(')') + 1] - codegen_benchmark_function += lp.generate_body(kernel) - codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ') - - # Declare function arguments - codegen_declare_arguments = [] - codegen_declare_input = [] - function_arguments = [a for a in kernel.args if a.name in arguments] - for arg in function_arguments: - if 'buffer' in arg.name: - byte_size = reduce(mul, arg.shape) * 8 - codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, - byte_size, - arg.alignment),) - elif isinstance(arg, lp.ValueArg): - assert 'jacobian_offset' in arg.name - decl = arg.get_arg_decl(ast_builder) - decl = Initializer(decl, 'unif_int(re)') - codegen_declare_arguments.append((' {}\n'.format(decl))) - else: - assert 'fastdg' in arg.name - size = reduce(mul, arg.shape) - min_stride = min([tag.stride for tag in arg.dim_tags]) - size *= min_stride - alignment = arg.dtype.itemsize + if transformations: + codegen_transformations = '' + for trafo in transformations: + codegen_transformations += '// {}\n'.format(trafo) + + template = 'kernel_benchmark_template1.cc.in' + use_datasets = True + + # Old benchmark template + # template = 'kernel_benchmark_template0.cc.in' + # use_datasets = False + + template_filename = pkg_resources.resource_filename(__name__, template) + with open(template_filename, 'r') as f: + benchmark = f.read() + + # Find function arguments and global arguments + arguments = sig[sig.find('(') + 1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in arguments] + global_args = [a for a in kernel.args if a.name not in arguments] + buffer_arguments = [a for a in arguments if a.startswith('buff')] + input_arguments = [a for a in arguments if a not in buffer_arguments] + + # Declare global arguments + codegen_declare_global_arguments = '' + target = DuneTarget() + for g in global_args: + decl_info = g.decl_info(target, True, g.dtype) + for idi in decl_info: + ast_builder = target.get_device_ast_builder() + arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) + arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) + arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) + codegen_declare_global_arguments += '{}\n'.format(arg_decl) + codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ') + + # Helper function for argument initialization + def _initialize_arg(arg): + if isinstance(arg, lp.ValueArg): + return [] real = type_floatingpoint() - if use_datasets: - codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real, - arg.name, - size, - alignment))) + size = reduce(mul, arg.shape) + fill_name = arg.name + '_fill' + lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), + ' for (std::size_t i=0; i<{}; ++i){{'.format(size), + ' {}[i] = unif(re);'.format(fill_name), + ' }'] + return lines + + # Initialize global arguments + codegen_initialize_global_arguments = '' + for arg in global_args: + lines = _initialize_arg(arg) + codegen_initialize_global_arguments += '\n'.join(lines) + '\n' + codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ') + + codegen_initialize_input = '' + + # Function we want to benchmark + codegen_benchmark_function = '' + codegen_benchmark_function += sig[0:sig.find(')') + 1] + codegen_benchmark_function += lp.generate_body(kernel) + codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ') + + # Declare function arguments + codegen_declare_arguments = [] + codegen_declare_input = [] + function_arguments = [a for a in kernel.args if a.name in arguments] + for arg in function_arguments: + if 'buffer' in arg.name: + byte_size = reduce(mul, arg.shape) * 8 + codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, + byte_size, + arg.alignment),) + elif isinstance(arg, lp.ValueArg): + assert 'jacobian_offset' in arg.name + decl = arg.get_arg_decl(ast_builder) + decl = Initializer(decl, 'unif_int(re)') + codegen_declare_arguments.append((' {}\n'.format(decl))) else: - codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real, - arg.name, - size, - alignment))) - - codegen_declare_arguments = ''.join(codegen_declare_arguments) - codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ') - codegen_declare_input = ''.join(codegen_declare_input) - codegen_declare_input = textwrap.indent(codegen_declare_input, ' ') - - # Initialize function arguments - codegen_initialize_arguments = '' - codegen_initialize_input = '' - for arg in function_arguments: - if 'fastdg' in arg.name: - if use_datasets: - lines = _initialize_arg(arg) - lines = [' ' + a for a in lines] - lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines] - lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){') - lines.append('}') - codegen_initialize_input += '\n'.join(lines) + '\n' + assert 'fastdg' in arg.name + size = reduce(mul, arg.shape) + min_stride = min([tag.stride for tag in arg.dim_tags]) + size *= min_stride + alignment = arg.dtype.itemsize + real = type_floatingpoint() + if use_datasets: + codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real, + arg.name, + size, + alignment))) + else: + codegen_declare_input.append(('{} {}[{}] __attribute__ ((aligned ({})));\n'.format(real, + arg.name, + size, + alignment))) + + codegen_declare_arguments = ''.join(codegen_declare_arguments) + codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ') + codegen_declare_input = ''.join(codegen_declare_input) + codegen_declare_input = textwrap.indent(codegen_declare_input, ' ') + + # Initialize function arguments + codegen_initialize_arguments = '' + codegen_initialize_input = '' + for arg in function_arguments: + if 'fastdg' in arg.name: + if use_datasets: + lines = _initialize_arg(arg) + lines = [' ' + a for a in lines] + lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines] + lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){') + lines.append('}') + codegen_initialize_input += '\n'.join(lines) + '\n' + else: + lines = _initialize_arg(arg) + codegen_initialize_arguments += '\n'.join(lines) + '\n' else: lines = _initialize_arg(arg) codegen_initialize_arguments += '\n'.join(lines) + '\n' + codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ') + codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ') + + # Call the benchmark function + if use_datasets: + arguments_with_datasets = arguments.copy() + arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments] + codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n' + codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n' + for arg in input_arguments: + codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg) + codegen_call_benchmark_function += '}' else: - lines = _initialize_arg(arg) - codegen_initialize_arguments += '\n'.join(lines) + '\n' - codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ') - codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ') - - # Call the benchmark function - if use_datasets: - arguments_with_datasets = arguments.copy() - arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments] - codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n' - codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n' - for arg in input_arguments: - codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg) - codegen_call_benchmark_function += '}' - else: - codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n' - codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ') - - # Replace placeholders in benchmark template - benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations) - benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments) - benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input) - benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function) - benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments) - benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments) - benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function) - - # Write benchmark source file - with open(filename, 'w') as f: - f.writelines(benchmark) - - # Maybe turn opcounting on again - set_option("opcounter", opcounting) + codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n' + codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ') + + # Replace placeholders in benchmark template + benchmark = benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations) + benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments) + benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input) + benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function) + benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments) + benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function) + + # Write benchmark source file + with open(filename, 'w') as f: + f.writelines(benchmark) def autotune_realization(sf=None, kernel=None, signature=None, transformations=None): diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py index 757d51870aa7ccf690b9362b9b0522a4892f792b..83e5206b5289719f3c4e85f4ca523976cb6feb79 100644 --- a/python/dune/codegen/sumfact/basis.py +++ b/python/dune/codegen/sumfact/basis.py @@ -31,9 +31,6 @@ from dune.codegen.sumfact.permutation import (permute_backward, sumfact_cost_permutation_strategy, sumfact_quadrature_permutation_strategy, ) -from dune.codegen.sumfact.switch import (get_facedir, - get_facemod, - ) from dune.codegen.pdelab.argument import name_coefficientcontainer, name_applycontainer from dune.codegen.pdelab.basis import GenericBasisMixin from dune.codegen.pdelab.geometry import (local_dimension, @@ -86,7 +83,7 @@ class SumfactBasisMixin(GenericBasisMixin): temporary_variable(name, shape=()) quad_inames = self.quadrature_inames() inames = self.lfs_inames(element, restriction) - facedir = get_facedir(restriction) + facedir = self.get_facedir(restriction) # Collect the pairs of lfs/quad inames that are in use # On facets, the normal direction of the facet is excluded @@ -106,7 +103,7 @@ class SumfactBasisMixin(GenericBasisMixin): # Add the missing direction on facedirs by evaluating at either 0 or 1 if facedir is not None: - facemod = get_facemod(restriction) + facemod = self.get_facemod(restriction) prod = prod + (prim.Call(PolynomialLookup(name_polynomials(element.degree()), False), (prim.Variable(inames[facedir]), facemod)),) @@ -141,7 +138,7 @@ class SumfactBasisMixin(GenericBasisMixin): temporary_variable(name, shape=()) quad_inames = self.quadrature_inames() inames = self.lfs_inames(element, restriction) - facedir = get_facedir(restriction) + facedir = self.get_facedir(restriction) # Map the direction to a quadrature iname quadinamemapping = {} @@ -161,7 +158,7 @@ class SumfactBasisMixin(GenericBasisMixin): prod.append(tab.pymbolic((prim.Variable(quadinamemapping[i]), prim.Variable(inames[i])))) if facedir is not None: - facemod = get_facemod(restriction) + facemod = self.get_facemod(restriction) prod.append(prim.Call(PolynomialLookup(name_polynomials(element.degree()), index == facedir), (prim.Variable(inames[facedir]), facemod)),) @@ -197,8 +194,8 @@ class SumfactBasisMixin(GenericBasisMixin): # Construct the matrix sequence for this sum factorization matrix_sequence = construct_basis_matrix_sequence(derivative=derivative, - facedir=get_facedir(restriction), - facemod=get_facemod(restriction), + facedir=self.get_facedir(restriction), + facemod=self.get_facemod(restriction), basis_size=basis_size) inp = LFSSumfactKernelInput(matrix_sequence=matrix_sequence, @@ -235,6 +232,28 @@ class SumfactBasisMixin(GenericBasisMixin): return prim.Subscript(var, vsf.quadrature_index(sf, self)) +@basis_mixin("sumfact_pointdiagonal") +class SumfactPointDiagonalBasisMixin(SumfactBasisMixin): + def lfs_inames(self, element, restriction, number=1): + return () + + def implement_basis(self, element, restriction, number): + info = self.current_info[number] + if element == info.element and restriction == info.restriction: + return 1 + else: + return 0 + + def implement_reference_gradient(self, element, restriction, number): + index, = self.indices + self.indices = None + info = self.current_info[number] + if element == info.element and restriction == info.restriction and index == info.grad_index: + return 1 + else: + return 0 + + class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): def __init__(self, matrix_sequence=None, diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py index 8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b..79ca2726bd3fab0dca0f7a21598ef13b9fe0dd0a 100644 --- a/python/dune/codegen/sumfact/geometry.py +++ b/python/dune/codegen/sumfact/geometry.py @@ -16,7 +16,8 @@ from dune.codegen.generation import (class_member, valuearg, ) from dune.codegen.loopy.flatten import flatten_index -from dune.codegen.options import get_option +from dune.codegen.loopy.target import type_floatingpoint +from dune.codegen.options import get_form_option, get_option from dune.codegen.pdelab.geometry import (enforce_boundary_restriction, local_dimension, world_dimension, @@ -28,7 +29,6 @@ from dune.codegen.pdelab.geometry import (enforce_boundary_restriction, ) from dune.codegen.pdelab.localoperator import (name_ansatz_gfs_constructor_param, lop_template_ansatz_gfs, - lop_template_range_field, ) from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.sumfact.accumulation import basis_sf_kernels, sumfact_iname @@ -39,10 +39,8 @@ from dune.codegen.sumfact.permutation import (permute_backward, sumfact_quadrature_permutation_strategy, ) from dune.codegen.sumfact.quadrature import additional_inames -from dune.codegen.sumfact.switch import get_facedir, get_facemod from dune.codegen.sumfact.symbolic import SumfactKernelInterfaceBase, SumfactKernel from dune.codegen.tools import get_pymbolic_basename, ImmutableCuttingRecord -from dune.codegen.options import get_form_option, option_switch from dune.codegen.ufl.modified_terminals import Restriction from loopy.match import Writes @@ -52,8 +50,13 @@ import numpy as np import loopy as lp +class SumfactGeometryMixinBase(GenericPDELabGeometryMixin): + def nonsumfact_fallback(self): + return None + + @geometry_mixin("sumfact_multilinear") -class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): +class SumfactMultiLinearGeometryMixin(SumfactGeometryMixinBase): def nonsumfact_fallback(self): return "generic" @@ -162,8 +165,8 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): def outer_normal(self): """ This is the *unnormalized* outer normal """ name = "outer_normal" - facedir_s = get_facedir(Restriction.POSITIVE) - facemod_s = get_facemod(Restriction.POSITIVE) + facedir_s = self.get_facedir(Restriction.POSITIVE) + facemod_s = self.get_facemod(Restriction.POSITIVE) temporary_variable(name, shape=(world_dimension(),)) for i in range(world_dimension()): @@ -210,8 +213,8 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): restriction = enforce_boundary_restriction(self) # Generate sum factorization kernel and add vectorization info - matrix_sequence = construct_basis_matrix_sequence(facedir=get_facedir(restriction), - facemod=get_facemod(restriction), + matrix_sequence = construct_basis_matrix_sequence(facedir=self.get_facedir(restriction), + facemod=self.get_facemod(restriction), basis_size=(2,) * world_dimension()) inp = GeoCornersInput(matrix_sequence=matrix_sequence, direction=self.indices[0], @@ -243,7 +246,7 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin): @geometry_mixin("sumfact_axiparallel") -class SumfactAxiParallelGeometryMixin(AxiparallelGeometryMixin): +class SumfactAxiParallelGeometryMixin(SumfactGeometryMixinBase, AxiparallelGeometryMixin): def nonsumfact_fallback(self): return "axiparallel" @@ -253,8 +256,8 @@ class SumfactAxiParallelGeometryMixin(AxiparallelGeometryMixin): assert isinstance(i, int) # Use facemod_s and facedir_s - if i == get_facedir(Restriction.POSITIVE): - if get_facemod(Restriction.POSITIVE): + if i == self.get_facedir(Restriction.POSITIVE): + if self.get_facemod(Restriction.POSITIVE): return 1 else: return -1 @@ -270,7 +273,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle def facet_jacobian_determinant(self, o): name = "fdetjac" self.define_facet_jacobian_determinant(name) - facedir = get_facedir(Restriction.POSITIVE) + facedir = self.get_facedir(Restriction.POSITIVE) globalarg(name, shape=(world_dimension(),)) return prim.Subscript(prim.Variable(name), (facedir,)) @@ -283,7 +286,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle @kernel_cached(kernel="operator") def _define_facet_jacobian_determinant_eval(self, name): gfs = name_ansatz_gfs_constructor_param() - rft = lop_template_range_field() + rft = type_floatingpoint() code = ["{", " auto e = *({}.gridView().template begin<0>());".format(gfs), " int dir=0;", @@ -304,8 +307,7 @@ class SumfactEqudistantGeometryMixin(EquidistantGeometryMixin, SumfactAxiParalle restriction = Restriction.NONE if self.measure == "interior_facet": restriction = Restriction.POSITIVE - from dune.codegen.sumfact.switch import get_facedir - face = get_facedir(restriction) + face = self.get_facedir(restriction) lowcorner = name_lowerleft_corner() meshwidth = name_meshwidth() @@ -483,8 +485,7 @@ def define_corner(name, low): @class_member(classtag="operator") def define_mesh_width(name): - from dune.codegen.pdelab.localoperator import lop_template_range_field - rft = lop_template_range_field() + rft = type_floatingpoint() define_mesh_width_eval(name) return "Dune::FieldVector<{}, {}> {};".format(rft, world_dimension(), name) @@ -528,8 +529,8 @@ def _name_jacobian(i, j, restriction, visitor): """ # Create matrix sequence with derivative in j direction matrix_sequence = construct_basis_matrix_sequence(derivative=j, - facedir=get_facedir(restriction), - facemod=get_facemod(restriction), + facedir=visitor.get_facedir(restriction), + facemod=visitor.get_facemod(restriction), basis_size=(2,) * world_dimension()) # Sum factorization input for the i'th component of the geometry mapping diff --git a/python/dune/codegen/sumfact/permutation.py b/python/dune/codegen/sumfact/permutation.py index 7f37dfeae795031ec81b292b96e65649f8fc78cc..916f7773592191a7dcc0731e56b93524daf4528e 100644 --- a/python/dune/codegen/sumfact/permutation.py +++ b/python/dune/codegen/sumfact/permutation.py @@ -3,7 +3,7 @@ import itertools from dune.codegen.options import get_option -from dune.codegen.sumfact.switch import get_facedir, get_facemod +from dune.codegen.sumfact.tabulation import quadrature_points_per_direction from dune.codegen.ufl.modified_terminals import Restriction @@ -125,8 +125,9 @@ def sumfact_quadrature_permutation_strategy(dim, restriction): # all others can be derived by rotating the cube and matching edge # directions. def _order_on_self(restriction): - facedir = get_facedir(restriction) - facemod = get_facemod(restriction) + from dune.codegen.sumfact.accumulation import SumfactAccumulationMixin + facedir = SumfactAccumulationMixin.get_facedir(None, restriction) + facemod = SumfactAccumulationMixin.get_facemod(None, restriction) quadrature_order = { (0, 0): (0, 1, 2), diff --git a/python/dune/codegen/sumfact/quadrature.py b/python/dune/codegen/sumfact/quadrature.py index 91e99c4c3c0cd333d6e64353ed06ac0c129a7e05..ae387426ebda79a8435811c548e91761f2de1b3c 100644 --- a/python/dune/codegen/sumfact/quadrature.py +++ b/python/dune/codegen/sumfact/quadrature.py @@ -10,7 +10,6 @@ from dune.codegen.generation import (domain, quadrature_mixin, temporary_variable, ) -from dune.codegen.sumfact.switch import get_facedir from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction, local_quadrature_points_per_direction, name_oned_quadrature_points, @@ -22,7 +21,6 @@ from dune.codegen.pdelab.geometry import (local_dimension, ) from dune.codegen.pdelab.quadrature import GenericQuadratureMixin from dune.codegen.options import get_form_option -from dune.codegen.sumfact.switch import get_facedir from dune.codegen.loopy.target import dtype_floatingpoint from loopy import CallMangleInfo diff --git a/python/dune/codegen/sumfact/switch.py b/python/dune/codegen/sumfact/switch.py index a031790dd5321841eb8c100195cde675651fe1d3..6b8c43e4dac7bfd08f7bbbfb6a551b47fd02a499 100644 --- a/python/dune/codegen/sumfact/switch.py +++ b/python/dune/codegen/sumfact/switch.py @@ -12,7 +12,7 @@ from dune.codegen.pdelab.signatures import (assembly_routine_args, assembly_routine_signature, kernel_name, ) -from dune.codegen.options import get_form_option, get_option, set_form_option +from dune.codegen.options import get_form_option, get_option, form_option_context from dune.codegen.cgen.clazz import ClassMember @@ -26,21 +26,12 @@ def sumfact_generate_kernels_per_integral(integrals): if measure == "exterior_facet": # Maybe skip sum factorization on boundary integrals if not get_form_option("sumfact_on_boundary"): - set_form_option("sumfact", False) - - # Try to find a fallback for sum factorized geometry mixins - geometry_backup = get_form_option("geometry_mixins") - mixin = construct_from_mixins(mixins=[geometry_backup])() - if hasattr(mixin, "nonsumfact_fallback"): - set_form_option("geometry_mixins", mixin.nonsumfact_fallback()) - - for k in generate_kernels_per_integral(integrals): - yield k - - # Reset state - set_form_option("geometry_mixins", geometry_backup) - set_form_option("sumfact", True) - return + mixin = construct_from_mixins(mixins=[get_form_option("geometry_mixins")])() + geometry = mixin.nonsumfact_fallback() or get_form_option("geometry_mixins") + with form_option_context(sumfact=False, geometry_mixins=geometry): + for k in generate_kernels_per_integral(integrals): + yield k + return # Generate all necessary kernels for facedir in range(dim): @@ -169,25 +160,3 @@ def generate_interior_facet_switch(): block.append("}") return ClassMember(signature + block) - - -def get_facedir(restriction): - from dune.codegen.pdelab.restriction import Restriction - if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet": - return get_global_context_value("facedir_s") - if restriction == Restriction.NEGATIVE: - return get_global_context_value("facedir_n") - if restriction == Restriction.NONE: - return None - assert False - - -def get_facemod(restriction): - from dune.codegen.pdelab.restriction import Restriction - if restriction == Restriction.POSITIVE or get_global_context_value("integral_type") == "exterior_facet": - return get_global_context_value("facemod_s") - if restriction == Restriction.NEGATIVE: - return get_global_context_value("facemod_n") - if restriction == Restriction.NONE: - return None - assert False diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py index dfd9383f93c79d3c50bfcca24cdd717fcc16aa58..8fdd1dd8d23ddca5745be5acb4e02837478d4c9d 100644 --- a/python/dune/codegen/sumfact/symbolic.py +++ b/python/dune/codegen/sumfact/symbolic.py @@ -14,7 +14,7 @@ from dune.codegen.sumfact.permutation import (flop_cost, sumfact_cost_permutation_strategy, sumfact_quadrature_permutation_strategy, ) -from dune.codegen.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray +from dune.codegen.sumfact.tabulation import BasisTabulationMatrixBase, BasisTabulationMatrixArray, quadrature_points_per_direction from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint from dune.codegen.loopy.vcl import ExplicitVCLCast, VCLLowerUpperLoad from dune.codegen.tools import get_leaf, maybe_wrap_subscript, remove_duplicates @@ -562,6 +562,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): ImmutableRecord.__init__(self, **defaultdict) prim.Variable.__init__(self, "SUMFACT") + # Precompute and cache a number of keys + self._cached_cache_key = None + self._cached_flop_cost = {} + # # The methods/fields needed to get a well-formed pymbolic node # @@ -617,12 +621,15 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): Any two sum factorization kernels having the same cache_key are realized simultaneously! """ - if self.buffer is None: - # During dry run, we return something unique to this kernel - return repr(self) - else: - # Later we identify parallely implemented kernels by the assigned buffer - return self.buffer + if self._cached_cache_key is None: + if self.buffer is None: + # During dry run, we return something unique to this kernel + self._cached_cache_key = repr(self) + else: + # Later we identify parallely implemented kernels by the assigned buffer + self._cached_cache_key = self.buffer + + return self._cached_cache_key @property def inout_key(self): @@ -818,7 +825,10 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def operations(self): """ The total number of floating point operations for the kernel to be carried out """ - return flop_cost(self.matrix_sequence_cost_permuted) + qp = quadrature_points_per_direction() + if qp not in self._cached_flop_cost: + self._cached_flop_cost[qp] = flop_cost(self.matrix_sequence_cost_permuted) + return self._cached_flop_cost[qp] # Extract the argument list and store it on the class. This needs to be done @@ -865,6 +875,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) prim.Variable.__init__(self, "VecSUMFAC") + # Precompute and cache a number of keys + self._cached_cache_key = None + self._cached_flop_cost = {} + def __getinitargs__(self): return (self.kernels, self.horizontal_width, self.vertical_width, self.buffer, self.insn_dep) @@ -897,7 +911,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) Any two sum factorization kernels having the same cache_key are realized simulatenously! """ - return (self.matrix_sequence_quadrature_permuted, self.restriction, self.stage, self.buffer) + if self._cached_cache_key is None: + self._cached_cache_key = (self.matrix_sequence_quadrature_permuted, self.restriction, self.stage, self.buffer) + + return self._cached_cache_key # # Deduce all data fields of normal sum factorization kernels from the underlying kernels @@ -1123,4 +1140,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) def operations(self): """ The total number of floating point operations for the kernel to be carried out """ - return flop_cost(self.matrix_sequence_cost_permuted) + qp = quadrature_points_per_direction() + if qp not in self._cached_flop_cost: + self._cached_flop_cost[qp] = flop_cost(self.matrix_sequence_cost_permuted) + return self._cached_flop_cost[qp] diff --git a/python/dune/codegen/sumfact/tabulation.py b/python/dune/codegen/sumfact/tabulation.py index 9def97eb3ba4fdae280cc65e7f12ca73164d1146..0170c59463eed570fcce679636899e0d633d32fe 100644 --- a/python/dune/codegen/sumfact/tabulation.py +++ b/python/dune/codegen/sumfact/tabulation.py @@ -18,12 +18,10 @@ from dune.codegen.generation import (class_member, transform, valuearg ) -from dune.codegen.loopy.target import dtype_floatingpoint +from dune.codegen.loopy.target import dtype_floatingpoint, type_floatingpoint from dune.codegen.loopy.vcl import ExplicitVCLCast, get_vcl_type_size from dune.codegen.options import get_option -from dune.codegen.pdelab.localoperator import (name_domain_field, - lop_template_range_field, - ) +from dune.codegen.pdelab.localoperator import name_domain_field from dune.codegen.pdelab.quadrature import quadrature_order from dune.codegen.tools import maybe_wrap_subscript, ceildiv from loopy import CallMangleInfo @@ -50,6 +48,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction=None, slice_size=None, slice_index=None, + additional_tabulation=None, ): """ Arguments: @@ -61,6 +60,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction: Direction corresponding to this matrix slice_size: Number of slices for this direction slice_index: To which slice does this belong + additional_tabulation: A factor to be multiplied with this basis tabulation matrix. """ assert(isinstance(basis_size, int)) ImmutableRecord.__init__(self, @@ -71,6 +71,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction=direction, slice_size=slice_size, slice_index=slice_index, + additional_tabulation=additional_tabulation, ) @property @@ -90,6 +91,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): if self.slice_size is not None: infos.append("s{}".format(self.slice_index)) + if self.additional_tabulation is not None: + infos.append("prod{}".format(self.additional_tabulation._shortname)) + return "".join(infos) def __str__(self): @@ -122,7 +126,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): def pymbolic(self, indices): name = str(self) define_theta(name, self) - return prim.Subscript(prim.Variable(name), indices) + ret = prim.Subscript(prim.Variable(name), indices) + + return ret @property def vectorized(self): @@ -337,7 +343,7 @@ def name_oned_quadrature_points(bound): @class_member(classtag="operator") def typedef_polynomials(name, degree): - range_field = lop_template_range_field() + range_field = type_floatingpoint() domain_field = name_domain_field() include_file("dune/pdelab/finiteelementmap/qkdg.hh", filetag="operatorfile") @@ -372,7 +378,7 @@ def name_polynomials(degree): def sort_quadrature_points_weights(qp, qw, bound): - range_field = lop_template_range_field() + range_field = type_floatingpoint() domain_field = name_domain_field() include_file("dune/codegen/sumfact/onedquadrature.hh", filetag="operatorfile") return frozenset({instruction(code="onedQuadraturePointsWeights<{}, {}, {}>({}, {});" @@ -461,22 +467,31 @@ def define_theta(name, tabmat, additional_indices=(), width=None): else: args.append(tabmat.face) + # Get right hand side of basis evaluation matrix assignment + expr = prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)) + + # Maybe multiply another matrix (needed for the very special case of assembling point diagonals) + if tabmat.additional_tabulation is not None: + expr = prim.Product((expr, prim.Call(PolynomialLookup(polynomials, tabmat.additional_tabulation.derivative), tuple(args)))) + instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices), - expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)), + expression=expr, kernel="operator", depends_on=dep, ) -def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None): +def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None, additional_sequence=None): dim = world_dimension() result = [None] * dim + if additional_sequence is None: + additional_sequence = [None] * dim quadrature_size = quadrature_points_per_direction() assert (basis_size is not None) if facedir is not None: quadrature_size = quadrature_size[:facedir] + (1,) + quadrature_size[facedir:] - for i in range(dim): + for i, add_seq in zip(range(dim), additional_sequence): onface = None if facedir == i: onface = facemod @@ -485,6 +500,7 @@ def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=No basis_size=basis_size[i], transpose=transpose, derivative=derivative == i, - face=onface) + face=onface, + additional_tabulation=add_seq) return tuple(result) diff --git a/python/dune/codegen/sumfact/vectorization.py b/python/dune/codegen/sumfact/vectorization.py index e753652b10b3ac9b1765ee071bce1ccebd15f6b1..8da146281d6d5323303d215f42117ff5f011da37 100644 --- a/python/dune/codegen/sumfact/vectorization.py +++ b/python/dune/codegen/sumfact/vectorization.py @@ -20,7 +20,7 @@ from dune.codegen.sumfact.tabulation import (quadrature_points_per_direction, set_quadrature_points, ) from dune.codegen.error import CodegenVectorizationError -from dune.codegen.options import get_form_option, get_option, set_form_option +from dune.codegen.options import get_form_option, get_option, form_option_context from dune.codegen.tools import add_to_frozendict, round_to_multiple, list_diff from pymbolic.mapper.flop_counter import FlopCounter @@ -331,17 +331,16 @@ def level1_optimal_vectorization_strategy(sumfacts, width): # If we are using the 'target' strategy, we might want to log some information. if get_form_option("vectorization_strategy") == "target": # Print the achieved cost and the target cost on the screen - set_form_option("vectorization_strategy", "model") - target = float(get_form_option("vectorization_target")) - qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) - cost = strategy_cost((qp, optimal_strategies[qp])) - - print("The target cost was: {}".format(target)) - print("The achieved cost was: {}".format(cost)) - optimum = level1_optimal_vectorization_strategy(sumfacts, width) - print("The optimal cost would be: {}".format(strategy_cost(optimum))) - set_form_option("vectorization_strategy", "target") - print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) + with form_option_context(vectorization_strategy="model"): + target = float(get_form_option("vectorization_target")) + qp = min(optimal_strategies, key=lambda qp: abs(strategy_cost((qp, optimal_strategies[qp])) - target)) + cost = strategy_cost((qp, optimal_strategies[qp])) + + print("The target cost was: {}".format(target)) + print("The achieved cost was: {}".format(cost)) + optimum = level1_optimal_vectorization_strategy(sumfacts, width) + print("The optimal cost would be: {}".format(strategy_cost(optimum))) + print("The score in 'target' logic was: {}".format(strategy_cost((qp, optimal_strategies[qp])))) # Print the employed vectorization strategy into a file suffix = "" diff --git a/python/dune/codegen/ufl/transformations/blockpreconditioner.py b/python/dune/codegen/ufl/transformations/blockpreconditioner.py index c992af12e0cae3499f6f6694027610dfd53c535e..b3fd11249e766bdc8abec4c0d01defbf5de9e59c 100644 --- a/python/dune/codegen/ufl/transformations/blockpreconditioner.py +++ b/python/dune/codegen/ufl/transformations/blockpreconditioner.py @@ -21,7 +21,7 @@ class OffDiagonalBlockSwitcher(MultiFunction): def positive_restricted(self, o): self.res = Restriction.POSITIVE ret = self(o.ufl_operands[0]) - self.rest = Restriction.NONE + self.res = Restriction.NONE if isinstance(ret, uc.Zero): return ret else: @@ -55,6 +55,8 @@ class OffDiagonalBlockSwitcher(MultiFunction): def list_restriction_tuples(diagonal): if diagonal: yield (Restriction.NONE, Restriction.NONE) + yield (Restriction.POSITIVE, Restriction.POSITIVE) + return res = (Restriction.POSITIVE, Restriction.NEGATIVE) amount = 1 if diagonal else 2 diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644 --- a/python/dune/codegen/ufl/visitor.py +++ b/python/dune/codegen/ufl/visitor.py @@ -37,6 +37,7 @@ from ufl.classes import (Coefficient, JacobianDeterminant, ) +from pytools import product as ptproduct import pymbolic.primitives as prim import numpy as np @@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): # def product(self, o): - return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands)) + ops = tuple(self.call(op) for op in o.ufl_operands) + if all(isinstance(op, (int, float)) for op in ops): + return ptproduct(ops) + return prim.flattened_product(ops) def float_value(self, o): return o.value() @@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1])) def sum(self, o): - return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands)) + ops = tuple(self.call(op) for op in o.ufl_operands) + if all(isinstance(op, (int, float)) for op in ops): + return sum(ops) + return prim.flattened_sum(ops) def zero(self, o): # UFL has Zeroes with shape. We ignore those indices. diff --git a/python/setup.py b/python/setup.py index 2edf70aaf5f063a278f06b5cfe7c91ac145ed857..9692259428fc392fda99403d44396d3d420ea1d0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,6 +29,7 @@ setup(name='dune.codegen', description='Performance optimizing form compiler for the Dune project', author='Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>', url='https://gitlab.dune-project.org/dominic/dune-codegen.git', + python_requires='>=3', packages=['dune.codegen', 'dune.codegen.blockstructured', 'dune.codegen.cgen', @@ -47,5 +48,6 @@ setup(name='dune.codegen', "console_scripts": [ "generate_operators = dune.codegen.compile:entry_generate_operators", "generate_driver = dune.codegen.compile:entry_generate_driver", + "show_options = dune.codegen.options:show_options", ] }) diff --git a/test/adjoint/poisson_mc_driver.hh b/test/adjoint/poisson_mc_driver.hh index 5bd222b800baf36c09bd172c48d706a135912c80..a3ac578afc8d9a2039dd9437ef48fe971a37fcb6 100644 --- a/test/adjoint/poisson_mc_driver.hh +++ b/test/adjoint/poisson_mc_driver.hh @@ -60,7 +60,7 @@ bool driver(int argc, char** argv){ Dune::PDELab::constraints(p1_bctype, p1_dirichlet_gfs_, p1_dirichlet_gfs__cc); // Set up grid grid operators... - using LOP_R = ROperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType>; + using LOP_R = ROperator<P1_dirichlet_GFS, P1_dirichlet_GFS>; using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; using GO_r = Dune::PDELab::GridOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, P1_dirichlet_GFS_CC, P1_dirichlet_GFS_CC>; LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree); @@ -122,7 +122,7 @@ bool driver(int argc, char** argv){ GF_X x_gf(p1_dirichlet_gfs_, x_r); // Local operator for adjoint problem - using LOP_Adjoint = RAdjointOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_X>; + using LOP_Adjoint = RAdjointOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF_X>; LOP_Adjoint lop_adjoint(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, x_gf); // Grid operator for adjoint problem @@ -156,7 +156,7 @@ bool driver(int argc, char** argv){ DJDM dJdm(7,0.0); // Local operator for control problem - using LOP_Control = RControlOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF_Adjoint, DJDM>; + using LOP_Control = RControlOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF_Adjoint, DJDM>; LOP_Control lop_control(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, gf_adjoint, dJdm); // Grid operator for control problem diff --git a/test/coeffeval/coeffeval_poisson.cc b/test/coeffeval/coeffeval_poisson.cc index 4f9e8861b5139dacb8183327ad053b3cb2373b8b..f0d645ffb5a4521f80c82684de736412d49f737e 100644 --- a/test/coeffeval/coeffeval_poisson.cc +++ b/test/coeffeval/coeffeval_poisson.cc @@ -74,7 +74,7 @@ int main(int argc, char** argv) GF c_gf(p2_gfs, c); // Local Operator - using LOP_R = PoissonLocalOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, RangeType, GF>; + using LOP_R = PoissonLocalOperator<P1_dirichlet_GFS, P1_dirichlet_GFS, GF>; LOP_R lop_r(p1_dirichlet_gfs_, p1_dirichlet_gfs_, initree, c_gf); // Constraints stuff diff --git a/test/poisson/poisson_tensor.ufl b/test/poisson/poisson_tensor.ufl index b527d05258667dae629f608a1a630e5f11f947b8..3591d4d70e17f8811cf32bb759212c7c33e0bb24 100644 --- a/test/poisson/poisson_tensor.ufl +++ b/test/poisson/poisson_tensor.ufl @@ -12,6 +12,9 @@ V = FiniteElement("CG", cell, 1) u = TrialFunction(V) v = TestFunction(V) +# Test metadata setting of options +dx = dx(metadata={"quadrature_order": 27}) + r= (inner(A*grad(u), grad(v)) + c*u*v -f*v)*dx exact_solution = g is_dirichlet = 1 diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt index 187aee212ed125a99bc82670fe34e680379542f7..61b81b76082cfa3c83b04d2632194d5a4a6e6447 100644 --- a/test/sumfact/CMakeLists.txt +++ b/test/sumfact/CMakeLists.txt @@ -2,3 +2,4 @@ add_subdirectory(hyperbolic) add_subdirectory(mass) add_subdirectory(poisson) add_subdirectory(stokes) +add_subdirectory(preconditioner) diff --git a/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc index 2f8cc6c8614e0cdb954a52fa5de808dbff466b07..d9fa57bf0d692186a5761667508f9e7c1b645715 100644 --- a/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc +++ b/test/sumfact/poisson/facedir-facemod-variation/poisson_dg_3d_facedir_facemod_variation_driver.cc @@ -109,7 +109,7 @@ int main(int argc, char** argv){ Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc); // Set up grid grid operators... - using LOP_R = rOperator<DG1_GFS, DG1_GFS, RangeType>; + using LOP_R = rOperator<DG1_GFS, DG1_GFS>; using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; using GO_r = Dune::PDELab::GridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>; LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree); diff --git a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc index 4b57deaa50bc6e435b40dec8060d55e927797b2a..cc4c1d01d3d6d5bb67f123e98696e7945002dc45 100644 --- a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc +++ b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_dg_3d_facedir_facemod_variation_driver.cc @@ -251,7 +251,7 @@ int main(int argc, char** argv){ Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc); // Set up grid grid operators... - using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS, RangeType>; + using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS>; using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; using GO_r = Dune::PDELab::GridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>; LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree); diff --git a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc index 90a5c381b538f8c41403161f96ba78a960fe491b..ad75cfc4ed894a45879fd2e078f01fa53aff3b09 100644 --- a/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc +++ b/test/sumfact/poisson/facedir-facemod-variation/sumfact_poisson_fastdg_3d_facedir_facemod_variation_driver.cc @@ -225,7 +225,7 @@ int main(int argc, char** argv){ Dune::PDELab::constraints(dg1_gfs_, dg1_gfs__cc); // Set up grid grid operators... - using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS, RangeType>; + using LOP_R = CLASSNAME<DG1_GFS, DG1_GFS>; using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; using GO_r = Dune::PDELab::FastDGGridOperator<DG1_GFS, DG1_GFS, LOP_R, MatrixBackend, DF, RangeType, RangeType, DG1_GFS_CC, DG1_GFS_CC>; LOP_R lop_r(dg1_gfs_, dg1_gfs_, initree); diff --git a/test/sumfact/preconditioner/CMakeLists.txt b/test/sumfact/preconditioner/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea77ba5dca71ce5afdff567be1f872dc0dc604e0 --- /dev/null +++ b/test/sumfact/preconditioner/CMakeLists.txt @@ -0,0 +1,11 @@ +dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl + BASENAME sumfact_preconditioner_2d + INIFILE preconditioner_2d.mini + SOURCE test_preconditioner_2d.cc + ) + +dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl + BASENAME sumfact_preconditioner_3d + INIFILE preconditioner_3d.mini + SOURCE test_preconditioner_3d.cc + ) diff --git a/test/sumfact/preconditioner/poisson_dg_2d.ufl b/test/sumfact/preconditioner/poisson_dg_2d.ufl new file mode 100644 index 0000000000000000000000000000000000000000..ff41f6164df46aa1909db03768aef4041011afae --- /dev/null +++ b/test/sumfact/preconditioner/poisson_dg_2d.ufl @@ -0,0 +1,38 @@ +cell = "quadrilateral" +dim = 2 +degree = 1 + +x = SpatialCoordinate(cell) +c = (0.5-x[0])**2 + (0.5-x[1])**2 +g = exp(-1.*c) +f = 2*(2.-2*c)*g + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +# penalty factor +alpha = 1.0 +h_ext = CellVolume(cell) / FacetArea(cell) +gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0 +theta = 1.0 + +r = inner(grad(u), grad(v))*dx \ + - f*v*dx \ + - inner(n, avg(grad(u)))*jump(v)*dS \ + + gamma_int*jump(u)*jump(v)*dS \ + + theta*jump(u)*inner(avg(grad(v)), n)*dS \ + - inner(n, grad(u))*v*ds \ + + gamma_ext*u*v*ds \ + + theta*u*inner(grad(v), n)*ds \ + - gamma_ext*g*v*ds \ + - theta*g*inner(grad(v), n)*ds + +exact_solution = g diff --git a/test/sumfact/preconditioner/poisson_dg_3d.ufl b/test/sumfact/preconditioner/poisson_dg_3d.ufl new file mode 100644 index 0000000000000000000000000000000000000000..cc6594573762ee12e1b11206eeba07a4bd038726 --- /dev/null +++ b/test/sumfact/preconditioner/poisson_dg_3d.ufl @@ -0,0 +1,38 @@ +cell = hexahedron +dim = 3 +degree = 1 + +x = SpatialCoordinate(cell) +c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2 +g = exp(-1.*c) +f = 2*(3.-2*c)*g + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +# penalty factor +alpha = 1.0 +h_ext = CellVolume(cell) / FacetArea(cell) +gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0 +theta = 1.0 + +r = inner(grad(u), grad(v))*dx \ + - f*v*dx \ + - inner(n, avg(grad(u)))*jump(v)*dS \ + + gamma_int*jump(u)*jump(v)*dS \ + + theta*jump(u)*inner(avg(grad(v)), n)*dS \ + - inner(n, grad(u))*v*ds \ + + gamma_ext*u*v*ds \ + + theta*u*inner(grad(v), n)*ds \ + - gamma_ext*g*v*ds \ + - theta*g*inner(grad(v), n)*ds + +exact_solution = g diff --git a/test/sumfact/preconditioner/preconditioner_2d.mini b/test/sumfact/preconditioner/preconditioner_2d.mini new file mode 100644 index 0000000000000000000000000000000000000000..adcf214dde7018a9c5aa093cff89c51cf67d6a3a --- /dev/null +++ b/test/sumfact/preconditioner/preconditioner_2d.mini @@ -0,0 +1,46 @@ +__name = preconditioner_2d +__exec_suffix = exec + +cells = 2 2 +extension = 1. 1. + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +operators = r, blockdiag, blockoffdiag, pointdiag + +[formcompiler.r] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +classname = FullOperator +filename = full_2d_operator.hh + +[formcompiler.blockdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_diagonal = 1 +form = r +classname = BlockDiagonalOperator +filename = block_diagonal_2d_operator.hh + +[formcompiler.blockoffdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_offdiagonal = 1 +form = r +classname = BlockOffDiagonalOperator +filename = block_offdiagonal_2d_operator.hh + +[formcompiler.pointdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_pointdiagonal = 1 +form = r +classname = PointDiagonalOperator +filename = point_diagonal_2d_operator.hh diff --git a/test/sumfact/preconditioner/preconditioner_3d.mini b/test/sumfact/preconditioner/preconditioner_3d.mini new file mode 100644 index 0000000000000000000000000000000000000000..304d6e28f0e6f152d6327929984c0121dd55e978 --- /dev/null +++ b/test/sumfact/preconditioner/preconditioner_3d.mini @@ -0,0 +1,46 @@ +__name = preconditioner_3d +__exec_suffix = exec + +cells = 2 2 2 +extension = 1. 1. 1. + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +operators = r, blockdiag, blockoffdiag, pointdiag + +[formcompiler.r] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +classname = FullOperator +filename = full_3d_operator.hh + +[formcompiler.blockdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_diagonal = 1 +form = r +classname = BlockDiagonalOperator +filename = block_diagonal_3d_operator.hh + +[formcompiler.blockoffdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_offdiagonal = 1 +form = r +classname = BlockOffDiagonalOperator +filename = block_offdiagonal_3d_operator.hh + +[formcompiler.pointdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_pointdiagonal = 1 +form = r +classname = PointDiagonalOperator +filename = point_diagonal_3d_operator.hh diff --git a/test/sumfact/preconditioner/test_preconditioner_2d.cc b/test/sumfact/preconditioner/test_preconditioner_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e8fcfc117288feba4d5bd76abb5ca548e18f649 --- /dev/null +++ b/test/sumfact/preconditioner/test_preconditioner_2d.cc @@ -0,0 +1,129 @@ +#include "config.h" + +#include "dune/common/parallel/mpihelper.hh" +#include "dune/pdelab/stationary/linearproblem.hh" +#include "dune/pdelab/backend/istl.hh" +#include "dune/grid/yaspgrid.hh" +#include "dune/pdelab/finiteelementmap/qkdg.hh" +#include "dune/pdelab/gridoperator/fastdg.hh" +#include "dune/testtools/gridconstruction.hh" +#include "dune/common/parametertree.hh" +#include "dune/common/parametertreeparser.hh" +#include <random> +#include "dune/pdelab/gridfunctionspace/vtk.hh" +#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh" +#include "string" +#include "dune/codegen/vtkpredicate.hh" + +// Include all the generated operators +#include "full_2d_operator.hh" +#include "block_diagonal_2d_operator.hh" +#include "block_offdiagonal_2d_operator.hh" +#include "point_diagonal_2d_operator.hh" + +int main(int argc, char** argv){ + try + { + // Initialize basic stuff... + Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv); + using RangeType = double; + Dune::ParameterTree initree; + Dune::ParameterTreeParser::readINITree(argv[1], initree); + + // Setup grid (view)... + using Grid = Dune::YaspGrid<2, Dune::EquidistantCoordinates<RangeType, 2>>; + using GV = Grid::LeafGridView; + using DF = Grid::ctype; + IniGridFactory<Grid> factory(initree); + std::shared_ptr<Grid> grid = factory.getGrid(); + GV gv = grid->leafGridView(); + + // Set up finite element maps... + using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 2>; + DG2_FEM dg2_fem; + + // Set up grid function spaces... + using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>; + using NoConstraintsAssembler = Dune::PDELab::NoConstraints; + using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>; + DG2_GFS dg2_gfs_(gv, dg2_fem); + dg2_gfs_.name("dg2_gfs_"); + + // Set up constraints container... + using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type; + DG2_GFS_CC dg2_gfs__cc; + dg2_gfs__cc.clear(); + Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc); + + // Set up grid grid operators... + using FullLOP = FullOperator<DG2_GFS, DG2_GFS>; + using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; + using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree); + dg2_gfs_.update(); + MatrixBackend mb(5); + FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb); + + // Additional grid operators for preconditioner + using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS>; + using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree); + BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb); + + using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS>; + using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree); + BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb); + + using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS>; + using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree); + PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb); + + // Set up solution vectors... + using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>; + V_R x(dg2_gfs_, 0.0); + + // Testing! + + // Assemble all those matrices + using Dune::PDELab::Backend::native; + using M = typename FullGO::Traits::Jacobian; + M m(fullgo); + fullgo.jacobian(x, m); + Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1); + + using BDM = typename BDGO::Traits::Jacobian; + BDM bdm(bdgo); + bdgo.jacobian(x, bdm); + Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1); + + using BODM = typename BODGO::Traits::Jacobian; + BODM bodm(bodgo); + bodgo.jacobian(x, bodm); + Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1); + + V_R pd(dg2_gfs_, 0.0); + pdgo.residual(x, pd); + Dune::printvector(std::cout, native(pd), "point diagonal vector", "row"); + + // test failure boolean + bool testfail(false); + + // TODO: Properly test this stuff given the above matrices. + // Right now, visuals need to suffice. + + // Return statement... + return testfail; + + } + catch (Dune::Exception& e) + { std::cerr << "Dune reported error: " << e << std::endl; + return 1; + } + catch (std::exception& e) + { std::cerr << "Unknown exception thrown!" << std::endl; + return 1; + } +} + diff --git a/test/sumfact/preconditioner/test_preconditioner_3d.cc b/test/sumfact/preconditioner/test_preconditioner_3d.cc new file mode 100644 index 0000000000000000000000000000000000000000..90ea7aee7def84e9fc2395a798ace1bc39018f7d --- /dev/null +++ b/test/sumfact/preconditioner/test_preconditioner_3d.cc @@ -0,0 +1,129 @@ +#include "config.h" + +#include "dune/common/parallel/mpihelper.hh" +#include "dune/pdelab/stationary/linearproblem.hh" +#include "dune/pdelab/backend/istl.hh" +#include "dune/grid/yaspgrid.hh" +#include "dune/pdelab/finiteelementmap/qkdg.hh" +#include "dune/pdelab/gridoperator/fastdg.hh" +#include "dune/testtools/gridconstruction.hh" +#include "dune/common/parametertree.hh" +#include "dune/common/parametertreeparser.hh" +#include <random> +#include "dune/pdelab/gridfunctionspace/vtk.hh" +#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh" +#include "string" +#include "dune/codegen/vtkpredicate.hh" + +// Include all the generated operators +#include "full_3d_operator.hh" +#include "block_diagonal_3d_operator.hh" +#include "block_offdiagonal_3d_operator.hh" +#include "point_diagonal_3d_operator.hh" + +int main(int argc, char** argv){ + try + { + // Initialize basic stuff... + Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv); + using RangeType = double; + Dune::ParameterTree initree; + Dune::ParameterTreeParser::readINITree(argv[1], initree); + + // Setup grid (view)... + using Grid = Dune::YaspGrid<3, Dune::EquidistantCoordinates<RangeType, 3>>; + using GV = Grid::LeafGridView; + using DF = Grid::ctype; + IniGridFactory<Grid> factory(initree); + std::shared_ptr<Grid> grid = factory.getGrid(); + GV gv = grid->leafGridView(); + + // Set up finite element maps... + using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 3>; + DG2_FEM dg2_fem; + + // Set up grid function spaces... + using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>; + using NoConstraintsAssembler = Dune::PDELab::NoConstraints; + using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>; + DG2_GFS dg2_gfs_(gv, dg2_fem); + dg2_gfs_.name("dg2_gfs_"); + + // Set up constraints container... + using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type; + DG2_GFS_CC dg2_gfs__cc; + dg2_gfs__cc.clear(); + Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc); + + // Set up grid grid operators... + using FullLOP = FullOperator<DG2_GFS, DG2_GFS>; + using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; + using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree); + dg2_gfs_.update(); + MatrixBackend mb(5); + FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb); + + // Additional grid operators for preconditioner + using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS>; + using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree); + BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb); + + using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS>; + using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree); + BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb); + + using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS>; + using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree); + PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb); + + // Set up solution vectors... + using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>; + V_R x(dg2_gfs_, 0.0); + + // Testing! + + // Assemble all those matrices + using Dune::PDELab::Backend::native; + using M = typename FullGO::Traits::Jacobian; + M m(fullgo); + fullgo.jacobian(x, m); + Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1); + + using BDM = typename BDGO::Traits::Jacobian; + BDM bdm(bdgo); + bdgo.jacobian(x, bdm); + Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1); + + using BODM = typename BODGO::Traits::Jacobian; + BODM bodm(bodgo); + bodgo.jacobian(x, bodm); + Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1); + + V_R pd(dg2_gfs_, 0.0); + pdgo.residual(x, pd); + Dune::printvector(std::cout, native(pd), "point diagonal vector", "row"); + + // test failure boolean + bool testfail(false); + + // TODO: Properly test this stuff given the above matrices. + // Right now, visuals need to suffice. + + // Return statement... + return testfail; + + } + catch (Dune::Exception& e) + { std::cerr << "Dune reported error: " << e << std::endl; + return 1; + } + catch (std::exception& e) + { std::cerr << "Unknown exception thrown!" << std::endl; + return 1; + } +} +