diff --git a/CMakeLists.txt b/CMakeLists.txt index e333cab7b37c5466067f1cd5f05c9385e5fa120c..bf740e74d1a29d6e822e44698388d2984e7c7fb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.1.3) project(dune-codegen CXX) if(NOT (dune-common_DIR OR dune-common_ROOT OR @@ -29,6 +29,7 @@ dune_register_package_flags(LIBRARIES dunecodegen) dune_enable_all_packages() +add_subdirectory(doc) add_subdirectory(dune/codegen) add_subdirectory(cmake/modules) diff --git a/README.md b/README.md index 8896b12177ec1532e69d64f1748b81f140123643..5856f3514a91a64db9734af6cf7825952b3935c0 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,25 @@ ctest Note that this takes quite a while. +## Building and Running dune-codegen in an offline environment + +dune-codegen relies on installing Python packages into self-contained environments +during its configuration and build process. In order to do this in an offline +environment, we recommend using the tool `devpi`. One of its use cases is to provide +a local mirror for the Python package index. A quickstart tutorial for this use case +is available [5]. It boils down to the following: + +* Installing the `devpi-server` package through your favorite method +* Setting up a local server with `devpi-server --init` +* Making sure it is running in the background (explicitly with `devpi-server --start/stop` or by configuring a systemd service. +* Have the environment variable `PIP_INDEX_URL` to its index, e.g. by adding this line to your `~/.bashrc` (where `http://localhost:3141` might differ depending on your devpi configuration): +``` +export PIP_INDEX_URL=http://localhost:3141/root/pypi/+simple/ +``` + +At first installation, the locally mirrored package index will access PyPI. +Later on, it will install packages from its local cache. + ## Links [0]: https://git-lfs.github.com/ @@ -112,3 +131,4 @@ Note that this takes quite a while. 
[2]: https://gitlab.dune-project.org/quality/dune-testtools [3]: http://isl.gforge.inria.fr/ [4]: https://www.dune-project.org/doc/installation/ +[5]: https://github.com/devpi/devpi/blob/master/doc/quickstart-pypimirror.rst diff --git a/bin/donkey_benchmark_execution_wrapper.py b/bin/donkey_benchmark_execution_wrapper.py index d383963318291ae947c70b1f083c320a60250303..22835c7cbce61336168396007e73703394eb7433 100755 --- a/bin/donkey_benchmark_execution_wrapper.py +++ b/bin/donkey_benchmark_execution_wrapper.py @@ -5,8 +5,16 @@ import sys import subprocess import os +os.environ['DUNE_CODEGEN_THREADS'] = '20' + # Run the actual command -command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split() + +# The actual measurements will be called like this (mpi parallelism) +# command = "srun -p haswell10c -n 20 -c 2 --cpu_bin=verbose,core".split() + +# Command for autotune benchmarks with threading +command = "srun -p haswell10c -n 1 -c 20 --hint=nomultithread --cpu_bin=verbose,core".split() + command.extend(sys.argv[1:]) ret = subprocess.call(command) diff --git a/cmake/modules/DuneCodegenMacros.cmake b/cmake/modules/DuneCodegenMacros.cmake index 61713109b32cc1af4716b0f9f133879592fec49c..cff09c5ec49c0675680b5f49c1f24f9008c93cc5 100644 --- a/cmake/modules/DuneCodegenMacros.cmake +++ b/cmake/modules/DuneCodegenMacros.cmake @@ -1,6 +1,6 @@ # File for module specific CMake tests. # -# .. cmake_function:: add_generated_executable +# .. cmake_function:: dune_add_generated_executable # # .. cmake_param:: UFLFILE # :single: @@ -22,19 +22,20 @@ # The name given to the added executable target. # # .. cmake_param:: SOURCE +# :single: # # The cc source file to build from. If omitted, a minimal # source file and a driver file will be generated. # # .. cmake_param:: FORM_COMPILER_ARGS # :multi: -# :argname arg: +# :argname: arg # # Additional arguments as recognized by the form compiler. # # .. 
cmake_param:: DEPENDS # :multi: -# :argname dep: +# :argname: dep # # Additional dependencies of the generated executable (changes in those # will retrigger generation) @@ -57,7 +58,7 @@ # # .. cmake_param:: ANALYZE_GRID_COMMAND # :multi: -# :argname command: +# :argname: command # # Use this to pass a custom grid analysis command. This is necessary # if you use a custom grid generation methdod. The inifile and the @@ -104,7 +105,7 @@ else() endif() file(GLOB_RECURSE UFL2PDELAB_SOURCES ${UFL2PDELAB_GLOB_PATTERN}) -function(add_generated_executable) +function(dune_add_generated_executable) set(OPTIONS EXCLUDE_FROM_ALL ANALYZE_GRID) set(SINGLE TARGET SOURCE UFLFILE INIFILE) set(MULTI FORM_COMPILER_ARGS DEPENDS ANALYZE_GRID_COMMAND) @@ -112,15 +113,20 @@ function(add_generated_executable) cmake_parse_arguments(GEN "${OPTIONS}" "${SINGLE}" "${MULTI}" ${ARGN}) if(GEN_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Unrecognized arguments in add_generated_executable. This usually indicates a typo.") + message(FATAL_ERROR "Unrecognized arguments in dune_add_generated_executable. 
This usually indicates a typo.") + endif() + + set(MPI_OPTION "0") + if(MPI_FOUND) + set(MPI_OPTION "1") endif() # Apply defaults and enforce requirements if(NOT GEN_TARGET) - message(FATAL_ERROR "Need to specify the TARGET parameter for add_generated_executable") + message(FATAL_ERROR "Need to specify the TARGET parameter for dune_add_generated_executable") endif() if(NOT GEN_UFLFILE) - message(FATAL_ERROR "Need to specify the UFLFILE parameter for add_generated_executable") + message(FATAL_ERROR "Need to specify the UFLFILE parameter for dune_add_generated_executable") endif() if(NOT IS_ABSOLUTE ${GEN_UFLFILE}) set(GEN_UFLFILE ${CMAKE_CURRENT_SOURCE_DIR}/${GEN_UFLFILE}) @@ -138,6 +144,7 @@ function(add_generated_executable) --target-name ${GEN_TARGET} --driver-file ${GEN_SOURCE} --project-basedir ${CMAKE_BINARY_DIR} + --with-mpi ${MPI_OPTION} ${GEN_FORM_COMPILER_ARGS} DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} ${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} COMMENT "Generating driver for the target ${GEN_TARGET}" @@ -171,10 +178,8 @@ function(add_generated_executable) endif() # Parse a mapping of operators to build and their respective filenames - dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} - OUTPUT_VARIABLE depdata - ) - parse_python_data(PREFIX depdata INPUT ${depdata}) + dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env python ${dune-codegen_path}/deplist.py ${GEN_INIFILE} ${GEN_TARGET} ${CMAKE_CURRENT_BINARY_DIR}/interface.log) + parse_python_data(PREFIX depdata FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) if(DUNE_CODEGEN_PROFILING) # This is a bit silly, but cProfile only finds entry point scripts @@ -198,6 +203,7 @@ function(add_generated_executable) --ini-file ${GEN_INIFILE} --target-name ${GEN_TARGET} --operator-to-build ${op} + --with-mpi ${MPI_OPTION} ${ANALYZE_GRID_OPTION} DEPENDS ${GEN_UFLFILE} ${UFL2PDELAB_SOURCES} ${GEN_DEPENDS} 
${DUNE_CODEGEN_ADDITIONAL_PYTHON_SOURCES} ${ANALYZE_GRID_FILE} COMMENT "Generating operator file ${depdata___${op}} for the target ${GEN_TARGET}" diff --git a/cmake/modules/GeneratedSystemtests.cmake b/cmake/modules/GeneratedSystemtests.cmake index 206ce3bcbf10a457182ffbb1a5f8e415ea3fcb24..2a1907950a7f5bd68e85a5c536465a73634828cf 100644 --- a/cmake/modules/GeneratedSystemtests.cmake +++ b/cmake/modules/GeneratedSystemtests.cmake @@ -1,11 +1,85 @@ -# System testing for generated executables. All ideas taken from dune-testtools. +# System testing for generated executables. All ideas taken from dune-testtools. +# +# .. cmake_function:: dune_add_formcompiler_system_test +# +# .. cmake_param:: UFLFILE +# :single: +# :required: +# +# The UFL file to create the generate code from. +# +# .. cmake_param:: INIFILE +# :single: +# :required: +# +# The ini file that controls the form compilation process. +# It is expected to contain a [formcompiler] section. This +# file may contain meta ini annotations. +# +# .. cmake_param:: BASENAME +# :single: +# :required: +# +# The basename for the generated executables. +# +# .. cmake_param:: SCRIPT +# :single: +# +# The python script that decides about test failure/success. +# Defaults to a script that simply runs the program and checks +# the exit code. More scripts to be found in dune-testtools. +# +# .. cmake_param:: SOURCE +# :single: +# +# The cc source file to build from. If omitted, a minimal +# source file and a driver file will be generated. +# +# .. cmake_param:: CREATED_TARGETS +# :single: +# +# A variable name that should be filled with the list of generated +# targets. This can be used to modify these lateron. +# +# .. cmake_param:: DEPENDS +# :multi: +# :argname: dep +# +# Additional dependencies of the generated executable (changes in those +# will retrigger generation) +# +# .. cmake_param:: NO_TESTS +# :option: +# +# If given, code will be generated and built normally, but no tests will +# be added to the test suite. 
+# +# .. cmake_param:: ANALYZE_GRID +# :option: +# +# Set this option to enable code generation time grid analysis. +# This is useful to reduce the variety of sum factorization kernels +# in unstructured grids. Note that the grid analysis tool needs to +# be able to construct your grid from the given inifile. If you have +# a custom grid construction method, you can use ANALYZE_GRID_COMMAND +# instead. +# +# .. cmake_param:: ANALYZE_GRID_COMMAND +# :multi: +# :argname: command +# +# Use this to pass a custom grid analysis command. This is necessary +# if you use a custom grid generation methdod. The inifile and the +# outputfile will be appended to this command. You can use the analysis code in +# dune/codegen/sumfact/analyzegrid.hh to write your own tool. +# Specifying this option will automatically set ANALYZE_GRID. # function(dune_add_formcompiler_system_test) # parse arguments set(OPTION DEBUG NO_TESTS ANALYZE_GRID) - set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE) - set(MULTI CREATED_TARGETS DEPENDS) + set(SINGLE INIFILE BASENAME SCRIPT UFLFILE SOURCE CREATED_TARGETS) + set(MULTI DEPENDS ANALYZE_GRID_COMMAND) cmake_parse_arguments(SYSTEMTEST "${OPTION}" "${SINGLE}" "${MULTI}" ${ARGN}) if(SYSTEMTEST_UNPARSED_ARGUMENTS) @@ -25,6 +99,10 @@ function(dune_add_formcompiler_system_test) if(SYSTEMTEST_ANALYZE_GRID) set(ANALYZE_GRID_STR "ANALYZE_GRID") endif() + set(ANALYZE_GRID_COMMAND_STR "") + if(SYSTEMTEST_ANALYZE_GRID_COMMAND) + set(ANALYZE_GRID_COMMAND_STR "ANALYZE_GRID_COMMAND ${SYSTEMTEST_ANALYZE_GRID_COMMAND}") + endif() # set a default for the script. call_executable.py just calls the executable. 
# There, it is also possible to hook in things depending on the inifile @@ -41,9 +119,14 @@ function(dune_add_formcompiler_system_test) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} ${CMAKE_CURRENT_BINARY_DIR}/tmp_${SYSTEMTEST_INIFILE}) # expand the given meta ini file into the build tree - execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py --cmake --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} --dir ${CMAKE_CURRENT_BINARY_DIR} --section formcompiler - OUTPUT_VARIABLE output) - parse_python_data(PREFIX INIINFO INPUT "${output}") + execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_expand_metaini.py + --cmake + --ini ${CMAKE_CURRENT_SOURCE_DIR}/${SYSTEMTEST_INIFILE} + --dir ${CMAKE_CURRENT_BINARY_DIR} + --section formcompiler + --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log + ) + parse_python_data(PREFIX INIINFO FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) foreach(inifile ${INIINFO_names}) if(${INIINFO_${inifile}_suffix} STREQUAL "__empty") @@ -55,23 +138,25 @@ function(dune_add_formcompiler_system_test) endif() endif() - add_generated_executable(TARGET ${tname} - UFLFILE ${SYSTEMTEST_UFLFILE} - INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}" - DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS} - EXCLUDE_FROM_ALL - ${SOURCE} - ${ANALYZE_GRID_STR} - ) + dune_add_generated_executable(TARGET ${tname} + UFLFILE ${SYSTEMTEST_UFLFILE} + INIFILE "${CMAKE_CURRENT_BINARY_DIR}/${inifile}" + DEPENDS ${SYSTEMTEST_INIFILE} ${SYSTEMTEST_DEPENDS} + EXCLUDE_FROM_ALL + ${SOURCE} + ${ANALYZE_GRID_STR} + ${ANALYZE_GRID_COMMAND_STR} + ) # Enrich the target with preprocessor variables from the __static section # just the way that dune-testtools does. 
dune_execute_process(COMMAND ${CMAKE_BINARY_DIR}/run-in-dune-env dune_extract_static.py --ini ${inifile} + --file ${CMAKE_CURRENT_BINARY_DIR}/interface.log WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} OUTPUT_VARIABLE output ERROR_MESSAGE "Error extracting static info from ${inifile}") - parse_python_data(PREFIX STAT INPUT "${output}") + parse_python_data(PREFIX STAT FILE ${CMAKE_CURRENT_BINARY_DIR}/interface.log) foreach(config ${STAT___CONFIGS}) foreach(cd ${STAT___STATIC_DATA}) diff --git a/cmake/modules/deplist.py b/cmake/modules/deplist.py index 9cb5d7d42cbfc712e18e37798dffc2d553416f8c..7b0afac3c7c5f216e9710cef394d175c0c6c9ee9 100755 --- a/cmake/modules/deplist.py +++ b/cmake/modules/deplist.py @@ -22,5 +22,5 @@ def get_filename(operator): result = {"__{}".format(o): get_filename(o) for o in operators} result["__operators"] = ";".join(operators) -printForCMake(result) +printForCMake(result, sys.argv[3]) sys.exit(0) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..956de868a985aec3af90d7263417eb16b9be09df --- /dev/null +++ b/doc/CMakeLists.txt @@ -0,0 +1 @@ +dune_cmake_sphinx_doc(MODULE_ONLY) diff --git a/dune/codegen/common/simdtraits.hh b/dune/codegen/common/simdtraits.hh new file mode 100644 index 0000000000000000000000000000000000000000..73ee4caff1bb743d275e20173556416cc0bd2d30 --- /dev/null +++ b/dune/codegen/common/simdtraits.hh @@ -0,0 +1,16 @@ +#ifndef DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH +#define DUNE_CODEGEN_COMMON_SIMD_TRAITS_HH + +/** This is just the declaration of the traits classes, specialization for VCL and + * OpCounter VCL are elsewhere. 
+ */ + +template<typename T> +struct base_floatingpoint +{}; + +template<typename T> +struct simd_size +{}; + +#endif diff --git a/dune/codegen/common/vcltraits.hh b/dune/codegen/common/vcltraits.hh new file mode 100644 index 0000000000000000000000000000000000000000..de764dbe6f78a51d1bf5934856ca36d4b90a77d0 --- /dev/null +++ b/dune/codegen/common/vcltraits.hh @@ -0,0 +1,86 @@ +#ifndef DUNE_CODEGEN_COMMON_VCLTRAITS_HH +#define DUNE_CODEGEN_COMMON_VCLTRAITS_HH + +/** A collection of traits tools for the Vector Class Library */ + +#include<dune/codegen/common/vectorclass.hh> + + +template<> +struct base_floatingpoint<Vec2d> +{ + using value = double; +}; + +template<> +struct base_floatingpoint<Vec4f> +{ + using value = float; +}; + +template<> +struct simd_size<Vec2d> +{ + static constexpr std::size_t value = 2; +}; + +template<> +struct simd_size<Vec4f> +{ + static constexpr std::size_t value = 4; +}; + +#if MAX_VECTOR_SIZE >= 256 +template<> +struct base_floatingpoint<Vec4d> +{ + using value = double; +}; + +template<> +struct base_floatingpoint<Vec8f> +{ + using value = float; +}; + +template<> +struct simd_size<Vec4d> +{ + static constexpr std::size_t value = 4; +}; + +template<> +struct simd_size<Vec8f> +{ + static constexpr std::size_t value = 8; +}; +#endif + +#if MAX_VECTOR_SIZE >= 512 +template<> +struct base_floatingpoint<Vec8d> +{ + using value = double; +}; + +template<> +struct base_floatingpoint<Vec16f> +{ + using value = float; +}; + +template<> +struct simd_size<Vec8d> +{ + static constexpr std::size_t value = 8; +}; + +template<> +struct simd_size<Vec16f> +{ + static constexpr std::size_t value = 16; +}; + +#endif + +#endif diff --git a/dune/codegen/common/vectorclass.hh b/dune/codegen/common/vectorclass.hh index aa3bba7b98f9cf57dcc223f61755e3aa26c5bdca..648ea52fe24490efc8af431fe54de4089515af33 100644 --- a/dune/codegen/common/vectorclass.hh +++ b/dune/codegen/common/vectorclass.hh @@ -1,71 +1,23 @@ #ifndef DUNE_CODEGEN_COMMON_VECTORCLASS_HH #define 
DUNE_CODEGEN_COMMON_VECTORCLASS_HH - -template<typename T> -struct base_floatingpoint -{}; +#include<dune/codegen/common/simdtraits.hh> #ifdef ENABLE_COUNTER + #if HAVE_DUNE_OPCOUNTER #include<dune/opcounter/vectorclass.hh> - -template<typename F, int size> -struct base_floatingpoint<OpCounter::impl::OpCounterVector<F, size>> -{ - using value = OpCounter::OpCounter<F>; -}; - - #else #error "dune-opcounter is needed for opcounted vector types" #endif + #else + #include<dune/codegen/vectorclass/vectorclass.h> #include<dune/codegen/vectorclass/vectormath_exp.h> #include<dune/codegen/vectorclass/vectormath_hyp.h> #include<dune/codegen/vectorclass/vectormath_trig.h> - -template<> -struct base_floatingpoint<Vec2d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec4f> -{ - using value = float; -}; - - -#if MAX_VECTOR_SIZE >= 256 -template<> -struct base_floatingpoint<Vec4d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec8f> -{ - using value = float; -}; -#endif - -#if MAX_VECTOR_SIZE >= 512 -template<> -struct base_floatingpoint<Vec8d> -{ - using value = double; -}; - -template<> -struct base_floatingpoint<Vec16f> -{ - using value = float; -}; -#endif +#include<dune/codegen/common/vcltraits.hh> #endif diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8881504f934e7b1d755580ffc46d47f2a1467346..67694937376b1b6cdaa66d44db07289652f62bcc 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -24,6 +24,7 @@ add_subdirectory(test) add_executable(_autotune_target EXCLUDE_FROM_ALL _autotune.cc) target_compile_options(_autotune_target PUBLIC -fno-strict-aliasing) -if(benchmark_FOUND) - target_link_libraries(_autotune_target benchmark) +find_package(Threads) +if(benchmark_FOUND AND Threads_FOUND) + target_link_libraries(_autotune_target benchmark Threads::Threads) endif() diff --git a/python/dune/codegen/__init__.py b/python/dune/codegen/__init__.py index 
e892baf44c4c9fb301536c15c40059a5acc5d602..196419782487f33f13cf797d19c1fbdcbd13d946 100644 --- a/python/dune/codegen/__init__.py +++ b/python/dune/codegen/__init__.py @@ -10,3 +10,6 @@ import dune.codegen.loopy.symbolic # noqa import dune.codegen.pdelab # noqa import dune.codegen.sumfact # noqa import dune.codegen.blockstructured # noqa + +# Trigger imports that register finite elements in UFL +import dune.codegen.ufl.finiteelement # noqa diff --git a/python/dune/codegen/blockstructured/accumulation.py b/python/dune/codegen/blockstructured/accumulation.py index 9af9759d5e3426df12a10db39f94048e48006867..ec2e3d9917998a7a024796d48aaf6be6488a271c 100644 --- a/python/dune/codegen/blockstructured/accumulation.py +++ b/python/dune/codegen/blockstructured/accumulation.py @@ -1,12 +1,12 @@ -from dune.codegen.blockstructured.tools import sub_element_inames -from dune.codegen.generation import accumulation_mixin, instruction +from dune.codegen.blockstructured.tools import sub_element_inames, name_accumulation_alias +from dune.codegen.generation import accumulation_mixin, instruction, get_global_context_value from dune.codegen.loopy.target import dtype_floatingpoint from dune.codegen.options import get_form_option from dune.codegen.pdelab.geometry import world_dimension, name_intersection_geometry_wrapper from dune.codegen.pdelab.localoperator import determine_accumulation_space, GenericAccumulationMixin from dune.codegen.pdelab.argument import name_accumulation_variable from dune.codegen.pdelab.localoperator import boundary_predicates -from dune.codegen.generation.loopy import function_mangler, globalarg, temporary_variable +from dune.codegen.generation.loopy import function_mangler, temporary_variable import loopy as lp import pymbolic.primitives as prim @@ -16,32 +16,10 @@ from loopy.match import Writes @accumulation_mixin("blockstructured") class BlockStructuredAccumulationMixin(GenericAccumulationMixin): def generate_accumulation_instruction(self, expr): - if 
get_form_option('vectorization_blockstructured'): - return generate_accumulation_instruction_vectorized(expr, self) - else: + if get_global_context_value("form_type") == "jacobian": return generate_accumulation_instruction(expr, self) - - -def name_accumulation_alias(container, accumspace): - name = container + "_" + accumspace.lfs.name + "_alias" - name_tail = container + "_" + accumspace.lfs.name + "_alias_tail" - k = get_form_option("number_of_blocks") - p = accumspace.element.degree() - - def _add_alias_insn(name): - dim = world_dimension() - element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) - index_stride = tuple((p * k + 1)**i for i in range(0, dim)) - globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) - code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name) - instruction(within_inames=frozenset(), - code=code, - read_variables=frozenset({container}), - assignees=frozenset({name})) - - _add_alias_insn(name) - _add_alias_insn(name_tail) - return name + else: + return generate_accumulation_instruction_vectorized(expr, self) @function_mangler diff --git a/python/dune/codegen/blockstructured/argument.py b/python/dune/codegen/blockstructured/argument.py index 420773e85bea93ee55cb310255da7fa60d55d9de..deff1b8415d246d0ed197692df46b61db2dbd5cc 100644 --- a/python/dune/codegen/blockstructured/argument.py +++ b/python/dune/codegen/blockstructured/argument.py @@ -1,29 +1,11 @@ -from dune.codegen.generation import (kernel_cached, - valuearg, instruction, globalarg) +from dune.codegen.generation import kernel_cached, valuearg from dune.codegen.options import get_form_option from dune.codegen.pdelab.argument import CoefficientAccess -from dune.codegen.blockstructured.tools import micro_index_to_macro_index, sub_element_inames -from dune.codegen.pdelab.geometry import world_dimension +from dune.codegen.blockstructured.tools import micro_index_to_macro_index, 
sub_element_inames, name_container_alias from loopy.types import NumpyType import pymbolic.primitives as prim -def name_alias(container, lfs, element): - name = container + "_" + lfs.name + "_alias" - k = get_form_option("number_of_blocks") - p = element.degree() - dim = world_dimension() - element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) - index_stride = tuple((p * k + 1)**i for i in range(0, dim)) - globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) - code = "const auto {} = &{}({},0);".format(name, container, lfs.name) - instruction(within_inames=frozenset(), - code=code, - read_variables=frozenset({container}), - assignees=frozenset({name})) - return name - - # TODO remove the need for element @kernel_cached def pymbolic_coefficient(container, lfs, element, index): @@ -36,9 +18,6 @@ def pymbolic_coefficient(container, lfs, element, index): lfs = prim.Variable(lfs) # use higher order FEM index instead of Q1 index - if get_form_option("vectorization_blockstructured"): - subelem_inames = sub_element_inames() - coeff_alias = name_alias(container, lfs, element) - return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index)) - else: - return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) + subelem_inames = sub_element_inames() + coeff_alias = name_container_alias(container, lfs, element) + return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index)) diff --git a/python/dune/codegen/blockstructured/basis.py b/python/dune/codegen/blockstructured/basis.py index 5af8bb48f2bb625a0b336c5cae6d42863f0063c5..efbc1bc9b3b206d0503686679d6a15547d5004a9 100644 --- a/python/dune/codegen/blockstructured/basis.py +++ b/python/dune/codegen/blockstructured/basis.py @@ -7,7 +7,7 @@ from dune.codegen.generation import (basis_mixin, globalarg, class_member, initializer_list, - 
include_file,) + include_file, preamble) from dune.codegen.tools import get_pymbolic_basename, get_pymbolic_indices from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.pdelab.basis import (GenericBasisMixin, @@ -38,11 +38,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin): assert not isinstance(element, MixedElement) name = "phi_{}".format(FEM_name_mangling(element)) name = restricted_name(name, restriction) + self.init_basis_cache(element, restriction) self.evaluate_basis(element, name, restriction) inames = self.lfs_inames(element, restriction, number, context=context) return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0)) + @preamble(kernel='operator') + def init_basis_cache(self, element, restriction): + if not restriction: + cache = name_localbasis_cache(element) + localbasis = name_localbasis(element) + qp_name = get_pymbolic_basename(self.quadrature_position_in_micro()) + return ["for (int i=0; i < {}.size(); ++i)".format(qp_name), + "{", + " {}.evaluateFunction({}[i], {});".format(cache, qp_name, localbasis), + "}"] + else: + return [] + @kernel_cached def evaluate_basis(self, element, name, restriction): temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1), @@ -60,11 +74,25 @@ class BlockStructuredBasisMixin(GenericBasisMixin): assert not isinstance(element, MixedElement) name = "js_{}".format(FEM_name_mangling(element)) name = restricted_name(name, restriction) + self.init_gradient_cache(element, restriction) self.evaluate_reference_gradient(element, name, restriction) inames = self.lfs_inames(element, restriction, number, context=context) return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, element.degree() + 1), 0)) + @preamble(kernel='operator') + def init_gradient_cache(self, element, restriction): + if not restriction: + cache = name_localbasis_cache(element) + localbasis = name_localbasis(element) + qp_name = 
get_pymbolic_basename(self.quadrature_position_in_micro()) + return ["for (int i=0; i < {}.size(); ++i)".format(qp_name), + "{", + " {}.evaluateJacobian({}[i], {});".format(cache, qp_name, localbasis), + "}"] + else: + return [] + @kernel_cached def evaluate_reference_gradient(self, element, name, restriction): temporary_variable(name, shape=((element.degree() + 1)**world_dimension(), 1, world_dimension()), diff --git a/python/dune/codegen/blockstructured/tools.py b/python/dune/codegen/blockstructured/tools.py index 802b819f23754aa0862e122f4127a3db18593117..e9acf640cc3f6febad32dd9bec40cda7d75b881f 100644 --- a/python/dune/codegen/blockstructured/tools.py +++ b/python/dune/codegen/blockstructured/tools.py @@ -2,7 +2,7 @@ from dune.codegen.tools import get_pymbolic_basename from dune.codegen.generation import (iname, domain, temporary_variable, - instruction) + instruction, globalarg, preamble) from dune.codegen.pdelab.geometry import world_dimension from dune.codegen.options import get_form_option import pymbolic.primitives as prim @@ -74,3 +74,33 @@ def name_point_in_macro(point_in_micro, visitor): name = get_pymbolic_basename(point_in_micro) + "_macro" define_point_in_macro(name, point_in_micro, visitor) return name + + +@preamble +def define_container_alias(name, container, lfs, element, is_const): + k = get_form_option("number_of_blocks") + p = element.degree() + dim = world_dimension() + element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) + index_stride = tuple((p * k + 1)**i for i in range(0, dim)) + globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) + if is_const: + return "const auto {} = &{}({},0);".format(name, container, lfs.name) + else: + return "auto {} = &{}.container()({},0);".format(name, container, lfs.name) + + +def name_accumulation_alias(container, accumspace): + name = container + "_" + accumspace.lfs.name + "_alias" + name_tail = container + "_" + accumspace.lfs.name + 
"_alias_tail" + + define_container_alias(name, container, accumspace.lfs, accumspace.element, is_const=False) + define_container_alias(name_tail, container, accumspace.lfs, accumspace.element, is_const=False) + return name + + +def name_container_alias(container, lfs, element): + name = container + "_" + lfs.name + "_alias" + + define_container_alias(name, container, lfs, element, is_const=True) + return name diff --git a/python/dune/codegen/blockstructured/vectorization.py b/python/dune/codegen/blockstructured/vectorization.py index d5224e6ffdd19ae88ccbdc6a798a8b416a6fd36d..94bc4875f262c72d695007be04c0b4107208f2f5 100644 --- a/python/dune/codegen/blockstructured/vectorization.py +++ b/python/dune/codegen/blockstructured/vectorization.py @@ -26,20 +26,22 @@ def add_vcl_temporaries(knl, vcl_size): init_iname = 'init_vec{}'.format(vcl_size) from islpy import BasicSet init_domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(init_iname, get_vcl_type_size(dtype_floatingpoint()))) + + silenced_warnings = [] + for alias in vector_alias: vector_name = alias.replace('alias', 'vec{}'.format(vcl_size)) new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(vcl_size,), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) - # write once to the vector such that loopy won't complain - new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(init_iname)), - expression=0, within_inames=frozenset({init_iname}), - id='init_{}'.format(vector_name))) + # silence warning such that loopy won't complain + silenced_warnings.append("read_no_write({})".format(vector_name)) from loopy.kernel.data import VectorizeTag return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [init_domain], temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries), - iname_to_tag=dict(**knl.iname_to_tag, **{init_iname: VectorizeTag()})) + iname_to_tag=dict(**knl.iname_to_tag, 
**{init_iname: VectorizeTag()}), + silenced_warnings=knl.silenced_warnings + silenced_warnings) def add_vcl_accum_insns(knl, inner_iname, outer_iname, vcl_size, level): @@ -248,6 +250,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): load_insns.append(lp.CallInstruction(assignees=(), expression=call_load, id=load_id, within_inames=insn.within_inames | insn.reduction_inames(), depends_on=insn.depends_on | write_ids, + depends_on_is_final=True, tags=frozenset({'vectorized_{}'.format(level)}))) read_dependencies.setdefault(id, set()) read_dependencies[id].add(load_id) @@ -277,6 +280,7 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): id=store_id, within_inames=insn.within_inames, depends_on=(insn.depends_on | frozenset({id}) | read_dependencies[id] | write_ids), + depends_on_is_final=True, tags=frozenset({'vectorized_{}'.format(level)}))) # replace alias with vcl vector, except for accumulation assignee @@ -341,10 +345,12 @@ def add_vcl_access(knl, inner_iname, vcl_size, level=0): new_insns.append(insn.copy(assignee=assignee_vec, depends_on=(insn.depends_on | read_dependencies[insn.id] | write_ids), + depends_on_is_final=True, tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) else: new_insns.append(insn.copy(depends_on=(insn.depends_on | read_dependencies[insn.id] | write_ids), + depends_on_is_final=True, tags=insn.tags | frozenset({'vectorized_{}'.format(level)}))) return knl.copy(instructions=new_insns + load_insns + store_insns) diff --git a/python/dune/codegen/generation/__init__.py b/python/dune/codegen/generation/__init__.py index bed0256407b7259bab61b6e932c4a17761097e75..97090e18852359b10d1a2d3f74a268a3abac60f1 100644 --- a/python/dune/codegen/generation/__init__.py +++ b/python/dune/codegen/generation/__init__.py @@ -24,6 +24,7 @@ from dune.codegen.generation.cpp import (base_class, preamble, post_include, template_parameter, + dump_ssc_marks ) from dune.codegen.generation.hooks import (hook, diff --git 
a/python/dune/codegen/generation/cpp.py b/python/dune/codegen/generation/cpp.py index b918291067f45c5f988bc8fdcea55651d538a9db..2ea4c346590ee80ef329fdc9394b9fbc3c59db9c 100644 --- a/python/dune/codegen/generation/cpp.py +++ b/python/dune/codegen/generation/cpp.py @@ -55,3 +55,10 @@ def dump_accumulate_timer(name): @generator_factory(item_tags=("register_likwid_timers",)) def register_liwkid_timer(name): return "LIKWID_MARKER_REGISTER(\"{}\");".format(name) + + +@generator_factory(item_tags=("register_ssc_marks",)) +def dump_ssc_marks(name): + from dune.codegen.pdelab.driver.timings import get_region_marks + return 'std::cout << "{}: " << {} << " <--> " << {} << std::endl;'.format(name, + *get_region_marks(name, driver=False)) diff --git a/python/dune/codegen/logging.conf b/python/dune/codegen/logging.conf index 32974b12744dd1c23c36f561148943ec936b3ccb..1fcc4ba02962460ea3f1db8c678dbba5ef3fef2e 100644 --- a/python/dune/codegen/logging.conf +++ b/python/dune/codegen/logging.conf @@ -1,5 +1,5 @@ [loggers] -keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.vectorization +keys=root,dune.codegen.pdelab.localoperator,dune.codegen.sumfact.transformations,dune.codegen.sumfact.vectorization [handlers] keys=consoleHandler @@ -16,6 +16,12 @@ handlers=consoleHandler qualname=dune.codegen.pdelab.localoperator propagate=0 +[logger_dune.codegen.sumfact.transformations] +level=INFO +handlers=consoleHandler +qualname=dune.codegen.sumfact.transformations +propagate=0 + [logger_dune.codegen.sumfact.vectorization] level=INFO handlers=consoleHandler diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py new file mode 100644 index 0000000000000000000000000000000000000000..02644e80c3c0f782309a165c050949e0f1d71cb2 --- /dev/null +++ b/python/dune/codegen/loopy/transformations/performance.py @@ -0,0 +1,8 @@ +from dune.codegen.options import get_form_option +from dune.codegen.sumfact.transformations import 
sumfact_performance_transformations + + +def performance_transformations(kernel, signature): + if get_form_option("sumfact"): + kernel = sumfact_performance_transformations(kernel, signature) + return kernel diff --git a/python/dune/codegen/loopy/transformations/remove_reductions.py b/python/dune/codegen/loopy/transformations/remove_reductions.py new file mode 100644 index 0000000000000000000000000000000000000000..1ccca09c230f5ec32a9134035f3e883c63cc3680 --- /dev/null +++ b/python/dune/codegen/loopy/transformations/remove_reductions.py @@ -0,0 +1,71 @@ +import loopy as lp + +import pymbolic.primitives as prim + + +def remove_reduction(knl, match): + """Removes all matching reductions and do direct accumulation in assignee instead""" + + # Find reductions + for instr in lp.find_instructions(knl, match): + if isinstance(instr.expression, lp.symbolic.Reduction): + instructions = [] + + # Dependencies + depends_on = instr.depends_on + depending = [] + for i in knl.instructions: + if instr.id in i.depends_on: + depending.append(i.id) + + # Remove the instruction from the kernel + knl = lp.remove_instructions(knl, set([instr.id])) + + # Add instruction that sets assignee to zero + id_zero = instr.id + '_set_zero' + instructions.append(lp.Assignment(instr.assignee, + 0, + within_inames=instr.within_inames, + id=id_zero, + tags=('set_zero',) + )) + + # Add instruction that accumulates directly in assignee + assignee = instr.assignee + expression = prim.Sum((assignee, instr.expression.expr)) + within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames) + id_accum = instr.id + '_accum' + instructions.append(lp.Assignment(assignee, + expression, + within_inames=within_inames, + id=id_accum, + depends_on=frozenset((id_zero,) + tuple(depends_on)), + tags=('assignment',))) + + knl = knl.copy(instructions=knl.instructions + instructions) + + # Restore dependencies + for dep in depending: + match = lp.match.Id(dep) + knl = lp.add_dependency(knl, match, 
id_accum) + return knl + + +def remove_all_reductions(knl): + """Remove all reductions from loopy kernel + + This removes all reductions by instead setting the assignee to zero and + directly accumulating in the assignee. + """ + # Find ids of all reductions + ids = [] + for instr in knl.instructions: + if isinstance(instr.expression, lp.symbolic.Reduction): + ids.append(instr.id) + + # Remove reductions + for id in ids: + match = lp.match.Id(id) + knl = remove_reduction(knl, match) + + return knl diff --git a/python/dune/codegen/options.py b/python/dune/codegen/options.py index 257da40169f578a4b64a367f6aaa4cd55729253d..51722da3fe34a3fa24b85f4d47e06750c0cc0007 100644 --- a/python/dune/codegen/options.py +++ b/python/dune/codegen/options.py @@ -37,6 +37,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord): debug_cache_with_stack = CodegenOption(default=False, helpstr="Store stack along with cache objects. Makes debugging caching issues easier.") driver_file = CodegenOption(helpstr="The filename for the generated driver header") explicit_time_stepping = CodegenOption(default=False, helpstr="use explicit time stepping") + time_stepping_order = CodegenOption(default=1, helpstr="Order of the time stepping method") exact_solution_expression = CodegenOption(helpstr="name of the exact solution expression in the ufl file") compare_l2errorsquared = CodegenOption(helpstr="maximal allowed l2 error squared of difference between numerical solution and interpolation of exact solution (NOTE: requires --exact-solution-expression)") grid_info = CodegenOption(default=None, helpstr="Path to file with information about facedir and facemod variations. This can be used to limit the generation of skeleton kernels.") @@ -49,6 +50,7 @@ class CodegenGlobalOptionsArray(ImmutableRecord): architecture = CodegenOption(default="haswell", helpstr="The architecture to optimize for. 
Possible values: haswell|knl|skylake") yaspgrid_offset = CodegenOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.") grid_unstructured = CodegenOption(default=False, helpstr="Set to true if you want to use an unstructured grid.") + grid_consistent = CodegenOption(default=False, helpstr="The used grid is already consistent.") precision_bits = CodegenOption(default=64, helpstr="The number of bits for the floating point type") overlapping = CodegenOption(default=False, helpstr="Use an overlapping solver and constraints. You still need to make sure to construct a grid with overlap! The parallel option will be set automatically.") operators = CodegenOption(default="r", helpstr="A comma separated list of operators, each name will be interpreted as a subsection name within the formcompiler section") @@ -56,7 +58,9 @@ class CodegenGlobalOptionsArray(ImmutableRecord): operator_to_build = CodegenOption(default=None, helpstr="The operators from the list that is about to be build now. 
CMake sets this one!!!") debug_interpolate_input = CodegenOption(default=False, helpstr="Should the input for printresidual and printmatix be interpolated (instead of random input).") use_likwid = CodegenOption(default=False, helpstr="Use likwid instead of own performance measurements.") + use_sde = CodegenOption(default=False, helpstr="Use sde instead of own performance measurements.") autotune_google_benchmark = CodegenOption(default=False, helpstr="Use google-benchmark library for autotuning (when autotuning is activated).") + with_mpi = CodegenOption(default=True, helpstr="The module was configured with mpi") # Arguments that are mainly to be set by logic depending on other options max_vector_width = CodegenOption(default=256, helpstr=None) @@ -85,6 +89,9 @@ class CodegenFormOptionsArray(ImmutableRecord): sumfact = CodegenOption(default=False, helpstr="Use sumfactorization") sumfact_regular_jacobians = CodegenOption(default=False, helpstr="Generate non sum-factorized jacobians (only useful if sumfact is set)") sumfact_on_boundary = CodegenOption(default=True, helpstr="Whether boundary integrals should be vectorized. It might not be worth the hassle...") + sumfact_optimize_loop_order = CodegenOption(default=False, helpstr="Optimize order of loops in sum factorization function using autotuning.") + sumfact_performance_transformations = CodegenOption(default=False, helpstr="Apply sum factorization specific performance transformations.") + sumfact_performance_transformations_testrun = CodegenOption(default=0, helpstr="If larger than zero determines test case to run.") vectorization_quadloop = CodegenOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_strategy = CodegenOption(default="none", helpstr="The identifier of the vectorization cost model.
Possible values: none|explicit|model|target|autotune") vectorization_not_fully_vectorized_error = CodegenOption(default=False, helpstr="throw an error if nonquadloop vectorization did not fully vectorize") @@ -111,6 +118,7 @@ class CodegenFormOptionsArray(ImmutableRecord): control_variable = CodegenOption(default=None, helpstr="Name of control variable in UFL file") block_preconditioner_diagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the diagonal part of a block preconditioner") block_preconditioner_offdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the off-diagonal part of a block preconditioner") + block_preconditioner_pointdiagonal = CodegenOption(default=False, helpstr="Whether this operator should implement the point diagonal part of a block preconditioner") geometry_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for geometries. Currently implemented mixins: generic, axiparallel, equidistant, sumfact_multilinear, sumfact_axiparallel, sumfact_equidistant") quadrature_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for quadrature. Currently implemented: generic, sumfact") basis_mixins = CodegenOption(default="generic", helpstr="A comma separated list of mixin identifiers to use for basis function evaluation. 
Currently implemented: generic, sumfact") @@ -125,6 +133,20 @@ _global_options = CodegenGlobalOptionsArray() _form_options = {} +def show_options(): + def subopt(arr): + for k, v in arr.__dict__.items(): + if isinstance(v, CodegenOption) and v.helpstr is not None: + print("{}\n {}".format(k, v.helpstr)) + + print("This is a summary of options available for the code generation process:\n") + print("The following options can be given in the [formcompiler] section:") + subopt(CodegenGlobalOptionsArray) + + print("\nThe following options can be given in a form-specific subsection of [formcompiler]:") + subopt(CodegenFormOptionsArray) + + def initialize_options(): """ Initialize the options from the command line """ global _global_options @@ -209,10 +231,16 @@ def process_form_options(opt, form): if opt.filename is None: opt = opt.copy(filename="{}_{}_file.hh".format(get_option("target_name"), opt.classname)) + if opt.block_preconditioner_pointdiagonal: + opt = opt.copy(generate_jacobians=False, + basis_mixins="sumfact_pointdiagonal", + accumulation_mixins="sumfact_pointdiagonal", + ) + if opt.block_preconditioner_diagonal or opt.block_preconditioner_offdiagonal: assert opt.numerical_jacobian is False opt = opt.copy(generate_residuals=False, - generate_jacobians=False, + generate_jacobians=True, matrix_free=True, ) diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py index c3cc48298c1b2efb70c6c7716b29b221486044e1..dc1acd660c137c42be6fb65bb687bafe03fbc730 100644 --- a/python/dune/codegen/pdelab/argument.py +++ b/python/dune/codegen/pdelab/argument.py @@ -19,6 +19,7 @@ from dune.codegen.pdelab.spaces import (lfs_iname, ) from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.ufl.modified_terminals import Restriction +from dune.codegen.options import get_form_option from pymbolic.primitives import Call, Subscript, Variable @@ -116,6 +117,10 @@ def type_coefficientcontainer(): return "X" +def
type_linearizationpointcontainer(): + return "Z" + + def name_jacobian(restriction1, restriction2): # Restrictions may only differ if NONE if (restriction1 == Restriction.NONE) or (restriction2 == Restriction.NONE): @@ -144,6 +149,8 @@ def name_accumulation_variable(restrictions=None): restrictions = (Restriction.NONE,) else: restrictions = (Restriction.POSITIVE,) + if get_form_option("block_preconditioner_pointdiagonal"): + restrictions = restrictions[:1] return name_residual(*restrictions) if ft == 'jacobian': if restrictions is None: diff --git a/python/dune/codegen/pdelab/driver/__init__.py b/python/dune/codegen/pdelab/driver/__init__.py index b60544c1c78242f1490c76c46d4be6a2c4448501..124354b50dd354263800c84d99425cc6c9a57c01 100644 --- a/python/dune/codegen/pdelab/driver/__init__.py +++ b/python/dune/codegen/pdelab/driver/__init__.py @@ -39,7 +39,10 @@ def get_form_ident(): def get_form(): data = get_global_context_value("data") - return data.object_by_name[get_form_option("form", get_form_ident())] + form = get_form_option("form") + if form is None: + form = get_form_ident() + return data.object_by_name[form] def get_dimension(): @@ -212,7 +215,10 @@ def name_initree(): @preamble(section="init") def define_mpihelper(name): include_file("dune/common/parallel/mpihelper.hh", filetag="driver") - return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name) + if get_option("with_mpi"): + return "Dune::MPIHelper& {} = Dune::MPIHelper::instance(argc, argv);".format(name) + else: + return "Dune::FakeMPIHelper& {} = Dune::FakeMPIHelper::instance(argc, argv);".format(name) def name_mpihelper(): @@ -282,6 +288,13 @@ def generate_driver(): contents = [] + # Assert that this program was called with ini file + contents += ['if (argc != 2){', + ' std::cerr << "This program needs to be called with an ini file" << std::endl;', + ' return 1;', + '}', + ''] + def add_section(tag, comment): tagcontents = [i for i in retrieve_cache_items("preamble and 
{}".format(tag), make_generable=True)] if tagcontents: diff --git a/python/dune/codegen/pdelab/driver/gridfunctionspace.py b/python/dune/codegen/pdelab/driver/gridfunctionspace.py index 377b304b03499aa3c109390cf72f4af0f6327e9b..9aad1d5e6096c9d1735790badd63824f24026d42 100644 --- a/python/dune/codegen/pdelab/driver/gridfunctionspace.py +++ b/python/dune/codegen/pdelab/driver/gridfunctionspace.py @@ -82,7 +82,8 @@ def define_grid(name): _type = type_grid() # TODO: In principle this is only necessary if we use sum factorization in # one of the operators. So this could be turned off if that is not the case. - if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured"): + if isQuadrilateral(get_trial_element().cell()) and get_option("grid_unstructured") and not \ + get_option("grid_consistent"): include_file("dune/consistent-edge-orientation/createconsistentgrid.hh", filetag="driver") return ["IniGridFactory<{}> factory({});".format(_type, ini), "std::shared_ptr<{}> grid_nonconsistent = factory.getGrid();".format(_type), diff --git a/python/dune/codegen/pdelab/driver/instationary.py b/python/dune/codegen/pdelab/driver/instationary.py index 5cf7170f1dca6f4274db4bf77b4b1627466e20cb..355fd0e743f6a59716319241018d3c5794c784bd 100644 --- a/python/dune/codegen/pdelab/driver/instationary.py +++ b/python/dune/codegen/pdelab/driver/instationary.py @@ -132,10 +132,25 @@ def name_time(): def typedef_timesteppingmethod(name): r_type = type_range() explicit = get_option('explicit_time_stepping') + order = get_option('time_stepping_order') if explicit: - return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type) + if order == 1: + return "using {} = Dune::PDELab::ExplicitEulerParameter<{}>;".format(name, r_type) + elif order == 2: + return "using {} = Dune::PDELab::HeunParameter<{}>;".format(name, r_type) + elif order == 3: + return "using {} = Dune::PDELab::Shu3Parameter<{}>;".format(name, r_type) + elif order == 4: + return "using {} = 
Dune::PDELab::RK4Parameter<{}>;".format(name, r_type) + else: + raise NotImplementedError("Time stepping method not supported") else: - return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type) + if order == 1: + return "using {} = Dune::PDELab::OneStepThetaParameter<{}>;".format(name, r_type) + elif order == 2: + return "using {} = Dune::PDELab::Alexander2Parameter<{}>;".format(name, r_type) + elif order == 3: + return "using {} = Dune::PDELab::Alexander3Parameter<{}>;".format(name, r_type) def type_timesteppingmethod(): @@ -150,8 +165,12 @@ def define_timesteppingmethod(name): if explicit: return "{} {};".format(tsm_type, name) else: - ini = name_initree() - return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini) + order = get_option('time_stepping_order') + if order == 1: + ini = name_initree() + return "{} {}({}.get<double>(\"instat.theta\",1.0));".format(tsm_type, name, ini) + else: + return "{} {};".format(tsm_type, name) def name_timesteppingmethod(): diff --git a/python/dune/codegen/pdelab/driver/timings.py b/python/dune/codegen/pdelab/driver/timings.py index aeca64d46c73f2327b48c22f07dca7a85a044104..6bbbd07e4b7701fe516eff9509525165ac23a5eb 100644 --- a/python/dune/codegen/pdelab/driver/timings.py +++ b/python/dune/codegen/pdelab/driver/timings.py @@ -4,7 +4,7 @@ from dune.codegen.generation import (cached, include_file, pre_include, preamble, - ) + post_include) from dune.codegen.options import get_option from dune.codegen.pdelab.driver import (get_form_ident, is_linear, @@ -24,6 +24,9 @@ from dune.codegen.pdelab.driver.solve import (name_vector, ) +_sde_marks = {} + + @preamble(section="timings") def define_timing_identifier(name): ini = name_initree() @@ -125,6 +128,17 @@ def local_operator_likwid(): return "{}.register_likwid_timers();".format(lop_name) +@preamble(section="timings") +def local_operator_ssc_marks(): + lop_name = name_localoperator(get_form_ident()) + return 
"{}.dump_ssc_marks();".format(lop_name) + + +def ssc_macro(): + return '#define __SSC_MARK(x) do{ __asm__ __volatile__' \ + '("movl %0, %%ebx; .byte 100, 103, 144" : :"i"(x) : "%ebx"); } while(0)' + + @cached def setup_timer(): # TODO check that we are using YASP? @@ -138,6 +152,10 @@ def setup_timer(): logger.warning("timings: using instrumentation level >= 3 with likwid will slow down your code considerably") local_operator_likwid() finalize_likwid() + elif get_option("use_sde"): + post_include(ssc_macro(), filetag='driver') + if get_option('instrumentation_level') >= 3: + local_operator_ssc_marks() else: from dune.codegen.loopy.target import type_floatingpoint pre_include("#define HP_TIMER_OPCOUNTER {}".format(type_floatingpoint()), filetag="driver") @@ -156,14 +174,26 @@ def init_region_timer(region): setup_timer() if get_option("use_likwid"): init_likwid_timer(region) + elif get_option("use_sde"): + pass else: from dune.codegen.generation import post_include post_include("HP_DECLARE_TIMER({});".format(region), filetag="driver") +def get_region_marks(region, driver): + if driver: + return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 11, (2 * (len(_sde_marks) + 1) + 1) * 11)) + else: + return _sde_marks.setdefault(region, (2 * (len(_sde_marks) + 1) * 1, (2 * (len(_sde_marks) + 1) + 1) * 1)) + + def start_region_timer(region): if get_option("use_likwid"): return ["LIKWID_MARKER_START(\"{}\");".format(region)] + elif get_option("use_sde"): + marks = get_region_marks(region, driver=True) + return ["__SSC_MARK(0x{});".format(marks[0])] else: return ["HP_TIMER_START({});".format(region)] @@ -171,6 +201,10 @@ def start_region_timer(region): def stop_region_timer(region): if get_option("use_likwid"): return ["LIKWID_MARKER_STOP(\"{}\");".format(region)] + elif get_option("use_sde"): + marks = get_region_marks(region, driver=True) + return ["__SSC_MARK(0x{});".format(marks[1]), + "std::cout << \"Timed region {}: {} <--> {}\" << std::endl;".format(region, 
*marks)] else: timestream = name_timing_stream() return ["HP_TIMER_STOP({});".format(region), @@ -207,7 +241,7 @@ def timed_region(region, actions): init_region_timer(region) - if get_option('instrumentation_level') >= 3 and not get_option('use_likwid'): + if get_option('instrumentation_level') >= 3 and not (get_option('use_likwid') or get_option("use_sde")): timestream = name_timing_stream() lop_name = name_localoperator(get_form_ident()) print_times.append("{}.dump_timers({}, {}, true);".format(lop_name, timestream, name_timing_identifier())) diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py index 11474dbf3489fae9fc85e944c9289322d8b3d3d7..1fd09fc706795afec6832fbe50207ac41dd5e48c 100644 --- a/python/dune/codegen/pdelab/localoperator.py +++ b/python/dune/codegen/pdelab/localoperator.py @@ -32,6 +32,7 @@ from dune.codegen.generation import (accumulation_mixin, ReturnArg, run_hook, template_parameter, + dump_ssc_marks ) from dune.codegen.cgen.clazz import (AccessModifier, BaseClass, @@ -153,6 +154,11 @@ def enum_alpha(): return _enum_alpha(ufl_measure_to_pdelab_measure(integral_type)) +@class_member(classtag="operator") +def enum_skeleton_twosided(): + return "enum { doSkeletonTwoSided = true };" + + def name_initree_member(): define_initree("_iniParams") return "_iniParams" @@ -279,7 +285,7 @@ def determine_accumulation_space(info, number): return AccumulationSpace(lfs=lfs, index=lfsi, restriction=info.restriction, - element=element + element=subel ) @@ -595,6 +601,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin for trafo in transformations: kernel = trafo[0](kernel, *trafo[1], **trafo[2]) + # Apply performance transformations + from dune.codegen.loopy.transformations.performance import performance_transformations + kernel = performance_transformations(kernel, signature) + from dune.codegen.loopy import heuristic_duplication kernel = heuristic_duplication(kernel) @@ -688,6 
+698,19 @@ class RegisterLikwidMethod(ClassMember): ClassMember.__init__(self, content) +class RegisterSSCMarksMethod(ClassMember): + def __init__(self): + knl = name_example_kernel() + assert(knl is not None) + + content = ["void dump_ssc_marks()" + "{"] + register_liwkid_timers = [i for i in retrieve_cache_items(condition='register_ssc_marks')] + content.extend(map(lambda x: ' ' + x, register_liwkid_timers)) + content += ["}"] + ClassMember.__init__(self, content) + + class LoopyKernelMethod(ClassMember): def __init__(self, signature, kernel, add_timings=True, initializer_list=[]): from loopy import generate_body @@ -715,6 +738,12 @@ class LoopyKernelMethod(ClassMember): init_likwid_timer(timer_name) content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(timer_name)) register_liwkid_timer(timer_name) + elif get_option('use_sde'): + from dune.codegen.pdelab.driver.timings import get_region_marks, ssc_macro + post_include(ssc_macro(), filetag='operatorfile') + marks = get_region_marks(timer_name, driver=False) + content.append(' ' + '__SSC_MARK(0x{});'.format(marks[0])) + dump_ssc_marks(timer_name) else: post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') content.append(' ' + 'HP_TIMER_START({});'.format(timer_name)) @@ -727,6 +756,11 @@ class LoopyKernelMethod(ClassMember): init_likwid_timer(setuptimer) content.append(' ' + 'LIKWID_MARKER_START(\"{}\");'.format(setuptimer)) register_liwkid_timer(setuptimer) + elif get_option('use_sde'): + from dune.codegen.pdelab.driver.timings import get_region_marks + setup_marks = get_region_marks(setuptimer, driver=False) + content.append(' ' + '__SSC_MARK(0x{});'.format(setup_marks[0])) + dump_ssc_marks(setuptimer) else: post_include('HP_DECLARE_TIMER({});'.format(setuptimer), filetag='operatorfile') content.append(' HP_TIMER_START({});'.format(setuptimer)) @@ -739,6 +773,8 @@ class LoopyKernelMethod(ClassMember): if add_timings and get_option('instrumentation_level') >= 4: if 
get_option('use_likwid'): content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(setuptimer)) + elif get_option('use_sde'): + content.append(' ' + '__SSC_MARK(0x{});'.format(setup_marks[1])) else: content.append(' ' + 'HP_TIMER_STOP({});'.format(setuptimer)) @@ -749,6 +785,8 @@ class LoopyKernelMethod(ClassMember): if add_timings and get_option('instrumentation_level') >= 3: if get_option('use_likwid'): content.append(' ' + 'LIKWID_MARKER_STOP(\"{}\");'.format(timer_name)) + elif get_option('use_sde'): + content.append(' ' + '__SSC_MARK(0x{});'.format(marks[1])) else: content.append(' ' + 'HP_TIMER_STOP({});'.format(timer_name)) @@ -838,6 +876,14 @@ def local_operator_default_settings(operator, form): base_class('Dune::PDELab::InstationaryLocalOperatorDefaultMethods<{}>' .format(rf), classtag="operator") + # *always* add the volume pattern, PDELab cannot handle matrices without diagonal blocks + with global_context(integral_type="cell"): + enum_pattern() + pattern_baseclass() + + if get_form_option("block_preconditioner_diagonal") or get_form_option("block_preconditioner_pointdiagonal"): + enum_skeleton_twosided() + def measure_is_enabled(measure): option_dict = {"cell": "enable_volume", @@ -852,6 +898,16 @@ def generate_residual_kernels(form, original_form): if not get_form_option("generate_residuals"): return {} + if get_form_option("block_preconditioner_pointdiagonal"): + from ufl import derivative + jacform = derivative(original_form, original_form.coefficients()[0]) + + from dune.codegen.ufl.preprocess import preprocess_form + jacform = preprocess_form(jacform).preprocessed_form + + from dune.codegen.ufl.transformations.blockpreconditioner import diagonal_block_jacobian + form = diagonal_block_jacobian(jacform) + logger = logging.getLogger(__name__) with global_context(form_type='residual'): operator_kernels = {} @@ -954,6 +1010,12 @@ def generate_jacobian_kernels(form, original_form): from dune.codegen.pdelab.signatures import assembler_routine_name with 
global_context(kernel=assembler_routine_name()): kernel = [k for k in generate_kernels_per_integral(jac_apply_form.integrals_by_type(measure))] + + if kernel: + enum_pattern() + pattern_baseclass() + enum_alpha() + operator_kernels[(measure, 'jacobian_apply')] = kernel # Generate dummy functions for those kernels, that vanished in the differentiation process @@ -983,6 +1045,12 @@ def generate_jacobian_kernels(form, original_form): from dune.codegen.pdelab.signatures import assembler_routine_name with global_context(kernel=assembler_routine_name()): kernel = [k for k in generate_kernels_per_integral(jacform.integrals_by_type(measure))] + + if kernel: + enum_pattern() + pattern_baseclass() + enum_alpha() + operator_kernels[(measure, 'jacobian')] = kernel # Generate dummy functions for those kernels, that vanished in the differentiation process @@ -1181,6 +1249,8 @@ def generate_localoperator_file(kernels, filename): include_file('dune/codegen/common/timer.hh', filetag='operatorfile') if get_option('use_likwid'): operator_methods.append(RegisterLikwidMethod()) + elif get_option('use_sde'): + operator_methods.append(RegisterSSCMarksMethod()) else: operator_methods.append(TimerMethod()) elif get_option('opcounter'): diff --git a/python/dune/codegen/pdelab/signatures.py b/python/dune/codegen/pdelab/signatures.py index 09b832ac252138181d8be19c9fd4c098b5bb9b68..34acea22ea1fda1b8369bf499d7ce8ed0c37859d 100644 --- a/python/dune/codegen/pdelab/signatures.py +++ b/python/dune/codegen/pdelab/signatures.py @@ -9,6 +9,7 @@ from dune.codegen.pdelab.argument import (name_accumulation_variable, name_coefficientcontainer, type_coefficientcontainer, name_applycontainer, + type_linearizationpointcontainer, ) from dune.codegen.pdelab.spaces import (name_testfunctionspace, type_testfunctionspace, @@ -293,8 +294,9 @@ def nonlinear_jacobian_apply_volume_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = 
type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, avt) + return (geot, lfsut, cct, lpt, lfsvt, avt) def nonlinear_jacobian_apply_volume_args(): @@ -312,8 +314,9 @@ def nonlinear_jacobian_apply_boundary_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, avt) + return (geot, lfsut, cct, lpt, lfsvt, avt) def nonlinear_jacobian_apply_boundary_args(): @@ -331,8 +334,9 @@ def nonlinear_jacobian_apply_skeleton_templates(): lfsut = type_trialfunctionspace() lfsvt = type_testfunctionspace() cct = type_coefficientcontainer() + lpt = type_linearizationpointcontainer() avt = type_accumulation_variable() - return (geot, lfsut, cct, cct, lfsvt, lfsut, cct, cct, lfsvt, avt, avt) + return (geot, lfsut, cct, lpt, lfsvt, lfsut, cct, lpt, lfsvt, avt, avt) def nonlinear_jacobian_apply_skeleton_args(): diff --git a/python/dune/codegen/pdelab/tensors.py b/python/dune/codegen/pdelab/tensors.py index a924a39ae1bba19a94c47e5542567d8717cedf57..68f6d393129600f0fda995ea1a4f239ff844c173 100644 --- a/python/dune/codegen/pdelab/tensors.py +++ b/python/dune/codegen/pdelab/tensors.py @@ -22,7 +22,7 @@ def define_assembled_tensor(name, expr, visitor): visitor.indices = indices instruction(assignee=prim.Subscript(prim.Variable(name), indices), expression=visitor.call(expr), - forced_iname_deps=frozenset(visitor.interface.quadrature_inames()), + forced_iname_deps=frozenset(visitor.quadrature_inames()), depends_on=frozenset({lp.match.Tagged("sumfact_stage1")}), tags=frozenset({"quad"}), ) @@ -36,17 +36,43 @@ def name_assembled_tensor(o, visitor): @kernel_cached +def code_generation_time_inversion(expr, visitor): + mat = np.ndarray(expr.ufl_shape) + for indices in it.product(*tuple(range(i) for i in expr.ufl_shape)): + visitor.indices = indices + val = 
visitor.call(expr.ufl_operands[0]) + if not isinstance(val, (float, int)): + visitor.indices = None + return None + + mat[indices] = val + + visitor.indices = None + return np.linalg.inv(mat) + + def pymbolic_matrix_inverse(o, visitor): + # Try to evaluate the matrix at code generation time. + # If this works (it does e.g. for Maxwell on structured grids) + # we can invert the matrix at code generation time!!! indices = visitor.indices visitor.indices = None + + mat = code_generation_time_inversion(o, visitor) + if mat is not None: + return mat[indices] + + # If code generation time inversion failed, we assemble it in C++ + # and invert it there. name = name_assembled_tensor(o.ufl_operands[0], visitor) instruction(code="{}.invert();".format(name), - within_inames=frozenset(visitor.interface.quadrature_inames()), + within_inames=frozenset(visitor.quadrature_inames()), depends_on=frozenset({lp.match.Writes(name), lp.match.Tagged("sumfact_stage1"), }), tags=frozenset({"quad"}), + assignees=frozenset({name}), ) visitor.indices = indices diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py index 9394f746ec9cd7bafa569ccfbcb83d9a8e44663f..6e87697403eda5a8373b043fad3140fd56f87175 100644 --- a/python/dune/codegen/sumfact/accumulation.py +++ b/python/dune/codegen/sumfact/accumulation.py @@ -52,6 +52,7 @@ import pymbolic.primitives as prim from loopy.symbolic import WalkMapper import ufl.classes as uc from ufl import FiniteElement, MixedElement, TensorProductElement +import itertools basis_sf_kernels = generator_factory(item_tags=("basis_sf_kernels",), context_tags='kernel', no_deco=True) @@ -158,10 +159,12 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): if self.trial_element is None: return () else: - from dune.codegen.sumfact.basis import SumfactBasisMixin - return SumfactBasisMixin.lfs_inames(SumfactBasisMixin(), get_leaf(self.trial_element, self.trial_element_index), self.restriction) + 
mixin = get_form_option("basis_mixins") + from dune.codegen.generation import construct_from_mixins + MixinType = construct_from_mixins(mixins=[mixin], mixintype="basis", name="MixinType") + return MixinType.lfs_inames(MixinType(), get_leaf(self.trial_element, self.trial_element_index), self.restriction) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # The result of stage 2 has the correct quadrature permutation but no # cost permutation is applied. The inames for this method are # quadrature and cost permuted. This means we need to reverse the cost @@ -172,7 +175,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) @@ -347,7 +351,9 @@ class SumfactAccumulationMixin(AccumulationMixinBase): return get_accumulation_info(expr, self) def list_accumulation_infos(self, expr): - return list_accumulation_infos(expr, self) + return itertools.product(_gradsplitting_generator(expr, self), + _trial_generator(expr, self), + ) def generate_accumulation_instruction(self, expr): return generate_accumulation_instruction(expr, self) @@ -368,6 +374,61 @@ class SumfactAccumulationMixin(AccumulationMixinBase): return get_global_context_value("facemod_n") return None + def additional_matrix_sequence(self): + return None + + @property + def prohibit_jacobian(self): + return False + + +@accumulation_mixin("sumfact_pointdiagonal") +class SumfactPointDiagonalAccumulationMixin(SumfactAccumulationMixin): + def additional_matrix_sequence(self): + info = self.current_info[1] + return construct_basis_matrix_sequence(transpose=True, + 
derivative=info.grad_index, + facedir=get_facedir(info.restriction), + facemod=get_facemod(info.restriction), + basis_size=get_basis_size(info), + ) + + def get_accumulation_info(self, expr): + element = expr.ufl_element() + leaf_element = element + element_index = 0 + from ufl import MixedElement + if isinstance(expr.ufl_element(), MixedElement): + element_index = self.indices[0] + leaf_element = element.extract_component(element_index)[1] + + restriction = self.restriction + if self.measure == 'exterior_facet': + from dune.codegen.pdelab.restriction import Restriction + restriction = Restriction.POSITIVE + + grad_index = None + if self.reference_grad: + if isinstance(expr.ufl_element(), MixedElement): + grad_index = self.indices[1] + else: + grad_index = self.indices[0] + + return SumfactAccumulationInfo(element=expr.ufl_element(), + element_index=element_index, + restriction=restriction, + grad_index=grad_index, + ) + + def list_accumulation_infos(self, expr): + return itertools.product(_gradsplitting_generator(expr, self, number=0), + _gradsplitting_generator(expr, self, number=1), + ) + + @property + def prohibit_jacobian(self): + return True + class SumfactAccumulationInfo(ImmutableRecord): def __init__(self, @@ -434,9 +495,9 @@ def _get_childs(element): yield (i, element.extract_component(i)[1]) -def _test_generator(expr, visitor): +def _gradsplitting_generator(expr, visitor, number=0): from dune.codegen.ufl.modified_terminals import extract_modified_arguments - ma = extract_modified_arguments(expr, argnumber=0) + ma = extract_modified_arguments(expr, argnumber=number) if len(ma) == 0: return element = ma[0].argexpr.ufl_element() @@ -452,7 +513,8 @@ def _test_generator(expr, visitor): for res in restrictions: for ei, e in _get_childs(element): for grad in (None,) + tuple(range(dim)): - yield SumfactAccumulationInfo(element_index=ei, + yield SumfactAccumulationInfo(element=element, + element_index=ei, restriction=res, grad_index=grad) @@ -477,13 +539,20 @@ 
def _trial_generator(expr, visitor): yield SumfactAccumulationInfo(element_index=ei, restriction=res, element=e) -def list_accumulation_infos(expr, visitor): - import itertools - return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor)) +def get_basis_size(info): + leaf_element = info.element + element_index = info.element_index + dim = world_dimension() + from ufl import MixedElement + if isinstance(leaf_element, MixedElement): + leaf_element = leaf_element.extract_component(element_index)[1] + degree = leaf_element._degree + if isinstance(degree, int): + degree = (degree,) * dim + return tuple(deg + 1 for deg in degree) def generate_accumulation_instruction(expr, visitor): - dim = world_dimension() test_info = visitor.test_info trial_info = visitor.trial_info @@ -492,14 +561,7 @@ def generate_accumulation_instruction(expr, visitor): count_quadrature_point_operations(expr) # Number of basis functions per direction - leaf_element = test_info.element - from ufl import MixedElement - if isinstance(leaf_element, MixedElement): - leaf_element = leaf_element.extract_component(test_info.element_index)[1] - degree = leaf_element._degree - if isinstance(degree, int): - degree = (degree,) * dim - basis_size = tuple(deg + 1 for deg in degree) + basis_size = get_basis_size(test_info) # Anisotropic finite elements are not (yet) supported by Dune assert(size == basis_size[0] for size in basis_size) @@ -535,20 +597,27 @@ def generate_accumulation_instruction(expr, visitor): derivative=test_info.grad_index, facedir=visitor.get_facedir(test_info.restriction), facemod=visitor.get_facemod(test_info.restriction), - basis_size=basis_size) + basis_size=basis_size, + additional_sequence=visitor.additional_matrix_sequence()) jacobian_inames = trial_info.inames priority = test_info.grad_index if priority is None: priority = 3 + trial_element = trial_info.element + trial_element_index = trial_info.element_index + if visitor.prohibit_jacobian: + trial_element 
= None + trial_element_index = 0 + output = AccumulationOutput(matrix_sequence=matrix_sequence, accumvar=accumvar, restriction=(test_info.restriction, trial_info.restriction), test_element=test_info.element, test_element_index=test_info.element_index, - trial_element=trial_info.element, - trial_element_index=trial_info.element_index, + trial_element=trial_element, + trial_element_index=trial_element_index, ) sf = SumfactKernel(matrix_sequence=matrix_sequence, diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index a1d7442addc4e2d1d720b1cc81d8a914c1f75532..1e801f4550d17d89ef8972fe3ffa867d4a44ea25 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -1,21 +1,28 @@ """ Autotuning for sum factorization kernels """ -from dune.codegen.generation import cache_restoring, delete_cache_items -from dune.codegen.loopy.target import DuneTarget -from dune.codegen.sumfact.realization import realize_sumfact_kernel_function -from dune.codegen.options import get_option, set_option -from dune.codegen.error import CodegenAutotuneError - -import loopy as lp -from pytools import product - import os import re import subprocess import filelock import hashlib +import logging +import json +from operator import mul +import pkg_resources +from six.moves import reduce +import textwrap import time +import loopy as lp +from pytools import product +from cgen import ArrayOf, AlignedAttribute, Initializer + +from dune.codegen.generation import cache_restoring, delete_cache_items +from dune.codegen.loopy.target import DuneTarget, type_floatingpoint +from dune.codegen.sumfact.realization import realize_sumfact_kernel_function +from dune.codegen.options import get_option, set_option +from dune.codegen.error import CodegenAutotuneError + def get_cmake_cache_entry(entry): for line in open(os.path.join(get_option("project_basedir"), "CMakeCache.txt"), "r"): @@ -301,13 +308,207 @@ def generate_standalone_code(sf, 
filename): set_option("opcounter", opcounting) -def autotune_realization(sf): +def generate_standalone_kernel_code(kernel, signature, filename, transformations=None): + # Turn off opcounting + opcounting = get_option("opcounter") + set_option("opcounter", False) + + # Remove opcounter from signature + p = re.compile('OpCounter::OpCounter<([^>]*)>') + assert len(signature) == 1 + sig = signature[0] + sig = p.sub(r'\1', sig) + assert 'OpCounter' not in signature + + # Which transformations were applied + codegen_transformations = '' + if transformations: + codegen_transformations = '' + for trafo in transformations: + codegen_transformations += '// {}\n'.format(trafo) + + template = 'kernel_benchmark_template1.cc.in' + use_datasets = True + + # Old benchmark template + # template = 'kernel_benchmark_template0.cc.in' + # use_datasets = False + + template_filename = pkg_resources.resource_filename(__name__, template) + with open(template_filename, 'r') as f: + benchmark = f.read() + + # Find function arguments and global arguments + arguments = sig[sig.find('(') + 1:sig.find(')')].split(',') + arguments = [a.split(' ')[-1] for a in arguments] + global_args = [a for a in kernel.args if a.name not in arguments] + buffer_arguments = [a for a in arguments if a.startswith('buff')] + input_arguments = [a for a in arguments if a not in buffer_arguments] + + # Declare global arguments + codegen_declare_global_arguments = '' + target = DuneTarget() + for g in global_args: + decl_info = g.decl_info(target, True, g.dtype) + for idi in decl_info: + ast_builder = target.get_device_ast_builder() + arg_decl = lp.target.c.POD(ast_builder, idi.dtype, idi.name) + arg_decl = ArrayOf(arg_decl, reduce(mul, g.shape)) + arg_decl = AlignedAttribute(g.dtype.itemsize * g.vector_size(target), arg_decl) + codegen_declare_global_arguments += '{}\n'.format(arg_decl) + codegen_declare_global_arguments = textwrap.indent(codegen_declare_global_arguments, ' ') + + # Helper function for argument 
initialization + def _initialize_arg(arg): + if isinstance(arg, lp.ValueArg): + return [] + real = type_floatingpoint() + size = reduce(mul, arg.shape) + fill_name = arg.name + '_fill' + lines = [' {}* {} = (double *) {};'.format(real, fill_name, arg.name), + ' for (std::size_t i=0; i<{}; ++i){{'.format(size), + ' {}[i] = unif(re);'.format(fill_name), + ' }'] + return lines + + # Initialize global arguments + codegen_initialize_global_arguments = '' + for arg in global_args: + lines = _initialize_arg(arg) + codegen_initialize_global_arguments += '\n'.join(lines) + '\n' + codegen_initialize_global_arguments = textwrap.indent(codegen_initialize_global_arguments, ' ') + + codegen_initialize_input = '' + + # Function we want to benchmark + codegen_benchmark_function = '' + codegen_benchmark_function += sig[0:sig.find(')') + 1] + codegen_benchmark_function += lp.generate_body(kernel) + codegen_benchmark_function = textwrap.indent(codegen_benchmark_function, ' ') + + # Declare function arguments + codegen_declare_arguments = [] + codegen_declare_input = [] + function_arguments = [a for a in kernel.args if a.name in arguments] + for arg in function_arguments: + if 'buffer' in arg.name: + byte_size = reduce(mul, arg.shape) * 8 + codegen_declare_arguments.append(' char {}[{}] __attribute__ ((aligned ({})));\n'.format(arg.name, + byte_size, + arg.alignment),) + elif isinstance(arg, lp.ValueArg): + assert 'jacobian_offset' in arg.name + decl = arg.get_arg_decl(ast_builder) + decl = Initializer(decl, 'unif_int(re)') + codegen_declare_arguments.append((' {}\n'.format(decl))) + else: + assert 'fastdg' in arg.name + size = reduce(mul, arg.shape) + min_stride = min([tag.stride for tag in arg.dim_tags]) + size *= min_stride + alignment = arg.dtype.itemsize + real = type_floatingpoint() + if use_datasets: + codegen_declare_input.append(('{} {}[datasets][{}] __attribute__ ((aligned ({})));\n'.format(real, + arg.name, + size, + alignment))) + else: + codegen_declare_input.append(('{} 
{}[{}] __attribute__ ((aligned ({})));\n'.format(real, + arg.name, + size, + alignment))) + + codegen_declare_arguments = ''.join(codegen_declare_arguments) + codegen_declare_arguments = textwrap.indent(codegen_declare_arguments, ' ') + codegen_declare_input = ''.join(codegen_declare_input) + codegen_declare_input = textwrap.indent(codegen_declare_input, ' ') + + # Initialize function arguments + codegen_initialize_arguments = '' + codegen_initialize_input = '' + for arg in function_arguments: + if 'fastdg' in arg.name: + if use_datasets: + lines = _initialize_arg(arg) + lines = [' ' + a for a in lines] + lines = [a.replace(arg.name + ';', arg.name + '[i];') for a in lines] + lines.insert(0, 'for(std::size_t i=0; i<datasets; ++i){') + lines.append('}') + codegen_initialize_input += '\n'.join(lines) + '\n' + else: + lines = _initialize_arg(arg) + codegen_initialize_arguments += '\n'.join(lines) + '\n' + else: + lines = _initialize_arg(arg) + codegen_initialize_arguments += '\n'.join(lines) + '\n' + codegen_initialize_arguments = textwrap.indent(codegen_initialize_arguments, ' ') + codegen_initialize_input = textwrap.indent(codegen_initialize_input, ' ') + + # Call the benchmark function + if use_datasets: + arguments_with_datasets = arguments.copy() + arguments_with_datasets = [a if 'fastdg' not in a else a + '[i]' for a in arguments] + codegen_call_benchmark_function = 'for (std::size_t i=0; i<datasets; ++i){\n' + codegen_call_benchmark_function += ' ' + kernel.name + '({})'.format(','.join(arguments_with_datasets)) + ';\n' + for arg in input_arguments: + codegen_call_benchmark_function += 'benchmark::DoNotOptimize({}[i][0]);\n'.format(arg) + codegen_call_benchmark_function += '}' + else: + codegen_call_benchmark_function = kernel.name + '({})'.format(','.join(arguments)) + ';\n' + codegen_call_benchmark_function = textwrap.indent(codegen_call_benchmark_function, ' ') + + # Replace placeholders in benchmark template + benchmark = 
benchmark.replace('${CODEGEN_TRANSFORMATIONS}', codegen_transformations) + benchmark = benchmark.replace('${CODEGEN_DECLARE_GLOBAL_ARGUMENTS}', codegen_declare_global_arguments) + benchmark = benchmark.replace('${CODEGEN_DECLARE_INPUT}', codegen_declare_input) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS}', codegen_initialize_global_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_INPUT}', codegen_initialize_input) + benchmark = benchmark.replace('${CODEGEN_BENCHMARK_FUNCTION}', codegen_benchmark_function) + benchmark = benchmark.replace('${CODEGEN_DECLARE_ARGUMENTS}', codegen_declare_arguments) + benchmark = benchmark.replace('${CODEGEN_INITIALIZE_ARGUMENTS}', codegen_initialize_arguments) + benchmark = benchmark.replace('${CODEGEN_CALL_BENCHMARK_FUNCTION}', codegen_call_benchmark_function) + + # Write benchmark source file + with open(filename, 'w') as f: + f.writelines(benchmark) + + # Maybe turn opcounting on again + set_option("opcounter", opcounting) + + +def autotune_realization(sf=None, kernel=None, signature=None, transformations=None): + """Generate an microbenchmark, compile run and return time + + Parameters + ---------- + sf: SumfactKernel or VectorizedSumfactKernel + kernel: loopy.kernel.LoopKernel + signature: str + transformation: list of str + Will be used to distinguish between autotune targets + """ + if sf is None: + assert kernel is not None + assert signature is not None + else: + assert kernel is None + assert signature is None + + logger = logging.getLogger(__name__) + # Make sure that the benchmark directory exists dir = os.path.join(get_option("project_basedir"), "autotune-benchmarks") if not os.path.exists(dir): os.mkdir(dir) - basename = "autotune_sumfact_{}".format(sf.function_name) + if sf is None: + basename = "autotune_sumfact_{}".format(kernel.name) + else: + basename = "autotune_sumfact_{}".format(sf.function_name) + if transformations: + for trafo in transformations: + basename = 
'{}_{}'.format(basename, trafo) basename = hashlib.sha256(basename.encode()).hexdigest() filename = os.path.join(dir, "{}.cc".format(basename)) @@ -316,10 +517,16 @@ def autotune_realization(sf): executable = os.path.join(dir, basename) # Generate and compile a benchmark program + # + # Note: cache restoring is only necessary when generating from SumfactKernel with cache_restoring(): with filelock.FileLock(lock): if not os.path.isfile(logname): - if get_option("autotune_google_benchmark"): + logger.debug('Generate autotune target in file {}'.format(filename)) + + if sf is None: + generate_standalone_kernel_code(kernel, signature, filename, transformations) + elif get_option("autotune_google_benchmark"): generate_standalone_code_google_benchmark(sf, filename) else: generate_standalone_code(sf, filename) @@ -331,6 +538,7 @@ def autotune_realization(sf): call.extend(compiler_invocation(executable, filename)) devnull = open(os.devnull, 'w') + os.environ['DUNE_CODEGEN_THREADS'] = '1' ret = subprocess.call(call, stdout=devnull, stderr=subprocess.STDOUT) if ret != 0: raise CodegenAutotuneError("Compilation of autotune executable failed. 
Invocation: {}".format(" ".join(call))) @@ -349,6 +557,7 @@ def autotune_realization(sf): call.append(executable) if get_option("autotune_google_benchmark"): call.append("--benchmark_out={}".format(logname)) + call.append("--benchmark_repetitions=5") # call.append("--benchmark_out_format=csv") else: call.append(logname) @@ -366,7 +575,15 @@ def autotune_realization(sf): with open(logname) as json_file: try: data = json.load(json_file) - return data['benchmarks'][0]['cpu_time'] + minimal_time = 1e80 + for b in data['benchmarks']: + if b['name'].endswith('_mean') or b['name'].endswith('_median') or b['name'].endswith('_stddev'): + pass + else: + if b['cpu_time'] < minimal_time: + minimal_time = b['cpu_time'] + assert minimal_time < 1e80 + return minimal_time except Exception as e: print("Error while loading file {}".format(logname)) raise e diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py index 8be5fc9e4a5157fee7f597b3000282238d85dd56..83e5206b5289719f3c4e85f4ca523976cb6feb79 100644 --- a/python/dune/codegen/sumfact/basis.py +++ b/python/dune/codegen/sumfact/basis.py @@ -232,6 +232,28 @@ class SumfactBasisMixin(GenericBasisMixin): return prim.Subscript(var, vsf.quadrature_index(sf, self)) +@basis_mixin("sumfact_pointdiagonal") +class SumfactPointDiagonalBasisMixin(SumfactBasisMixin): + def lfs_inames(self, element, restriction, number=1): + return () + + def implement_basis(self, element, restriction, number): + info = self.current_info[number] + if element == info.element and restriction == info.restriction: + return 1 + else: + return 0 + + def implement_reference_gradient(self, element, restriction, number): + index, = self.indices + self.indices = None + info = self.current_info[number] + if element == info.element and restriction == info.restriction and index == info.grad_index: + return 1 + else: + return 0 + + class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): def __init__(self, 
matrix_sequence=None, @@ -342,14 +364,15 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): return insn_dep.union(frozenset({insn})) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # Note: Here we do not need to reverse any permutation since this is # already done in the setup_input method above! # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py index 14a095696de75f6e169d5772949d1ed4707c2c7b..6f82b196a048996ed25a9eb22465565a181b5722 100644 --- a/python/dune/codegen/sumfact/geometry.py +++ b/python/dune/codegen/sumfact/geometry.py @@ -454,11 +454,12 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableCuttingRecord): return insn_dep.union(frozenset({insn})) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. 
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in new file mode 100644 index 0000000000000000000000000000000000000000..c4b49da5aec59006a9f007534dc7711b98ef0387 --- /dev/null +++ b/python/dune/codegen/sumfact/kernel_benchmark_template0.cc.in @@ -0,0 +1,37 @@ +// Transformations: +${CODEGEN_TRANSFORMATIONS} + +#include "config.h" +#include <iostream> +#include <fstream> +#include <random> +#include "benchmark/benchmark.h" +#include <dune/codegen/common/vectorclass.hh> +#include <dune/codegen/sumfact/horizontaladd.hh> + +${CODEGEN_DECLARE_GLOBAL_ARGUMENTS} + +${CODEGEN_BENCHMARK_FUNCTION} + +static void BM_sumfact_kernel(benchmark::State& state){ + std::uniform_real_distribution<double> unif(0,1); + std::uniform_int_distribution<int> unif_int(0,1); + std::default_random_engine re; + + ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS} + ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS} + + ${CODEGEN_DECLARE_INPUT} + ${CODEGEN_INITIALIZE_INPUT} + + ${CODEGEN_DECLARE_ARGUMENTS} + ${CODEGEN_INITIALIZE_ARGUMENTS} + + for (auto _ : state){ + ${CODEGEN_CALL_BENCHMARK_FUNCTION} + } +} + +${CODEGEN_BENCHMARK} + +BENCHMARK_MAIN(); diff --git a/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in new file mode 100644 index 0000000000000000000000000000000000000000..c03577d672123b3063020841b731fc1fd9780fce --- /dev/null +++ b/python/dune/codegen/sumfact/kernel_benchmark_template1.cc.in @@ -0,0 +1,58 @@ +// Transformations: +${CODEGEN_TRANSFORMATIONS} + +#include "config.h" + +#include <iostream> +#include <fstream> +#include <random> +#include <cstdlib> + +#include "benchmark/benchmark.h" +#include <dune/codegen/common/vectorclass.hh> +#include <dune/codegen/sumfact/horizontaladd.hh> + +const int datasets = 512; + 
+class Benchmark{ + ${CODEGEN_DECLARE_GLOBAL_ARGUMENTS} + + ${CODEGEN_DECLARE_INPUT} + +public: + Benchmark (){ + std::uniform_real_distribution<double> unif(0,1); + std::uniform_int_distribution<int> unif_int(0,1); + std::default_random_engine re; + + ${CODEGEN_INITIALIZE_GLOBAL_ARGUMENTS} + + ${CODEGEN_INITIALIZE_INPUT} + } + + ${CODEGEN_BENCHMARK_FUNCTION} + + void run_benchmark(){ + ${CODEGEN_DECLARE_ARGUMENTS} + + std::uniform_real_distribution<double> unif(0,1); + std::uniform_int_distribution<int> unif_int(0,1); + std::default_random_engine re; + + ${CODEGEN_INITIALIZE_ARGUMENTS} + + ${CODEGEN_CALL_BENCHMARK_FUNCTION} + } + +}; + +static void BM_sumfact_kernel(benchmark::State& state){ + Benchmark benchmark; + for (auto _ : state){ + benchmark.run_benchmark(); + } +} + +BENCHMARK(BM_sumfact_kernel) -> Threads(atoi(std::getenv("DUNE_CODEGEN_THREADS"))) -> UseRealTime(); + +BENCHMARK_MAIN(); diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py index 14cd0b1d0fe3814f294c22bc00f25260f4ee0233..461f5b0a3ace21e09a7a3707c4849aa1c36e6972 100644 --- a/python/dune/codegen/sumfact/realization.py +++ b/python/dune/codegen/sumfact/realization.py @@ -30,7 +30,6 @@ from dune.codegen.sumfact.quadrature import quadrature_points_per_direction from dune.codegen.sumfact.symbolic import (SumfactKernel, VectorizedSumfactKernel, ) -from dune.codegen.sumfact.vectorization import attach_vectorization_info from dune.codegen.sumfact.accumulation import sumfact_iname from dune.codegen.loopy.target import dtype_floatingpoint from dune.codegen.loopy.vcl import ExplicitVCLCast @@ -61,6 +60,12 @@ def name_buffer_storage(buff, which): return name +def _max_sum_factorization_buffer_size(sf): + size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width, + product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width) + return size + + @kernel_cached def 
_realize_sum_factorization_kernel(sf): insn_dep = sf.insn_dep @@ -74,8 +79,7 @@ def _realize_sum_factorization_kernel(sf): for buf in buffers: # Determine the necessary size of the buffer. We assume that we do not # underintegrate the form!!! - size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width, - product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width) + size = _max_sum_factorization_buffer_size(sf) temporary_variable("{}_dummy".format(buf), shape=(size,), custom_base_storage=buf, @@ -118,10 +122,32 @@ class BufferSwitcher(object): def __init__(self): self.current = 0 - def get_temporary(self, name=None, **kwargs): + def get_temporary(self, sf=None, name=None, **kwargs): + assert sf assert name + bs = "buffer{}".format(self.current) - globalarg(bs) + shape = kwargs['shape'] + assert shape + dim_tags = kwargs['dim_tags'].split(',') + assert dim_tags + + # Calculate correct alignment + vec_size = 1 + if 'vec' in dim_tags: + vec_size = shape[dim_tags.index('vec')] + from dune.codegen.loopy.target import dtype_floatingpoint + dtype = np.dtype(kwargs.get("dtype", dtype_floatingpoint())) + alignment = dtype.itemsize * vec_size + + # Add this buffer as global argument to the kernel. Add a shape to make + # benchmark generation for autotuning of loopy kernels easier. Since + # this buffer is used to store different data sizes we need to make + # sure it is big enough. 
+ assert sf + size = _max_sum_factorization_buffer_size(sf) + globalarg(bs, shape=(size,), alignment=alignment, dim_tags=['f', ]) + temporary_variable(name, managed=True, custom_base_storage=bs, @@ -193,7 +219,8 @@ def realize_sumfact_kernel_function(sf): if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible: input_summand = sf.interface.realize_direct_input(input_inames, inp_shape) elif l == 0: - input_summand = sf.interface.realize_input(input_inames, + input_summand = sf.interface.realize_input(sf, + input_inames, inp_shape, vec_iname, vec_shape, @@ -204,7 +231,8 @@ def realize_sumfact_kernel_function(sf): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = buffer.get_temporary("buff_step{}_in".format(l), + inp = buffer.get_temporary(sf, + "buff_step{}_in".format(l), shape=inp_shape + vec_shape, dim_tags=ftags, ) @@ -250,7 +278,8 @@ def realize_sumfact_kernel_function(sf): output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation) output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation) - out = buffer.get_temporary("buff_step{}_out".format(l), + out = buffer.get_temporary(sf, + "buff_step{}_out".format(l), shape=output_shape + vec_shape, dim_tags=ftags, ) @@ -265,7 +294,8 @@ def realize_sumfact_kernel_function(sf): else: output_shape = tuple(out_shape[1:]) + (out_shape[0],) - out = buffer.get_temporary("buff_step{}_out".format(l), + out = buffer.get_temporary(sf, + "buff_step{}_out".format(l), shape=output_shape + vec_shape, dim_tags=ftags, ) diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py index 60641b0ce4af4ddef525a4c03a438c4ebe277c0c..dfd9383f93c79d3c50bfcca24cdd717fcc16aa58 100644 --- a/python/dune/codegen/sumfact/symbolic.py +++ b/python/dune/codegen/sumfact/symbolic.py @@ -60,7 +60,7 @@ class 
SumfactKernelInterfaceBase(object): """ raise NotImplementedError - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): """Interpret the input of sumfact kernel function in the right way (non fastdgg) This happens inside the sumfact kernel function. @@ -73,6 +73,7 @@ class SumfactKernelInterfaceBase(object): Parameters ---------- + sf : SumfactKernel or VectorizedSumfactKernel inames : tuple of pymbolic.primitives.Variable Inames for accesing the input. Ordered according to permuted matrix sequence. shape : tuple of int @@ -252,11 +253,12 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): dep = dep.union(inp.setup_input(sf, dep, index=i)) return dep - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. - inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) @@ -358,7 +360,7 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return prim.Call(prim.Variable(hadd_function), (result,)) - def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): + def realize_input(self, sf, inames, shape, vec_iname, vec_shape, buf, ftags): # The input for stage 3 is quadrature permuted. The inames and shape # passed to this method are quadrature and cost permuted. This means we # need to take the cost permutation back to get the right inames and @@ -368,7 +370,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): # Get a temporary that interprets the base storage of the input as a # column-major matrix. 
- inp = buf.get_temporary("buff_step0_in", + inp = buf.get_temporary(sf, + "buff_step0_in", shape=shape + vec_shape, dim_tags=ftags, ) @@ -382,9 +385,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): def realize_direct_output(self, result, inames, shape, **args): outputs = set(self.interfaces) - # If multiple horizontal_add's are to be performed with 'result' - # we need to precompute the result! if len(outputs) > 1: + # Introduce substrule for the argument of the horizontal add substname = "haddsubst_{}".format("_".join([i.name for i in inames])) subst_rule(substname, (), result) result = prim.Call(prim.Variable(substname), ()) @@ -503,6 +505,9 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): the transformation brings the next reduction index in the fastest position. + In einsum notation from numpy this can be written as three contractions + of the form: 'ij,jkl->kli' + It can make sense to permute the order of directions. If you have a small m_l (e.g. stage 1 on faces) it is better to do direction l first. This can be done by: @@ -533,6 +538,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): other. interface: An SumfactKernelInterfaceBase instance describing the input (stage 1) or output (stage 3) of the kernel + """ # Assert the inputs! 
assert isinstance(matrix_sequence, tuple) @@ -828,6 +834,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) vertical_width=1, buffer=None, insn_dep=frozenset(), + transformations=(), ): # Assert the input data structure assert isinstance(kernels, tuple) @@ -880,8 +887,9 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) # @property def function_name(self): - return "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted), + name = "sfimpl_{}{}".format("_".join(str(m) for m in self.matrix_sequence_quadrature_permuted), self.interface.function_name_suffix) + return name @property def cache_key(self): diff --git a/python/dune/codegen/sumfact/tabulation.py b/python/dune/codegen/sumfact/tabulation.py index 9def97eb3ba4fdae280cc65e7f12ca73164d1146..e2e4a771ec9918f960e5ea88821aa7acb885a0c2 100644 --- a/python/dune/codegen/sumfact/tabulation.py +++ b/python/dune/codegen/sumfact/tabulation.py @@ -50,6 +50,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction=None, slice_size=None, slice_index=None, + additional_tabulation=None, ): """ Arguments: @@ -61,6 +62,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction: Direction corresponding to this matrix slice_size: Number of slices for this direction slice_index: To which slice does this belong + additional_tabulation: A factor to be multiplied with this basis tabulation matrix. 
""" assert(isinstance(basis_size, int)) ImmutableRecord.__init__(self, @@ -71,6 +73,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): direction=direction, slice_size=slice_size, slice_index=slice_index, + additional_tabulation=additional_tabulation, ) @property @@ -90,6 +93,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): if self.slice_size is not None: infos.append("s{}".format(self.slice_index)) + if self.additional_tabulation is not None: + infos.append("prod{}".format(self.additional_tabulation._shortname)) + return "".join(infos) def __str__(self): @@ -122,7 +128,9 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase, ImmutableRecord): def pymbolic(self, indices): name = str(self) define_theta(name, self) - return prim.Subscript(prim.Variable(name), indices) + ret = prim.Subscript(prim.Variable(name), indices) + + return ret @property def vectorized(self): @@ -461,22 +469,31 @@ def define_theta(name, tabmat, additional_indices=(), width=None): else: args.append(tabmat.face) + # Get right hand side of basis evaluation matrix assignment + expr = prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)) + + # Maybe multiply another matrix (needed for the very special case of assembling point diagonals) + if tabmat.additional_tabulation is not None: + expr = prim.Product((expr, prim.Call(PolynomialLookup(polynomials, tabmat.additional_tabulation.derivative), tuple(args)))) + instruction(assignee=prim.Subscript(prim.Variable(name), (i, j) + additional_indices), - expression=prim.Call(PolynomialLookup(polynomials, tabmat.derivative), tuple(args)), + expression=expr, kernel="operator", depends_on=dep, ) -def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None): +def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=None, facemod=None, basis_size=None, additional_sequence=None): dim = world_dimension() result = 
[None] * dim + if additional_sequence is None: + additional_sequence = [None] * dim quadrature_size = quadrature_points_per_direction() assert (basis_size is not None) if facedir is not None: quadrature_size = quadrature_size[:facedir] + (1,) + quadrature_size[facedir:] - for i in range(dim): + for i, add_seq in zip(range(dim), additional_sequence): onface = None if facedir == i: onface = facemod @@ -485,6 +502,7 @@ def construct_basis_matrix_sequence(transpose=False, derivative=None, facedir=No basis_size=basis_size[i], transpose=transpose, derivative=derivative == i, - face=onface) + face=onface, + additional_tabulation=add_seq) return tuple(result) diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py new file mode 100644 index 0000000000000000000000000000000000000000..a752d217add1013bbcb22771824cf992103b2237 --- /dev/null +++ b/python/dune/codegen/sumfact/transformations.py @@ -0,0 +1,422 @@ +import re +import logging + +import loopy as lp +import pymbolic.primitives as prim +import islpy as isl + +from dune.codegen.generation import (get_counted_variable, + get_global_context_value, + ) +from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions, remove_reduction +from dune.codegen.options import get_form_option, get_option +from dune.codegen.pdelab.geometry import world_dimension +from dune.codegen.error import CodegenAutotuneError +from dune.codegen.sumfact.autotune import autotune_realization + + +def _current_iname_order(current_inames, new_iname_order): + """Sort the inames for this contraction according to new_iname order""" + current_iname_order = [] + for i in new_iname_order: + for j in current_inames: + if i in j: + current_iname_order.append(j) + return current_iname_order + + +def _get_inames_of_reduction(instr, iname_permutation): + # Check which loops are inside the reduction loop + reduction_index = iname_permutation[-1] + within_reduction = tuple(i > 
reduction_index for i in iname_permutation) + + # Basename of inner inames + dim = world_dimension() + inner_inames_base = [] + for index, within in enumerate(within_reduction): + if within: + inner_inames_base.append('sf_out_inames_{}'.format(dim - 1 - index)) + + # Name of inner inames and outer inames + inner_inames = [] + outer_inames = [] + vec_inames = [] + for i in [iname for iname in instr.within_inames]: + for j in inner_inames_base: + if j in i: + inner_inames.append(i) + if 'sf_vec' in i: + vec_inames.append(i) + elif i not in inner_inames: + outer_inames.append(i) + + # Reduction iname + regex = re.compile('sf_red_([0-9]*)') + reduction_index = set(regex.findall(str(instr))) + if len(reduction_index) == 0: + reduction_iname = None + else: + assert len(reduction_index) == 1 + reduction_index = reduction_index.pop() + reduction_iname = 'sf_red_{}'.format(reduction_index) + + return outer_inames, reduction_iname, inner_inames, vec_inames + + +class FindReductionMapper(lp.symbolic.WalkMapper): + def __init__(self): + self.reductions = [] + + def map_reduction(self, expr, **kwargs): + self.reductions.append(expr) + return lp.symbolic.WalkMapper.map_reduction(self, expr, **kwargs) + + +def _reorder_reduction_loops(kernel, match, iname_order): + """Reorder loops in instructions containing one reduction + + In order to reorder the loops we need to manually do the accumulation in a + temporary that is large enough to fit all the data created in loops inside + the reduction loop. 
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    match:
+    iname_order: tuple of str
+        preferred loop order for this instruction
+    """
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        # Get reduction
+        reduction = FindReductionMapper()
+        reduction(instr.expression)
+        assert len(reduction.reductions) == 1
+        reduction = reduction.reductions[0]
+
+        # Reduction iname, inner inames and vector inames
+        assert len(reduction.inames) == 1
+        reduction_iname = reduction.inames[0]
+        assert set([reduction_iname]) == instr.reduction_inames()
+        if reduction_iname not in iname_order:
+            lp.prioritize_loops(kernel, iname_order)
+            continue
+        inner_inames = iname_order[iname_order.index(reduction_iname) + 1:]
+        if len(inner_inames) == 0:
+            lp.prioritize_loops(kernel, iname_order)
+            continue
+        # TODO: There should be a better way to do that
+        vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
+        assert len(vec_inames) < 2
+
+        # {{{ Create new temporary variable
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        vectorized = len(vec_inames) > 0
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
+
+        # Update temporary_variables of this kernel
+        from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
+        from dune.codegen.loopy.target import dtype_floatingpoint
+        dtype = lp.types.NumpyType(dtype_floatingpoint())
+        var = {accum_variable: DuneTemporaryVariable(accum_variable,
+                                                     dtype=dtype,
+                                                     shape=shape,
+                                                     dim_tags=dim_tags,
+                                                     managed=True)}
+        tv = kernel.temporary_variables.copy()
+        tv.update(var)
+        kernel = kernel.copy(temporary_variables=tv)
+
+        # }}}
+
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames 
+ (prim.Variable(vec_inames[0]),) + assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames) + accum_init_id = instr.id + '_accum_init' + accum_init_instr = lp.Assignment(assignee, + 0, + within_inames=instr.within_inames, + id=accum_init_id, + depends_on=instr.depends_on, + tags=('accum_init',), + ) + kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr]) + + # Accumulate in temporary variable + assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames) + expression = prim.Sum((assignee, reduction.expr)) + within_inames = frozenset(tuple(instr.within_inames) + reduction.inames) + accum_id = instr.id + '_accum' + accum_instr = lp.Assignment(assignee, + expression, + within_inames=within_inames, + id=accum_id, + depends_on=frozenset([accum_init_id]), + tags=('accum',), + ) + kernel = kernel.copy(instructions=kernel.instructions + [accum_instr]) + + # Replace reduction in insn with accumulation result + def map_reduction(expr, rec): + return assignee + mapper = lp.symbolic.ReductionCallbackMapper(map_reduction) + new_expression = mapper(instr.expression) + assign_id = instr.id + '_assign' + new_instr = instr.copy(expression=new_expression, + id=assign_id, + depends_on=frozenset(list(instr.depends_on) + [accum_id])) + kernel = kernel.copy(instructions=kernel.instructions + [new_instr]) + + # Fix dependencies and remove old instruction + for i in kernel.instructions: + if instr.id in i.depends_on: + match = lp.match.Id(i.id) + kernel = lp.add_dependency(kernel, match, assign_id) + kernel = lp.remove_instructions(kernel, set([instr.id])) + + # Duplicate and reorder loops for accumulation + ids = [accum_init_id, accum_id] + duplicate_inames = tuple(inner_inames) + for idx in ids: + match = lp.match.Id(idx) + kernel = lp.duplicate_inames(kernel, duplicate_inames, match) + match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames) + current_iname_order = _current_iname_order(match_inames, 
iname_order) + kernel = lp.prioritize_loops(kernel, tuple(current_iname_order)) + + # Reorder assignment loops + kernel = lp.prioritize_loops(kernel, iname_order) + return kernel + + +def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation): + """Reorder the loop nests of a tensor contraction accumulating directly in the data structure""" + dim = world_dimension() + + # Nothing to do if permutation is identity + if iname_permutation == tuple(range(dim + 1)): + return kernel + + # Use names used in sum factorization kernel (without the index that distinguishes the different directions) + default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red'] + from dune.codegen.sumfact.permutation import permute_backward + new_iname_order = permute_backward(default_iname_order, iname_permutation) + + for instr in kernel.instructions: + # Inames used in this reduction + outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr, + iname_permutation) + if reduction_iname: + current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames + else: + current_inames = outer_inames + inner_inames + vec_inames + current_iname_order = _current_iname_order(current_inames, + new_iname_order) + + # We can directly use lp.prioritize_loops if + # - The reduction is the innermost loop + # - There is no reduction (eg reduced direction on faces) + if iname_permutation[-1] == dim or reduction_iname is None: + kernel = lp.prioritize_loops(kernel, tuple(current_iname_order)) + continue + + if isinstance(instr.expression, lp.symbolic.Reduction): + # If the instruction is a reduction we can directly accumulate in the assignee + assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple)) + match = lp.match.Id(instr.id) + kernel = remove_reduction(kernel, match) + + lp.prioritize_loops(kernel, current_iname_order) + duplicate_inames = tuple(inner_inames) + match = 
lp.match.Id(instr.id + '_set_zero')
+            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
+            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
+            lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
+        else:
+            # In stage 3 this is usually not a reduction. In this case direct
+            # accumulation is not possible and we accumulate in a temporary
+            # variable instead
+            match = lp.match.Id(instr.id)
+            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
+    dim = world_dimension()
+
+    # Nothing to do if permutation is identity
+    if iname_permutation == tuple(range(dim + 1)):
+        return kernel
+
+    # Use names used in sum factorization kernel (without the index that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    for instr in kernel.instructions:
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                           iname_permutation)
+        if reduction_iname:
+            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
+        else:
+            current_inames = outer_inames + inner_inames + vec_inames
+        current_iname_order = _current_iname_order(current_inames, new_iname_order)
+
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (eg reduced direction on faces)
+        if iname_permutation[-1] == dim or reduction_iname is None:
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
+
+        match = 
lp.match.Id(instr.id) + kernel = _reorder_reduction_loops(kernel, match, current_iname_order) + + return kernel + + +def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True): + """Change loop order in tensor contractions of sum factorization kernel + + Since there is a reduction involved this implies more than just reordering + loops. There are two possibilities to handle the reduction: + + 1) Generate an object of correct size for the accumulation and assign to + the result data structure afterwards + + 2) Make sure to set the correct entries of the result data structure to + zero and accumulate inplace + + Parameters + ---------- + kernel: loopy.kernel.LoopKernel + iname_permutation: tuple of ints + How to permute the loop inames. Should contain some permutation of the + numbers 0 to dim+1 + accum_variable: bool + If True use method 1 else use method 2 from above + + Notes + ----- + Using einsum notation from numpy a contraction of a sum factorization + kernel can be written as: + + - 3D: 'ij,jkl->kli' + - 2D: 'ij,jk->ki' + + In the sum factorization kernel itself those inames are called: + + sf_out_inames_2_* : l + sf_out_inames_1_* : k + sf_out_inames_0_* : i + sf_red_* : j + + where * represents the current direction (0,1,2 for 3D problems). The + default order of the loops is + + (sf_out_iname_2_*, sf_out_iname_1_*, sf_out_iname_0_*, red_*) + + A permutation vector of (3,2,0,1) would mean that we do + + (sf_out_iname_0_*, red_*, sf_out_iname_1_*, sf_out_iname_2_*) + + instead. 
+ + """ + if accum_variable: + kernel = _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation) + return kernel + else: + kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation) + return kernel + + +def tensor_contraction_loop_order_generator(kernel): + yield kernel, ['None'] + + dim = world_dimension() + identity = range(dim + 1) + import itertools + for permutation in itertools.permutations(identity): + # Culling of stupid variant + if permutation[0] == dim: + continue + + new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=True) + yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(permutation)] + + new_kernel = reorder_loops_in_tensor_contraction(kernel, permutation, accum_variable=False) + yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(permutation)] + + +def simple_autotuner(kernel_generator, signature): + kernel, transformations = next(kernel_generator) + best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations) + best_kernel = kernel + best_transformations = transformations + for kernel, transformations in kernel_generator: + cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations) + if cost < best_cost: + best_cost = cost + best_kernel = kernel + best_transformations = transformations + return best_kernel, best_transformations + + +def autotune_tensor_contraction_loop_order(kernel, signature): + logger = logging.getLogger(__name__) + + assert get_option('autotune_google_benchmark') + from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add + kernel = match_fused_multiply_add(kernel) + generator = tensor_contraction_loop_order_generator(kernel) + kernel, transformations = simple_autotuner(generator, signature) + logger.debug('autotute_tensor_contraction_loop_order - kernel {} transformations {}'.format(kernel.name, transformations)) + 
return kernel + + +def sumfact_performance_transformations(kernel, signature): + if get_form_option('sumfact_performance_transformations'): + assert not get_form_option('sumfact_performance_transformations_testrun') + if kernel.name.startswith('sfimpl'): + kernel = autotune_tensor_contraction_loop_order(kernel, signature) + + # Testing performance transformations + elif get_form_option('sumfact_performance_transformations_testrun'): + assert not get_form_option('sumfact_performance_transformations') + if kernel.name.startswith('sfimpl'): + dim = world_dimension() + testcase = get_form_option('sumfact_performance_transformations_testrun') + if dim == 2: + testrun_dict = { + 1: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), True))], + 2: [(reorder_loops_in_tensor_contraction, ((0, 2, 1), False))], + 3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), True))], + 4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1), False))], + } + for trafo, arguments in testrun_dict[testcase]: + kernel = trafo(kernel, *arguments) + else: + testrun_dict = { + 1: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), True))], + 2: [(reorder_loops_in_tensor_contraction, ((3, 2, 0, 1), False))], + 3: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), True))], + 4: [(reorder_loops_in_tensor_contraction, ((2, 0, 1, 3), False))], + } + for trafo, arguments in testrun_dict[testcase]: + kernel = trafo(kernel, *arguments) + return kernel diff --git a/python/dune/codegen/ufl/finiteelement.py b/python/dune/codegen/ufl/finiteelement.py new file mode 100644 index 0000000000000000000000000000000000000000..ae0ef35f2a3b30cd2837525447233dfe017cd76d --- /dev/null +++ b/python/dune/codegen/ufl/finiteelement.py @@ -0,0 +1,43 @@ +""" Registering finite elements that we have, but FEniCS does not """ + +from ufl import register_element, L2 + + +register_element("Discontinuous Gauss-Lobatto-Legendre", + "DGLL", + 0, + L2, + "identity", + (0, None), + ('interval',), + ) + + 
+register_element("Monomials", + "Monom", + 0, + L2, + "identity", + (0, None), + ('interval',), + ) + + +register_element("L2-Orthonormal Polynomials", + "OPB", + 0, + L2, + "identity", + (0, None), + ('interval',), + ) + + +register_element("Rannacher-Turek", + "RaTu", + 0, + L2, + "identity", + (1, 1), + ('quadrilateral', 'hexahedron'), + ) diff --git a/python/dune/codegen/ufl/transformations/blockpreconditioner.py b/python/dune/codegen/ufl/transformations/blockpreconditioner.py index c992af12e0cae3499f6f6694027610dfd53c535e..b3fd11249e766bdc8abec4c0d01defbf5de9e59c 100644 --- a/python/dune/codegen/ufl/transformations/blockpreconditioner.py +++ b/python/dune/codegen/ufl/transformations/blockpreconditioner.py @@ -21,7 +21,7 @@ class OffDiagonalBlockSwitcher(MultiFunction): def positive_restricted(self, o): self.res = Restriction.POSITIVE ret = self(o.ufl_operands[0]) - self.rest = Restriction.NONE + self.res = Restriction.NONE if isinstance(ret, uc.Zero): return ret else: @@ -55,6 +55,8 @@ class OffDiagonalBlockSwitcher(MultiFunction): def list_restriction_tuples(diagonal): if diagonal: yield (Restriction.NONE, Restriction.NONE) + yield (Restriction.POSITIVE, Restriction.POSITIVE) + return res = (Restriction.POSITIVE, Restriction.NEGATIVE) amount = 1 if diagonal else 2 diff --git a/python/dune/codegen/ufl/visitor.py b/python/dune/codegen/ufl/visitor.py index ab7c334f323bbc51221d926d357466d6ebc8cf88..e774e03d90e8bc61a1108767867ab046bfa56c81 100644 --- a/python/dune/codegen/ufl/visitor.py +++ b/python/dune/codegen/ufl/visitor.py @@ -37,6 +37,7 @@ from ufl.classes import (Coefficient, JacobianDeterminant, ) +from pytools import product as ptproduct import pymbolic.primitives as prim import numpy as np @@ -278,7 +279,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): # def product(self, o): - return prim.flattened_product(tuple(self.call(op) for op in o.ufl_operands)) + ops = tuple(self.call(op) for op in o.ufl_operands) + if all(isinstance(op, (int, float)) for 
op in ops): + return ptproduct(ops) + return prim.flattened_product(ops) def float_value(self, o): return o.value() @@ -290,7 +294,10 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): return prim.quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1])) def sum(self, o): - return prim.flattened_sum(tuple(self.call(op) for op in o.ufl_operands)) + ops = tuple(self.call(op) for op in o.ufl_operands) + if all(isinstance(op, (int, float)) for op in ops): + return sum(ops) + return prim.flattened_sum(ops) def zero(self, o): # UFL has Zeroes with shape. We ignore those indices. diff --git a/python/setup.py b/python/setup.py index 88431ad1b496afc1c41d13fae5608956274f3018..d847cb90064e734efe22ea0646129fa8bb86be35 100644 --- a/python/setup.py +++ b/python/setup.py @@ -47,5 +47,6 @@ setup(name='dune.codegen', "console_scripts": [ "generate_operators = dune.codegen.compile:entry_generate_operators", "generate_driver = dune.codegen.compile:entry_generate_driver", + "show_options = dune.codegen.options:show_options", ] }) diff --git a/test/sumfact/CMakeLists.txt b/test/sumfact/CMakeLists.txt index 187aee212ed125a99bc82670fe34e680379542f7..61b81b76082cfa3c83b04d2632194d5a4a6e6447 100644 --- a/test/sumfact/CMakeLists.txt +++ b/test/sumfact/CMakeLists.txt @@ -2,3 +2,4 @@ add_subdirectory(hyperbolic) add_subdirectory(mass) add_subdirectory(poisson) add_subdirectory(stokes) +add_subdirectory(preconditioner) diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt index 5bb10fbcd303a916ce1ab353d7ecac8972f5dbe4..c81c94bc9fd536645149940d290846abce857715 100644 --- a/test/sumfact/poisson/CMakeLists.txt +++ b/test/sumfact/poisson/CMakeLists.txt @@ -134,3 +134,24 @@ if(benchmark_FOUND) INIFILE poisson_fastdg_volumes_3d_benchmark.mini ) endif() + + +#================================================ +# Poisson fastdg with performance transformations +#================================================ +dune_add_formcompiler_system_test(UFLFILE 
poisson_dg_2d.ufl + BASENAME sumfact_poisson_fastdg_2d_performance_transformations + INIFILE poisson_fastdg_2d_performance_transformations.mini + ) +dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl + BASENAME sumfact_poisson_fastdg_3d_performance_transformations + INIFILE poisson_fastdg_3d_performance_transformations.mini + ) +dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl + BASENAME sumfact_poisson_dg_2d_performance_transformations + INIFILE poisson_dg_2d_performance_transformations.mini + ) +dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl + BASENAME sumfact_poisson_dg_3d_performance_transformations + INIFILE poisson_dg_3d_performance_transformations.mini + ) diff --git a/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini new file mode 100644 index 0000000000000000000000000000000000000000..47b0e5269c9ebf62d03328e0f5370528a14ac9f6 --- /dev/null +++ b/test/sumfact/poisson/poisson_dg_2d_performance_transformations.mini @@ -0,0 +1,28 @@ +__name = sumfact_poisson_dg_2d_performance_transformations_{__exec_suffix} +__exec_suffix = {gradvec_suffix}_{testrun_suffix} + +gradvec_suffix = gradvec, nongradvec | expand gradvec +testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun} + +cells = 16 16 +extension = 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-4 +autotune_google_benchmark = 1 + +[formcompiler.r] +numerical_jacobian = 0 +sumfact = 1 +vectorization_quadloop = 1 +vectorization_strategy = explicit, none | expand gradvec +fastdg = 0 +geometry_mixins = sumfact_equidistant +sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun + +[formcompiler.ufl_variants] +degree = 2 \ No newline at end of file diff --git a/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini new file mode 100644 index 0000000000000000000000000000000000000000..0b6d75f5c1181bcfe9f5c7ab9c6a941cd9653251 --- /dev/null +++ b/test/sumfact/poisson/poisson_dg_3d_performance_transformations.mini @@ -0,0 +1,29 @@ +__name = sumfact_poisson_dg_3d_performance_transformations_{__exec_suffix} +__exec_suffix = {gradvec_suffix}_{testrun_suffix} + +gradvec_suffix = gradvec, nongradvec | expand gradvec +testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun} + +cells = 8 8 8 +extension = 1. 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-4 +autotune_google_benchmark = 1 + +[formcompiler.r] +numerical_jacobian = 0 +sumfact = 1 +vectorization_quadloop = 1 +# vectorization_strategy = explicit, none | expand gradvec +vectorization_strategy = model, none | expand gradvec +fastdg = 0 +sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun +geometry_mixins = sumfact_equidistant + +[formcompiler.ufl_variants] +degree = 2 diff --git a/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini new file mode 100644 index 0000000000000000000000000000000000000000..22461b9cc8bbafc6a9e15c2489bd35bf432bb5de --- /dev/null +++ b/test/sumfact/poisson/poisson_fastdg_2d_performance_transformations.mini @@ -0,0 +1,28 @@ +__name = sumfact_poisson_fastdg_2d_performance_transformations_{__exec_suffix} +__exec_suffix = {gradvec_suffix}_{testrun_suffix} + +gradvec_suffix = gradvec, nongradvec | expand gradvec +testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun} + +cells = 16 16 +extension = 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-4 +autotune_google_benchmark = 1 + +[formcompiler.r] +numerical_jacobian = 0 +sumfact = 1 +vectorization_quadloop = 1 +vectorization_strategy = model, none | expand gradvec +fastdg = 1 +sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun +geometry_mixins = sumfact_equidistant + +[formcompiler.ufl_variants] +degree = 2 \ No newline at end of file diff --git a/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini new file mode 100644 index 0000000000000000000000000000000000000000..0e9cfea80bab55167a277ce8e189761e9b780420 --- /dev/null +++ b/test/sumfact/poisson/poisson_fastdg_3d_performance_transformations.mini @@ -0,0 +1,28 @@ +__name = sumfact_poisson_fastdg_3d_performance_transformations_{__exec_suffix} +__exec_suffix = {gradvec_suffix}_{testrun_suffix} + +gradvec_suffix = gradvec, nongradvec | expand gradvec +testrun_suffix = testrun{formcompiler.r.sumfact_performance_transformations_testrun} + +cells = 8 8 8 +extension = 1. 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-4 +autotune_google_benchmark = 1 + +[formcompiler.r] +numerical_jacobian = 0 +sumfact = 1 +vectorization_quadloop = 1 +vectorization_strategy = explicit, none | expand gradvec +fastdg = 1 +geometry_mixins = sumfact_equidistant +sumfact_performance_transformations_testrun = 1, 2, 3, 4 | expand testrun + +[formcompiler.ufl_variants] +degree = 2 diff --git a/test/sumfact/preconditioner/CMakeLists.txt b/test/sumfact/preconditioner/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea77ba5dca71ce5afdff567be1f872dc0dc604e0 --- /dev/null +++ b/test/sumfact/preconditioner/CMakeLists.txt @@ -0,0 +1,11 @@ +dune_add_formcompiler_system_test(UFLFILE poisson_dg_2d.ufl + BASENAME sumfact_preconditioner_2d + INIFILE preconditioner_2d.mini + SOURCE test_preconditioner_2d.cc + ) + +dune_add_formcompiler_system_test(UFLFILE poisson_dg_3d.ufl + BASENAME sumfact_preconditioner_3d + INIFILE preconditioner_3d.mini + SOURCE test_preconditioner_3d.cc + ) diff --git a/test/sumfact/preconditioner/poisson_dg_2d.ufl b/test/sumfact/preconditioner/poisson_dg_2d.ufl new file mode 100644 index 0000000000000000000000000000000000000000..ff41f6164df46aa1909db03768aef4041011afae --- /dev/null +++ b/test/sumfact/preconditioner/poisson_dg_2d.ufl @@ -0,0 +1,38 @@ +cell = "quadrilateral" +dim = 2 +degree = 1 + +x = SpatialCoordinate(cell) +c = (0.5-x[0])**2 + (0.5-x[1])**2 +g = exp(-1.*c) +f = 2*(2.-2*c)*g + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +# penalty factor +alpha = 1.0 +h_ext = CellVolume(cell) / FacetArea(cell) +gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0 +theta = 1.0 + +r = inner(grad(u), 
grad(v))*dx \ + - f*v*dx \ + - inner(n, avg(grad(u)))*jump(v)*dS \ + + gamma_int*jump(u)*jump(v)*dS \ + + theta*jump(u)*inner(avg(grad(v)), n)*dS \ + - inner(n, grad(u))*v*ds \ + + gamma_ext*u*v*ds \ + + theta*u*inner(grad(v), n)*ds \ + - gamma_ext*g*v*ds \ + - theta*g*inner(grad(v), n)*ds + +exact_solution = g diff --git a/test/sumfact/preconditioner/poisson_dg_3d.ufl b/test/sumfact/preconditioner/poisson_dg_3d.ufl new file mode 100644 index 0000000000000000000000000000000000000000..cc6594573762ee12e1b11206eeba07a4bd038726 --- /dev/null +++ b/test/sumfact/preconditioner/poisson_dg_3d.ufl @@ -0,0 +1,38 @@ +cell = hexahedron +dim = 3 +degree = 1 + +x = SpatialCoordinate(cell) +c = (0.5-x[0])**2 + (0.5-x[1])**2 + (0.5-x[2])**2 +g = exp(-1.*c) +f = 2*(3.-2*c)*g + +V = FiniteElement("DG", cell, degree) + +u = TrialFunction(V) +v = TestFunction(V) + +n = FacetNormal(cell)('+') + +# penalty factor +alpha = 1.0 +h_ext = CellVolume(cell) / FacetArea(cell) +gamma_ext = (alpha * degree * (degree + dim - 1)) / h_ext +h_int = Min(CellVolume(cell)('+'), CellVolume(cell)('-')) / FacetArea(cell) +gamma_int = (alpha * degree * (degree + dim - 1)) / h_int + +# SIPG: -1.0, IIPG: 0.0, NIPG: 1.0 +theta = 1.0 + +r = inner(grad(u), grad(v))*dx \ + - f*v*dx \ + - inner(n, avg(grad(u)))*jump(v)*dS \ + + gamma_int*jump(u)*jump(v)*dS \ + + theta*jump(u)*inner(avg(grad(v)), n)*dS \ + - inner(n, grad(u))*v*ds \ + + gamma_ext*u*v*ds \ + + theta*u*inner(grad(v), n)*ds \ + - gamma_ext*g*v*ds \ + - theta*g*inner(grad(v), n)*ds + +exact_solution = g diff --git a/test/sumfact/preconditioner/preconditioner_2d.mini b/test/sumfact/preconditioner/preconditioner_2d.mini new file mode 100644 index 0000000000000000000000000000000000000000..adcf214dde7018a9c5aa093cff89c51cf67d6a3a --- /dev/null +++ b/test/sumfact/preconditioner/preconditioner_2d.mini @@ -0,0 +1,46 @@ +__name = preconditioner_2d +__exec_suffix = exec + +cells = 2 2 +extension = 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +operators = r, blockdiag, blockoffdiag, pointdiag + +[formcompiler.r] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +classname = FullOperator +filename = full_2d_operator.hh + +[formcompiler.blockdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_diagonal = 1 +form = r +classname = BlockDiagonalOperator +filename = block_diagonal_2d_operator.hh + +[formcompiler.blockoffdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_offdiagonal = 1 +form = r +classname = BlockOffDiagonalOperator +filename = block_offdiagonal_2d_operator.hh + +[formcompiler.pointdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_pointdiagonal = 1 +form = r +classname = PointDiagonalOperator +filename = point_diagonal_2d_operator.hh diff --git a/test/sumfact/preconditioner/preconditioner_3d.mini b/test/sumfact/preconditioner/preconditioner_3d.mini new file mode 100644 index 0000000000000000000000000000000000000000..304d6e28f0e6f152d6327929984c0121dd55e978 --- /dev/null +++ b/test/sumfact/preconditioner/preconditioner_3d.mini @@ -0,0 +1,46 @@ +__name = preconditioner_3d +__exec_suffix = exec + +cells = 2 2 2 +extension = 1. 1. 1. 
+ +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +operators = r, blockdiag, blockoffdiag, pointdiag + +[formcompiler.r] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +classname = FullOperator +filename = full_3d_operator.hh + +[formcompiler.blockdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_diagonal = 1 +form = r +classname = BlockDiagonalOperator +filename = block_diagonal_3d_operator.hh + +[formcompiler.blockoffdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_offdiagonal = 1 +form = r +classname = BlockOffDiagonalOperator +filename = block_offdiagonal_3d_operator.hh + +[formcompiler.pointdiag] +sumfact = 1 +fastdg = 1 +geometry_mixins = sumfact_equidistant +block_preconditioner_pointdiagonal = 1 +form = r +classname = PointDiagonalOperator +filename = point_diagonal_3d_operator.hh diff --git a/test/sumfact/preconditioner/test_preconditioner_2d.cc b/test/sumfact/preconditioner/test_preconditioner_2d.cc new file mode 100644 index 0000000000000000000000000000000000000000..2781b32399f800d1d6334565aefbede360fdbfa9 --- /dev/null +++ b/test/sumfact/preconditioner/test_preconditioner_2d.cc @@ -0,0 +1,129 @@ +#include "config.h" + +#include "dune/common/parallel/mpihelper.hh" +#include "dune/pdelab/stationary/linearproblem.hh" +#include "dune/pdelab/backend/istl.hh" +#include "dune/grid/yaspgrid.hh" +#include "dune/pdelab/finiteelementmap/qkdg.hh" +#include "dune/pdelab/gridoperator/fastdg.hh" +#include "dune/testtools/gridconstruction.hh" +#include "dune/common/parametertree.hh" +#include "dune/common/parametertreeparser.hh" +#include <random> +#include "dune/pdelab/gridfunctionspace/vtk.hh" +#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh" +#include "string" +#include "dune/codegen/vtkpredicate.hh" + +// Include all the generated operators +#include "full_2d_operator.hh" +#include "block_diagonal_2d_operator.hh" +#include 
"block_offdiagonal_2d_operator.hh" +#include "point_diagonal_2d_operator.hh" + +int main(int argc, char** argv){ + try + { + // Initialize basic stuff... + Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv); + using RangeType = double; + Dune::ParameterTree initree; + Dune::ParameterTreeParser::readINITree(argv[1], initree); + + // Setup grid (view)... + using Grid = Dune::YaspGrid<2, Dune::EquidistantCoordinates<RangeType, 2>>; + using GV = Grid::LeafGridView; + using DF = Grid::ctype; + IniGridFactory<Grid> factory(initree); + std::shared_ptr<Grid> grid = factory.getGrid(); + GV gv = grid->leafGridView(); + + // Set up finite element maps... + using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 2>; + DG2_FEM dg2_fem; + + // Set up grid function spaces... + using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>; + using NoConstraintsAssembler = Dune::PDELab::NoConstraints; + using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>; + DG2_GFS dg2_gfs_(gv, dg2_fem); + dg2_gfs_.name("dg2_gfs_"); + + // Set up constraints container... + using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type; + DG2_GFS_CC dg2_gfs__cc; + dg2_gfs__cc.clear(); + Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc); + + // Set up grid grid operators... 
+ using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>; + using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>; + using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree); + dg2_gfs_.update(); + MatrixBackend mb(5); + FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb); + + // Additional grid operators for preconditioner + using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>; + using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree); + BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb); + + using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>; + using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree); + BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb); + + using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>; + using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>; + PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree); + PDGO pdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb); + + // Set up solution vectors... + using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>; + V_R x(dg2_gfs_, 0.0); + + // Testing! 
+ + // Assemble all those matrices + using Dune::PDELab::Backend::native; + using M = typename FullGO::Traits::Jacobian; + M m(fullgo); + fullgo.jacobian(x, m); + Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1); + + using BDM = typename BDGO::Traits::Jacobian; + BDM bdm(bdgo); + bdgo.jacobian(x, bdm); + Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1); + + using BODM = typename BODGO::Traits::Jacobian; + BODM bodm(bodgo); + bodgo.jacobian(x, bodm); + Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1); + + V_R pd(dg2_gfs_, 0.0); + pdgo.residual(x, pd); + Dune::printvector(std::cout, native(pd), "point diagonal vector", "row"); + + // test failure boolean + bool testfail(false); + + // TODO: Properly test this stuff given the above matrices. + // Right now, visuals need to suffice. + + // Return statement... + return testfail; + + } + catch (Dune::Exception& e) + { std::cerr << "Dune reported error: " << e << std::endl; + return 1; + } + catch (std::exception& e) + { std::cerr << "Unknown exception thrown!" 
<< std::endl; + return 1; + } +} + diff --git a/test/sumfact/preconditioner/test_preconditioner_3d.cc b/test/sumfact/preconditioner/test_preconditioner_3d.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec84a1efbf7e19f46d8531622374f2902a3139eb --- /dev/null +++ b/test/sumfact/preconditioner/test_preconditioner_3d.cc @@ -0,0 +1,129 @@ +#include "config.h" + +#include "dune/common/parallel/mpihelper.hh" +#include "dune/pdelab/stationary/linearproblem.hh" +#include "dune/pdelab/backend/istl.hh" +#include "dune/grid/yaspgrid.hh" +#include "dune/pdelab/finiteelementmap/qkdg.hh" +#include "dune/pdelab/gridoperator/fastdg.hh" +#include "dune/testtools/gridconstruction.hh" +#include "dune/common/parametertree.hh" +#include "dune/common/parametertreeparser.hh" +#include <random> +#include "dune/pdelab/gridfunctionspace/vtk.hh" +#include "dune/grid/io/file/vtk/subsamplingvtkwriter.hh" +#include "string" +#include "dune/codegen/vtkpredicate.hh" + +// Include all the generated operators +#include "full_3d_operator.hh" +#include "block_diagonal_3d_operator.hh" +#include "block_offdiagonal_3d_operator.hh" +#include "point_diagonal_3d_operator.hh" + +int main(int argc, char** argv){ + try + { + // Initialize basic stuff... + Dune::MPIHelper& mpihelper = Dune::MPIHelper::instance(argc, argv); + using RangeType = double; + Dune::ParameterTree initree; + Dune::ParameterTreeParser::readINITree(argv[1], initree); + + // Setup grid (view)... + using Grid = Dune::YaspGrid<3, Dune::EquidistantCoordinates<RangeType, 3>>; + using GV = Grid::LeafGridView; + using DF = Grid::ctype; + IniGridFactory<Grid> factory(initree); + std::shared_ptr<Grid> grid = factory.getGrid(); + GV gv = grid->leafGridView(); + + // Set up finite element maps... + using DG2_FEM = Dune::PDELab::QkDGLocalFiniteElementMap<DF, RangeType, 1, 3>; + DG2_FEM dg2_fem; + + // Set up grid function spaces... 
+    using VectorBackendDG2 = Dune::PDELab::ISTL::VectorBackend<Dune::PDELab::ISTL::Blocking::fixed>;
+    using NoConstraintsAssembler = Dune::PDELab::NoConstraints;
+    using DG2_GFS = Dune::PDELab::GridFunctionSpace<GV, DG2_FEM, NoConstraintsAssembler, VectorBackendDG2>;
+    DG2_GFS dg2_gfs_(gv, dg2_fem);
+    dg2_gfs_.name("dg2_gfs_");
+
+    // Set up constraints container...
+    using DG2_GFS_CC = DG2_GFS::ConstraintsContainer<RangeType>::Type;
+    DG2_GFS_CC dg2_gfs__cc;
+    dg2_gfs__cc.clear();
+    Dune::PDELab::constraints(dg2_gfs_, dg2_gfs__cc);
+
+    // Set up grid operators...
+    using FullLOP = FullOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using MatrixBackend = Dune::PDELab::ISTL::BCRSMatrixBackend<>;
+    using FullGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, FullLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    FullLOP fulllop(dg2_gfs_, dg2_gfs_, initree);
+    dg2_gfs_.update();
+    MatrixBackend mb(5);
+    FullGO fullgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, fulllop, mb);
+
+    // Additional grid operators for preconditioner
+    using BDLOP = BlockDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BDLOP bdlop(dg2_gfs_, dg2_gfs_, initree);
+    BDGO bdgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bdlop, mb);
+
+    using BODLOP = BlockOffDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using BODGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, BODLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    BODLOP bodlop(dg2_gfs_, dg2_gfs_, initree);
+    BODGO bodgo(dg2_gfs_, dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, bodlop, mb);
+
+    using PDLOP = PointDiagonalOperator<DG2_GFS, DG2_GFS, RangeType>;
+    using PDGO = Dune::PDELab::FastDGGridOperator<DG2_GFS, DG2_GFS, PDLOP, MatrixBackend, DF, RangeType, RangeType, DG2_GFS_CC, DG2_GFS_CC>;
+    PDLOP pdlop(dg2_gfs_, dg2_gfs_, initree);
+    PDGO pdgo(dg2_gfs_,
dg2_gfs__cc, dg2_gfs_, dg2_gfs__cc, pdlop, mb); + + // Set up solution vectors... + using V_R = Dune::PDELab::Backend::Vector<DG2_GFS,DF>; + V_R x(dg2_gfs_, 0.0); + + // Testing! + + // Assemble all those matrices + using Dune::PDELab::Backend::native; + using M = typename FullGO::Traits::Jacobian; + M m(fullgo); + fullgo.jacobian(x, m); + Dune::printmatrix(std::cout, native(m),"full matrix","row",9,1); + + using BDM = typename BDGO::Traits::Jacobian; + BDM bdm(bdgo); + bdgo.jacobian(x, bdm); + Dune::printmatrix(std::cout, native(bdm),"blockdiagonal matrix","row",9,1); + + using BODM = typename BODGO::Traits::Jacobian; + BODM bodm(bodgo); + bodgo.jacobian(x, bodm); + Dune::printmatrix(std::cout, native(bodm),"blockoffdiagonal matrix","row",9,1); + + V_R pd(dg2_gfs_, 0.0); + pdgo.residual(x, pd); + Dune::printvector(std::cout, native(pd), "point diagonal vector", "row"); + + // test failure boolean + bool testfail(false); + + // TODO: Properly test this stuff given the above matrices. + // Right now, visuals need to suffice. + + // Return statement... + return testfail; + + } + catch (Dune::Exception& e) + { std::cerr << "Dune reported error: " << e << std::endl; + return 1; + } + catch (std::exception& e) + { std::cerr << "Unknown exception thrown!" << std::endl; + return 1; + } +} +