diff --git a/patches/apply_patches.sh b/patches/apply_patches.sh index 9bb1d2e2b98930176835ff7bb7fbba3dfb089d11..f95f1f6a28289d3ef0b62fdef5e818d5b87530c4 100755 --- a/patches/apply_patches.sh +++ b/patches/apply_patches.sh @@ -8,3 +8,7 @@ popd pushd dune/perftool/vectorclass git apply ../../../patches/vectorclass/0001-Better-implementation-of-horizontal_add.patch popd + +pushd python/ufl +git apply ../../patches/ufl/conditional-uflid.patch +popd diff --git a/patches/ufl/conditional-uflid.patch b/patches/ufl/conditional-uflid.patch new file mode 100644 index 0000000000000000000000000000000000000000..b469437ed6507bca7ec4ed4b56fa8578804a9809 --- /dev/null +++ b/patches/ufl/conditional-uflid.patch @@ -0,0 +1,34 @@ +diff --git a/ufl/conditional.py b/ufl/conditional.py +index 352624c..ebd647f 100644 +--- a/ufl/conditional.py ++++ b/ufl/conditional.py +@@ -27,6 +27,7 @@ from ufl.constantvalue import as_ufl + from ufl.precedence import parstr + from ufl.exprequals import expr_equals + from ufl.checks import is_true_ufl_scalar ++from ufl.core.ufl_id import attach_ufl_id + + # --- Condition classes --- + +@@ -221,10 +222,11 @@ class NotCondition(Condition): + + @ufl_type(num_ops=3, inherit_shape_from_operand=1, + inherit_indices_from_operand=1) ++@attach_ufl_id + class Conditional(Operator): +- __slots__ = () ++ __slots__ = ("_ufl_id") + +- def __init__(self, condition, true_value, false_value): ++ def __init__(self, condition, true_value, false_value, ufl_id=None): + if not isinstance(condition, Condition): + error("Expectiong condition as first argument.") + true_value = as_ufl(true_value) +@@ -244,6 +246,7 @@ class Conditional(Operator): + condition.ufl_operands[1].ufl_free_indices == ())): + error("Non-scalar == or != is not allowed.") + ++ self._ufl_id = self._init_ufl_id(ufl_id) + Operator.__init__(self, (condition, true_value, false_value)) + + def evaluate(self, x, mapping, component, index_values): diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 113e3b76ab040bf937006a8c4509cc65414721c5..96b7923a2917aeeddca359733b5dceed91b9ca75 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -130,6 +130,8 @@ class DuneCExpressionToCodeMapper(CExpressionToCodeMapper): self.rec(expr.shift, PREC_SHIFT)), enclosing_prec, PREC_SHIFT) + map_tagged_variable = CExpressionToCodeMapper.map_variable + class DuneASTBuilder(CASTBuilder): def function_manglers(self): diff --git a/python/dune/perftool/pdelab/__init__.py b/python/dune/perftool/pdelab/__init__.py index fbe00bb36a49a7fd3b336b26821b5616a8c2cec6..9396782c60ede97a6173d1fa1b8d5e8f33f9e233 100644 --- a/python/dune/perftool/pdelab/__init__.py +++ b/python/dune/perftool/pdelab/__init__.py @@ -42,6 +42,19 @@ class PDELabInterface(object): # The visitor instance will be registered by its init method self.visitor = None + # Accumulation interfaces + def get_accumulation_info(self, expr, visitor): + from dune.perftool.pdelab.localoperator import get_accumulation_info + return get_accumulation_info(expr, visitor) + + def list_accumulation_infos(self, expr, visitor): + from dune.perftool.pdelab.localoperator import list_accumulation_infos + return list_accumulation_infos(expr, visitor) + + def generate_accumulation_instruction(self, expr, visitor): + from dune.perftool.pdelab.localoperator import generate_accumulation_instruction + return generate_accumulation_instruction(expr, visitor) + # # TODO: The following ones are actually entirely PDELab independent! # They should be placed elsewhere and be used directly in the visitor. @@ -60,6 +73,10 @@ class PDELabInterface(object): def lfs_inames(self, element, restriction, number=None, context=''): return lfs_inames(element, restriction, number, context) + def initialize_function_spaces(self, expr, visitor): + from dune.perftool.pdelab.spaces import initialize_function_spaces + return initialize_function_spaces(expr, visitor) + # # Test and trial function related generator functions # @@ -72,17 +89,17 @@ class PDELabInterface(object): def pymbolic_reference_gradient(self, element, restriction, number, context=''): return pymbolic_reference_gradient(element, restriction, number, context) - def pymbolic_trialfunction_gradient(self, element, restriction, tree_path): - return pymbolic_trialfunction_gradient(self.visitor, element, restriction, tree_path) + def pymbolic_trialfunction_gradient(self, element, restriction, index): + return pymbolic_trialfunction_gradient(element, restriction, index) - def pymbolic_apply_function_gradient(self, element, restriction, tree_path): - return pymbolic_apply_function_gradient(self.visitor, element, restriction, tree_path) + def pymbolic_apply_function_gradient(self, element, restriction, index): + return pymbolic_apply_function_gradient(element, restriction, index) - def pymbolic_trialfunction(self, element, restriction, tree_path): - return pymbolic_trialfunction(self.visitor, element, restriction, tree_path) + def pymbolic_trialfunction(self, element, restriction, index): + return pymbolic_trialfunction(element, restriction, index) - def pymbolic_apply_function(self, element, restriction, tree_path): - return pymbolic_apply_function(self.visitor, element, restriction, tree_path) + def pymbolic_apply_function(self, element, restriction, index): + return pymbolic_apply_function(element, restriction, index) # # Parameter function related generator functions diff --git a/python/dune/perftool/pdelab/argument.py b/python/dune/perftool/pdelab/argument.py index 149dc47ca9461d7950c9216bed21b1e9ee437f3b..5adc0eb48f2ba72a2c29b59ee7a66a046ed78edc 100644 --- a/python/dune/perftool/pdelab/argument.py +++ b/python/dune/perftool/pdelab/argument.py @@ -90,35 +90,35 @@ def accumulation_mangler(target, func, dtypes): ) -def pymbolic_trialfunction_gradient(visitor, element, restriction, tree_path): - rawname = "gradu" + "_".join(str(c) for c in tree_path) +def pymbolic_trialfunction_gradient(element, restriction, index): + rawname = "gradu_{}".format(index) name = restricted_name(rawname, restriction) container = name_coefficientcontainer(restriction) - evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path) + evaluate_coefficient_gradient(element, name, container, restriction, index) return Variable(name) -def pymbolic_trialfunction(visitor, element, restriction, tree_path): - rawname = "u" + "_".join(str(c) for c in tree_path) +def pymbolic_trialfunction(element, restriction, index): + rawname = "u_{}".format(index) name = restricted_name(rawname, restriction) container = name_coefficientcontainer(restriction) - evaluate_coefficient(visitor, element, name, container, restriction, tree_path) + evaluate_coefficient(element, name, container, restriction, index) return Variable(name) -def pymbolic_apply_function_gradient(visitor, element, restriction, tree_path): - rawname = "gradz_func" + "_".join(str(c) for c in tree_path) +def pymbolic_apply_function_gradient(element, restriction, index): + rawname = "gradz_func_{}".format(index) name = restricted_name(rawname, restriction) container = name_applycontainer(restriction) - evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path) + evaluate_coefficient_gradient(element, name, container, restriction, index) return Variable(name) -def pymbolic_apply_function(visitor, element, restriction, tree_path): - rawname = "z_func" + "_".join(str(c) for c in tree_path) +def pymbolic_apply_function(element, restriction, index): + rawname = "z_func_{}".format(index) name = restricted_name(rawname, restriction) container = name_applycontainer(restriction) - evaluate_coefficient(visitor, element, name, container, restriction, tree_path) + evaluate_coefficient(element, name, container, restriction, index) return Variable(name) diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py index f50367d64fc8ffd550345cee71cf9504bf32f110..9d58603aea4b87e909401165c21fb7979bb5ed26 100644 --- a/python/dune/perftool/pdelab/basis.py +++ b/python/dune/perftool/pdelab/basis.py @@ -11,16 +11,22 @@ from dune.perftool.generation import (backend, from dune.perftool.options import (option_switch, get_option ) -from dune.perftool.pdelab.spaces import (lfs_child, +from dune.perftool.pdelab.spaces import (lfs_iname, + lfs_inames, name_leaf_lfs, name_lfs, name_lfs_bound, type_gfs, - lfs_inames + type_leaf_gfs, ) from dune.perftool.pdelab.geometry import (component_iname, world_dimension, + name_jacobian_inverse_transposed, + to_cell_coordinates, ) +from dune.perftool.pdelab.localoperator import (lop_template_ansatz_gfs, + lop_template_test_gfs, + ) from dune.perftool.tools import (get_pymbolic_basename, get_pymbolic_indices, ) @@ -36,7 +42,7 @@ from loopy import Reduction @backend(interface="typedef_localbasis") @class_member(classtag="operator") def typedef_localbasis(element, name): - basis_type = "{}::Traits::FiniteElementMap::Traits::FiniteElementType::Traits::LocalBasisType".format(type_gfs(element)) + basis_type = "{}::Traits::FiniteElementMap::Traits::FiniteElementType::Traits::LocalBasisType".format(type_leaf_gfs(element)) return "using {} = typename {};".format(name, basis_type) @@ -105,9 +111,9 @@ def pymbolic_basis(leaf_element, restriction, number, context=''): name = "phi_{}".format(FEM_name_mangling(leaf_element)) name = restricted_name(name, restriction) evaluate_basis(leaf_element, name, restriction) - iname = lfs_inames(leaf_element, restriction, number, context=context)[0] + iname, = lfs_inames(leaf_element, restriction, number, context=context) - return Subscript(Variable(name), (Variable(iname), )) + return Subscript(Variable(name), (Variable(iname),)) @backend(interface="evaluate_grad") @@ -133,7 +139,7 @@ def pymbolic_reference_gradient(leaf_element, restriction, number, context=''): name = "js_{}".format(FEM_name_mangling(leaf_element)) name = restricted_name(name, restriction) evaluate_reference_gradient(leaf_element, name, restriction) - iname = lfs_inames(leaf_element, restriction, number, context=context)[0] + iname, = lfs_inames(leaf_element, restriction, number, context=context) return Subscript(Variable(name), (Variable(iname), 0)) @@ -148,86 +154,73 @@ def shape_as_pymbolic(shape): @kernel_cached -def evaluate_coefficient(visitor, element, name, container, restriction, tree_path): - from ufl.functionview import select_subelement - sub_element = select_subelement(element, tree_path) +def evaluate_coefficient(element, name, container, restriction, index): + sub_element = element + if element.num_sub_elements() > 0: + sub_element = element.extract_component(index)[1] - # Determine the rank of the trialfunction tensor - rank = len(sub_element.value_shape()) + from ufl import FiniteElement + assert isinstance(sub_element, FiniteElement) - shape = sub_element.value_shape() - shape_impl = ('arr', ) * rank - idims = tuple(component_iname(count=i) for i in range(rank)) - leaf_element = sub_element - from ufl import VectorElement, TensorElement - if isinstance(sub_element, (VectorElement, TensorElement)): - leaf_element = sub_element.sub_elements()[0] + temporary_variable(name, shape=(),) - temporary_variable(name, shape=shape, shape_impl=shape_impl) - lfs = name_lfs(element, restriction, tree_path) - basis = visitor.interface.pymbolic_basis(leaf_element, restriction, 0, context='trial') - index, = get_pymbolic_indices(basis) + lfs = name_lfs(element, restriction, index) + basis = pymbolic_basis(sub_element, restriction, 0, context='trial') + basisindex, = get_pymbolic_indices(basis) - if isinstance(sub_element, (VectorElement, TensorElement)): - lfs = lfs_child(lfs, idims, shape=shape_as_pymbolic(shape), symmetry=element.symmetry()) if get_option("blockstructured"): from dune.perftool.blockstructured.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, element, index) + coeff = pymbolic_coefficient(container, lfs, element, basisindex) else: from dune.perftool.pdelab.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, index) + coeff = pymbolic_coefficient(container, lfs, basisindex) - assignee = Subscript(Variable(name), tuple(Variable(i) for i in idims)) + assignee = Variable(name) reduction_expr = Product((coeff, basis)) - instruction(expression=Reduction("sum", index, reduction_expr, allow_simultaneous=True), + + instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True), assignee=assignee, - forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset(idims)), + forced_iname_deps=frozenset(get_backend("quad_inames")()), forced_iname_deps_is_final=True, ) @kernel_cached -def evaluate_coefficient_gradient(visitor, element, name, container, restriction, tree_path): - # First we determine the rank of the tensor we are talking about - from ufl.functionview import select_subelement - sub_element = select_subelement(element, tree_path) - rank = len(sub_element.value_shape()) + 1 - - # We do then set some variables accordingly - shape = sub_element.value_shape() + (element.cell().geometric_dimension(),) - shape_impl = ('arr',) * rank - - idims = tuple(component_iname(count=i) for i in range(rank)) - leaf_element = sub_element - from ufl import VectorElement, TensorElement - if isinstance(sub_element, (VectorElement, TensorElement)): - leaf_element = sub_element.sub_elements()[0] - - # and proceed to call the necessary generator functions - temporary_variable(name, shape=shape, shape_impl=shape_impl) - lfs = name_lfs(element, restriction, tree_path) - basis = visitor.interface.pymbolic_reference_gradient(leaf_element, restriction, 0, context='trialgrad') - index, _ = get_pymbolic_indices(basis) - - if isinstance(sub_element, (VectorElement, TensorElement)): - lfs = lfs_child(lfs, idims[:-1], shape=shape_as_pymbolic(shape[:-1]), symmetry=element.symmetry()) +def evaluate_coefficient_gradient(element, name, container, restriction, index): + sub_element = element + if element.num_sub_elements() > 0: + sub_element = element.extract_component(index)[1] + from ufl import FiniteElement + assert isinstance(sub_element, FiniteElement) + + temporary_variable(name, + shape=(element.cell().geometric_dimension(),), + shape_impl=("arr",), + ) + + dimindex = component_iname(count=0) + + lfs = name_lfs(element, restriction, index) + basis = pymbolic_reference_gradient(sub_element, restriction, 0, context='trialgrad') + basisindex, _ = get_pymbolic_indices(basis) + from dune.perftool.tools import maybe_wrap_subscript + basis = maybe_wrap_subscript(basis, Variable(dimindex)) if get_option("blockstructured"): from dune.perftool.blockstructured.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, element, index) + coeff = pymbolic_coefficient(container, lfs, element, basisindex) else: from dune.perftool.pdelab.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, index) + coeff = pymbolic_coefficient(container, lfs, basisindex) assignee = Subscript(Variable(name), tuple(Variable(i) for i in idims)) - reduction_expr = Product((coeff, Subscript(Variable(get_pymbolic_basename(basis)), basis.index + (Variable(idims[-1]),)))) + reduction_expr = Product((coeff, basis)) - instruction(expression=Reduction("sum", index, reduction_expr, allow_simultaneous=True), + instruction(expression=Reduction("sum", basisindex, reduction_expr, allow_simultaneous=True), assignee=assignee, - forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset(idims)), + forced_iname_deps=frozenset(get_backend("quad_inames")()).union(frozenset({dimindex})), forced_iname_deps_is_final=True, - tags=frozenset({"quad"}) ) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index ecf578882a001960830e0dad814b5613b01e71e0..6c54752e0e2a1e6067a36c91076e6f1bad45d96b 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -34,7 +34,7 @@ import dune.perftool.loopy.mangler from pymbolic.primitives import Variable import pymbolic.primitives as prim -from pytools import Record +from pytools import Record, ImmutableRecord import ufl.classes as uc import loopy as lp @@ -212,44 +212,33 @@ class AccumulationSpace(Record): return (self.restriction,) -def determine_accumulation_space(expr, number, measure, idims=None): - from dune.perftool.ufl.modified_terminals import extract_modified_arguments - args = extract_modified_arguments(expr, argnumber=number) - - if measure == 'exterior_facet': - for ma in args: - ma.restriction = Restriction.NEGATIVE - - # If this is a residual term we return a dummy object - if len(args) == 0: +# TODO maybe move this onto the visitor as a helper function? +def determine_accumulation_space(info, number): + if info is None: return AccumulationSpace() ma = next(iter(args)) - # Extract information on the finite element - from ufl.functionview import select_subelement - subel = select_subelement(ma.argexpr.ufl_element(), ma.tree_path) + assert info is not None + element = info.element + subel = element + from ufl import MixedElement + if isinstance(element, MixedElement): + subel = element.extract_component(info.element_index)[1] # And generate a local function space for it! - from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_child - lfs = name_lfs(ma.argexpr.ufl_element(), ma.restriction, ma.tree_path) + from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_iname + lfs = name_lfs(element, info.restriction, info.element_index) from dune.perftool.generation import valuearg from loopy.types import NumpyType valuearg(lfs, dtype=NumpyType("str")) - if len(subel.value_shape()) != 0: - from dune.perftool.pdelab.geometry import component_iname - if idims is None: - idims = tuple(component_iname(context='arg', count=i) for i in range(len(subel.value_shape()))) - lfs = lfs_child(lfs, idims, shape=subel.value_shape(), symmetry=subel.symmetry()) - subel = subel.sub_elements()[0] - if get_option("blockstructured"): from dune.perftool.blockstructured.tools import micro_index_to_macro_index from dune.perftool.blockstructured.spaces import lfs_inames lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, ma.restriction, count=number)[0]) else: from dune.perftool.pdelab.spaces import lfs_inames - lfsi = Variable(lfs_inames(subel, ma.restriction, count=number)[0]) + lfsi = Variable(lfs_iname(subel, info.restriction, count=number)) # If the LFS is not yet a pymbolic expression, make it one from pymbolic.primitives import Expression @@ -258,7 +247,7 @@ def determine_accumulation_space(expr, number, measure, idims=None): return AccumulationSpace(lfs=lfs, index=lfsi, - restriction=ma.restriction, + restriction=info.restriction, ) @@ -308,6 +297,82 @@ def boundary_predicates(expr, measure, subdomain_id, visitor): return predicates +class PDELabAccumulationInfo(ImmutableRecord): + def __init__(self, + element=None, + element_index=0, + restriction=None, + inames=(), + ): + ImmutableRecord.__init__(self, + element=element, + element_index=element_index, + restriction=restriction, + inames=inames, + ) + + def __eq__(self, other): + return type(self) == type(other) and self.element_index == other.element_index and self.restriction == other.restriction + + def __hash__(self): + return (self.element_index, self.restriction) + + +def _list_infos(expr, number, visitor): + from dune.perftool.ufl.modified_terminals import extract_modified_arguments + ma = extract_modified_arguments(expr, argnumber=number) + if len(ma) == 0: + if number == 1: + yield None + return + element = ma[0].argexpr.ufl_element() + + from dune.perftool.ufl.modified_terminals import Restriction + if visitor.measure == "cell": + restrictions = (Restriction.NONE,) + elif visitor.measure == "exterior_facet": + restrictions = (Restriction.NEGATIVE,) + elif visitor.measure == "interior_facet": + restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE) + for res in restrictions: + for ei in range(element.num_sub_elements() + 1): + yield PDELabAccumulationInfo(element_index=ei, restriction=res) + + +def list_accumulation_infos(expr, visitor): + testgen = _list_infos(expr, 0, visitor) + trialgen = _list_infos(expr, 1, visitor) + + import itertools + return itertools.product(testgen, trialgen) + + +def get_accumulation_info(expr, visitor): + element = expr.ufl_element() + leaf_element = element + element_index = 0 + from ufl import MixedElement + if isinstance(expr.ufl_element(), MixedElement): + element_index = visitor.indices[0] + leaf_element = element.extract_component(element_index)[1] + + restriction = visitor.restriction + if visitor.measure == 'exterior_facet': + from dune.perftool.pdelab.restriction import Restriction + restriction = Restriction.NEGATIVE + + inames = visitor.interface.lfs_inames(leaf_element, + restriction, + expr.number() + ) + + return PDELabAccumulationInfo(element=expr.ufl_element(), + element_index=element_index, + restriction=restriction, + inames=inames, + ) + + @iname def grad_iname(index, dim): from dune.perftool.pdelab.index import name_index @@ -316,111 +381,60 @@ def grad_iname(index, dim): return name -@backend(interface="accum_insn") -def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): - # When we do not do sumfactorization we do not split the test function - assert(accterm.argument.expr is None) - - # TODO boundary_predicates may reset visitor's inames, is that wanted? - predicates = boundary_predicates(accterm.term, measure, subdomain_id, visitor) - - # Do the tree traversal to get a pymbolic expression representing this expression - pymbolic_expr = visitor(accterm.term) - - # It may happen that an entire accumulation term vanishes. We do nothing in that case - if pymbolic_expr == 0: - return - +def generate_accumulation_instruction(expr, visitor): # Collect the lfs and lfs indices for the accumulate call - test_lfs = determine_accumulation_space(accterm.term, 0, measure) + test_lfs = determine_accumulation_space(visitor.test_info, 0) # In the jacobian case, also determine the space for the ansatz space - ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure) + ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1) # Collect the lfs and lfs indices for the accumulate call from dune.perftool.pdelab.argument import name_accumulation_variable - accumvar = name_accumulation_variable((test_lfs.get_restriction() + ansatz_lfs.get_restriction())) + accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + + predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id, visitor.copy()) rank = 1 if ansatz_lfs.lfs is None else 2 from dune.perftool.pdelab.argument import PDELabAccumulationFunction from pymbolic.primitives import Call - expr = Call(PDELabAccumulationFunction(accumvar, rank), - (test_lfs.get_args() + ansatz_lfs.get_args() + (pymbolic_expr,)) - ) + accexpr = Call(PDELabAccumulationFunction(accumvar, rank), + (test_lfs.get_args() + ansatz_lfs.get_args() + (expr,)) + ) from dune.perftool.generation import instruction from dune.perftool.options import option_switch quad_inames = visitor.interface.quadrature_inames() + lfs_inames = frozenset(visitor.test_info.inames) + if visitor.trial_info: + lfs_inames = lfs_inames.union(visitor.trial_info.inames) instruction(assignees=(), - expression=expr, - forced_iname_deps=frozenset(quad_inames).union(frozenset(visitor.inames)), + expression=accexpr, + forced_iname_deps=lfs_inames.union(frozenset(quad_inames)), forced_iname_deps_is_final=True, predicates=predicates ) -def visit_integrals(integrals): - for integral in integrals: - integrand = integral.integrand() - measure = integral.integral_type() - subdomain_id = integral.subdomain_id() - subdomain_data = integral.subdomain_data() - - # Maybe make the jacobian inverse diagonal! - if get_option('diagonal_transformation_matrix'): - if not get_option('turn_off_diagonal_jacobian'): - from dune.perftool.ufl.transformations.axiparallel import diagonal_jacobian - integrand = diagonal_jacobian(integrand) - - # Generate code for the LFS trees present in the form - from dune.perftool.ufl.modified_terminals import extract_modified_arguments - test_ma = extract_modified_arguments(integrand, argnumber=0) - trial_ma = extract_modified_arguments(integrand, coeffcount=0) - apply_ma = extract_modified_arguments(integrand, coeffcount=1) - - import itertools - for ma in itertools.chain(test_ma, trial_ma, apply_ma): - if measure == 'exterior_facet': - ma.restriction = Restriction.NEGATIVE - - from dune.perftool.pdelab.spaces import traverse_lfs_tree - traverse_lfs_tree(ma) - - # Now split the given integrand into accumulation - # expressions. If we do sumfactorization we cut the test - # argument from the rest of the expression. This gives the - # right input for the sumfactorization kernel of stage 3. - from dune.perftool.ufl.extract_accumulation_terms import split_into_accumulation_terms - if get_option('sumfact'): - accterms = split_into_accumulation_terms(integrand, cut_test_arg=True, split_gradients=True) - else: - accterms = split_into_accumulation_terms(integrand) - - # Iterate over the terms and generate a kernel - for accterm in accterms: - # Get component indices - indexmap = {} - from dune.perftool.ufl.componentindex import component_index_mapping - if accterm.argument.expr is not None: - indexmap.update(component_index_mapping(accterm.indexed_test_arg())) - indexmap.update(component_index_mapping(accterm.term)) - - # Get a transformer instance for this kernel - if get_option('sumfact'): - from dune.perftool.sumfact import SumFactInterface - interface = SumFactInterface() - elif get_option('blockstructured'): - from dune.perftool.blockstructured import BlockStructuredInterface - interface = BlockStructuredInterface() - else: - from dune.perftool.pdelab import PDELabInterface - interface = PDELabInterface() - from dune.perftool.ufl.visitor import UFL2LoopyVisitor - visitor = UFL2LoopyVisitor(interface, measure, indexmap) - - get_backend(interface="accum_insn")(visitor, accterm, measure, subdomain_id) +def visit_integral(integral): + integrand = integral.integrand() + measure = integral.integral_type() + subdomain_id = integral.subdomain_id() + subdomain_data = integral.subdomain_data() + + # Get a transformer instance for this kernel + if get_option('sumfact'): + from dune.perftool.sumfact import SumFactInterface + interface = SumFactInterface() + else: + from dune.perftool.pdelab import PDELabInterface + interface = PDELabInterface() + from dune.perftool.ufl.visitor import UFL2LoopyVisitor + visitor = UFL2LoopyVisitor(interface, measure, subdomain_id) + + # Start the visiting process! + visitor.accumulate(integrand) def generate_kernel(integrals): @@ -429,7 +443,8 @@ def generate_kernel(integrals): # Visit all integrals once to collect information (dry-run)! logger.debug('generate_kernel: visit_integrals (dry run)') with global_context(dry_run=True): - visit_integrals(integrals) + for integral in integrals: + visit_integral(integral) # Now perform some checks on what should be done from dune.perftool.sumfact.vectorization import decide_vectorization_strategy @@ -440,7 +455,8 @@ def generate_kernel(integrals): logger.debug('generate_kernel: visit_integrals (no dry run)') from dune.perftool.generation import delete_cache_items delete_cache_items("kernel_default") - visit_integrals(integrals) + for integral in integrals: + visit_integral(integral) knl = extract_kernel_from_cache("kernel_default") delete_cache_items("kernel_default") diff --git a/python/dune/perftool/pdelab/spaces.py b/python/dune/perftool/pdelab/spaces.py index 791a4ccff5370141036cc60c809a2ba05d719c3c..068672bafbd2c5196d7b4bd01dfb62c2d1e789fc 100644 --- a/python/dune/perftool/pdelab/spaces.py +++ b/python/dune/perftool/pdelab/spaces.py @@ -1,6 +1,7 @@ """ Generator functions for PDELab local/grid function spaces etc. """ -from dune.perftool.generation import (domain, +from dune.perftool.generation import (class_member, + domain, function_mangler, generator_factory, include_file, @@ -8,34 +9,17 @@ from dune.perftool.generation import (domain, backend, ) from dune.perftool.pdelab.restriction import restricted_name +from dune.perftool.ufl.modified_terminals import Restriction from loopy import CallMangleInfo from loopy.symbolic import FunctionIdentifier from loopy.types import NumpyType from pymbolic.primitives import Variable - +from functools import partial import numpy -class LFSChild(FunctionIdentifier): - def __init__(self, lfs): - self.lfs = lfs - - def __getinitargs__(self): - return (self.lfs,) - - @property - def name(self): - return '{}.child'.format(self.lfs) - - -@function_mangler -def lfs_child_mangler(target, func, dtypes): - if isinstance(func, LFSChild): - return CallMangleInfo(func.name, (NumpyType(str),), (NumpyType(numpy.int32),)) - - @preamble def define_lfs_bound(lfs, bound): return 'auto {} = {}.size();'.format(bound, lfs) @@ -57,40 +41,6 @@ def using_indices(): return "using namespace Dune::Indices;" -@preamble -def define_lfs(name, father, child): - using_indices() - return "auto {} = child({}, _{});".format(name, father, child) - - -def lfs_child(lfs, children, shape=None, symmetry=False): - from pymbolic.primitives import Call, Product, Sum - # Old pre-TensorElement implementation kept for comaptibility - if shape is None: - indices = (Variable(children[0]),) - else: - if symmetry and len(children) == 2: - # I do not want to think about tensors of rank > 2 right now - i, j = children - if i > j: - j, i = i, j - i = Variable(i) - j = Variable(j) - n = len(children) - - indices = (Sum((Product((n - 1, i)), Product((.5, i, 1 - i)), j)),) - else: - # If the index is not an int we need to make a variable out of it. - # - # Note: ints occur in the sumfactorisation case with - # vector valued functions (eg. Stokes) - if not isinstance(children[0], int): - children = tuple(Variable(c) for c in children) - indices = (Sum(tuple(Product((Product(tuple(shape[j] for j in range(i))), child)) for i, child in enumerate(children))),) - - return Call(LFSChild(lfs), indices) - - @generator_factory(cache_key_generator=lambda e, r, **kw: (e, r)) def name_leaf_lfs(leaf_element, restriction, val=None): """ This function just caches leaf lfs names based on the @@ -98,90 +48,111 @@ def name_leaf_lfs(leaf_element, restriction, val=None): for size information. OTOH, they are available with just the leaf element available (as seen in basis evaluation). """ - # This generator function should be prepoluted by the lfs tree traversal, - # so val should always be None when we actually want the result assert val + return val + +@generator_factory(cache_key_generator=lambda e, **kw: e) +def type_leaf_gfs(leaf_element, val=None): + """ This function just caches leaf lfs names based on the + element. The resulting local function spaces are useful only + for size information. OTOH, they are available with just the + leaf element available (as seen in basis evaluation). + """ + assert val return val -@generator_factory(cache_key_generator=lambda e, r, t, **kw: (e, r, t), context_tags=("kernel",)) -def name_lfs(element, restriction, tree_path, prefix=None): - # Omitting the prefix is only valid upon a second call, which will - # result in a cache hit. - assert prefix +@generator_factory(cache_key_generator=lambda e, r, **kw: (e, r)) +def available_lfs_names(element, restriction, name=None): + assert name + return name - def _name_lfs(prefix, tree_path): - name = prefix - if len(tree_path) > 0: - name = name + '_' + '_'.join(str(i) for i in tree_path) - return name - name = _name_lfs(prefix, tree_path) - if len(tree_path) > 0: - father = _name_lfs(prefix, tree_path[:-1]) - # If this localfunction space is the child of another one, trigger - # the extraction preamble. Necessary before going into recursion - # for having the correct (top-down) order of preambles - define_lfs(name, father, tree_path[-1]) +@generator_factory(cache_key_generator=lambda e, r, **kw: e) +def available_gfs_names(element, restriction, name=None): + assert name + return name + - # Recurse into the given element to define all other local function spaces! - from ufl import MixedElement - from ufl.functionview import select_subelement - from ufl.classes import FixedIndex - subel = select_subelement(element, tree_path) - if isinstance(subel, MixedElement): - for i in range(subel.num_sub_elements()): - name_lfs(element, restriction, tree_path + (FixedIndex(i),), prefix=prefix) +@preamble +def define_lfs(name, father, child): + using_indices() + return "auto {} = child({}, _{});".format(name, father, child) - # Cache the name for the subelement - name_leaf_lfs(subel, restriction, val=name) - # Now return the prefix! - return name +@class_member(classtag="operator") +def define_gfs(name, father, child): + include_file("dune/typetree/childextraction.hh", filetag="operatorfile") + return 'using {} = Dune::TypeTree::Child<{},{}>;'.format(name, father, child) -@generator_factory(cache_key_generator=lambda e, **kw: e) -def type_gfs(element, basetype=None, index_stack=None): - # Omitting basetype and index_stack is only valid upon a second call, - # which will result in a cache hit. - assert basetype - assert index_stack is not None +def _name_lfs(element, restriction, tp, name): + if len(tp) == 0: + name_leaf_lfs(element, restriction, val=name) + return name - # Additionally, element is expected to be a ufl finite element - from ufl import FiniteElementBase - assert isinstance(element, FiniteElementBase) + childname = "{}_{}".format(name, tp[0]) + define_lfs(childname, name, tp[0]) + return _name_lfs(element.sub_elements()[tp[0]], restriction, tp[1:], childname) + + +def _type_gfs(element, restriction, tp, name): + if len(tp) == 0: + type_leaf_gfs(element, val=name) + return name + + childname = "{}_{}".format(name, tp[0]) + define_gfs(childname, name, tp[0]) + return _type_gfs(element.sub_elements()[tp[0]], restriction, tp[1:], childname) + + +def _function_space_traversal(element, restriction, index, defaultname=None, recfunc=None): + name = defaultname(element, restriction) - # Recurse into the given element to define all other local function spaces! + tp = () from ufl import MixedElement - from ufl.classes import FixedIndex if isinstance(element, MixedElement): - for i, subelem in enumerate(element.sub_elements()): - type_gfs(subelem, basetype=basetype, index_stack=index_stack + (FixedIndex(i),)) + assert index is not None + tp = element.extract_subelement_component(index) + tp = (tp[0],) + tp[1] + + return recfunc(element, restriction, tp, name) - if len(index_stack) == 0: - return basetype - else: - include_file("dune/typetree/childextraction.hh", filetag="operatorfile") - return 'Dune::TypeTree::Child<{},{}>'.format(basetype, ','.join(str(i) for i in index_stack)) +name_lfs = partial(_function_space_traversal, defaultname=available_lfs_names, recfunc=_name_lfs) +type_gfs = partial(_function_space_traversal, defaultname=available_gfs_names, recfunc=_type_gfs) -def traverse_lfs_tree(arg): - from dune.perftool.ufl.modified_terminals import ModifiedArgument - assert isinstance(arg, ModifiedArgument) - # First we need to determine the basename as given in the signature of - # this kernel method! - lfs_basename = name_argumentspace(arg) - from dune.perftool.pdelab.localoperator import lop_template_gfs - gfs_basename = lop_template_gfs(arg) +def initialize_function_spaces(expr, visitor): + restriction = visitor.restriction + if visitor.measure == 'exterior_facet': + restriction = Restriction.NEGATIVE + + index = None + from ufl import MixedElement + if isinstance(expr.ufl_element(), MixedElement): + index = visitor.indices[0] - # Now start recursively extracting local function spaces and fill the cache with - # all those values. That way we can later get a correct local function space with - # just the ufl finite element. - from ufl.classes import MultiIndex - name_lfs(arg.argexpr.ufl_element(), arg.restriction, MultiIndex(()), prefix=lfs_basename) - type_gfs(arg.argexpr.ufl_element(), basetype=gfs_basename, index_stack=()) + from ufl.classes import Argument, Coefficient + if isinstance(expr, Argument) and expr.number() == 0: + available_lfs_names(expr.ufl_element(), + restriction, + name=name_testfunctionspace(restriction)) + name_lfs(expr.ufl_element(), restriction, index) + from dune.perftool.pdelab.localoperator import lop_template_test_gfs + available_gfs_names(expr.ufl_element(), 0, + name=lop_template_test_gfs()) + type_gfs(expr.ufl_element(), restriction, index) + else: + available_lfs_names(expr.ufl_element(), + restriction, + name=name_trialfunctionspace(restriction)) + name_lfs(expr.ufl_element(), restriction, index) + from dune.perftool.pdelab.localoperator import lop_template_ansatz_gfs + available_gfs_names(expr.ufl_element(), 0, + name=lop_template_ansatz_gfs()) + type_gfs(expr.ufl_element(), restriction, index) @generator_factory(item_tags=("iname",), cache_key_generator=lambda e, r, c: (e, c), context_tags=("kernel",)) @@ -222,24 +193,6 @@ def lfs_inames(element, restriction, count=None, context=''): return (lfs_iname(element, restriction, count, context),) -class LFSLocalIndex(FunctionIdentifier): - def __init__(self, lfs): - self.lfs = lfs - - def __getinitargs__(self): - return (self.lfs,) - - @property - def name(self): - return '{}.localIndex'.format(self.lfs) - - -@function_mangler -def lfs_localindex_mangler(target, func, dtypes): - if isinstance(func, LFSLocalIndex): - return CallMangleInfo(func.name, (NumpyType(numpy.int32),), (NumpyType(numpy.int32),)) - - def name_testfunctionspace(restriction): return restricted_name("lfsv", restriction) @@ -248,21 +201,6 @@ def name_trialfunctionspace(restriction): return restricted_name("lfsu", restriction) -def name_argumentspace(ma): - from ufl.classes import Argument, Coefficient - if isinstance(ma.argexpr, Argument): - if ma.argexpr.number() == 0: - return name_testfunctionspace(ma.restriction) - if ma.argexpr.number() == 1: - return name_trialfunctionspace(ma.restriction) - if isinstance(ma.argexpr, Coefficient): - # Index 0 is reserved for trialfunction, index 1 is reserved for jacobian apply function - assert ma.argexpr.count() < 2 - return name_trialfunctionspace(ma.restriction) - # We should never encounter an argument other than 0 or 1 - assert False - - def type_testfunctionspace(): return "LFSV" diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py index a2b5623caa23a641dfaf7d050fd612a0ce6f573a..2da267716037ac75567ff959ce9a93f75aeed816 100644 --- a/python/dune/perftool/sumfact/__init__.py +++ b/python/dune/perftool/sumfact/__init__.py @@ -22,6 +22,18 @@ from dune.perftool.pdelab import PDELabInterface class SumFactInterface(PDELabInterface): + def get_accumulation_info(self, expr, visitor): + from dune.perftool.sumfact.accumulation import get_accumulation_info + return get_accumulation_info(expr, visitor) + + def list_accumulation_infos(self, expr, visitor): + from dune.perftool.sumfact.accumulation import list_accumulation_infos + return list_accumulation_infos(expr, visitor) + + def generate_accumulation_instruction(self, expr, visitor): + from dune.perftool.sumfact.accumulation import generate_accumulation_instruction + return generate_accumulation_instruction(expr, visitor) + def lfs_inames(self, element, restriction, number=None, context=''): return lfs_inames(element, restriction, number, context) @@ -33,23 +45,23 @@ class SumFactInterface(PDELabInterface): self.visitor.indices = indices return ret - def pymbolic_trialfunction_gradient(self, element, restriction, tree_path): - ret, indices = pymbolic_coefficient_gradient(element, restriction, tree_path, name_coefficientcontainer, self.visitor.indices) + def pymbolic_trialfunction_gradient(self, element, restriction, index): + ret, indices = pymbolic_coefficient_gradient(element, restriction, index, name_coefficientcontainer, self.visitor.indices) self.visitor.indices = indices return ret - def pymbolic_trialfunction(self, element, restriction, tree_path): - ret, indices = pymbolic_coefficient(element, restriction, tree_path, name_coefficientcontainer, self.visitor.indices) + def pymbolic_trialfunction(self, element, restriction, index): + ret, indices = pymbolic_coefficient(element, restriction, index, name_coefficientcontainer, self.visitor.indices) self.visitor.indices = indices return ret - def pymbolic_apply_function_gradient(self, element, restriction, tree_path): - ret, indices = pymbolic_coefficient_gradient(element, restriction, tree_path, name_applycontainer, self.visitor.indices) + def pymbolic_apply_function_gradient(self, element, restriction, index): + ret, indices = pymbolic_coefficient_gradient(element, restriction, index, name_applycontainer, self.visitor.indices) self.visitor.indices = indices return ret - def pymbolic_apply_function(self, element, restriction, tree_path): - ret, indices = pymbolic_coefficient(element, restriction, tree_path, name_applycontainer, self.visitor.indices) + def pymbolic_apply_function(self, element, restriction, index): + ret, indices = pymbolic_coefficient(element, restriction, index, name_applycontainer, self.visitor.indices) self.visitor.indices = indices return ret diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 86a19cd249816df2f09ac89a80ff589e0082b23d..f6ce77eb46de74d2ffd47f419448cea0b7798ba1 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -75,222 +75,277 @@ class AlreadyAssembledInput(SumfactKernelInputBase): return hash(self.index) -def _component_gradient_indices(argument): - """Return component and gradient index of test function argument +class SumfactAccumulationInfo(ImmutableRecord): + def __init__(self, + element=None, + element_index=0, + restriction=None, + inames=(), + grad_index=None, + ): + ImmutableRecord.__init__(self, + element=element, + element_index=element_index, + restriction=restriction, + inames=inames, + grad_index=grad_index, + ) + + def __eq__(self, other): + return type(self) == type(other) and self.element_index == other.element_index and self.restriction == other.restriction and self.grad_index == other.grad_index + + def __hash__(self): + return (self.element_index, self.restriction, self.grad_index) + + +def get_accumulation_info(expr, visitor): + element = expr.ufl_element() + leaf_element = element + element_index = 0 + from ufl import MixedElement + if isinstance(expr.ufl_element(), MixedElement): + element_index = visitor.indices[0] + leaf_element = element.extract_component(element_index)[1] + + restriction = visitor.restriction + if visitor.measure == 'exterior_facet': + from dune.perftool.pdelab.restriction import Restriction + restriction = Restriction.NEGATIVE + + inames = visitor.interface.lfs_inames(leaf_element, + restriction, + expr.number() + ) - The component_index is the component of vector valued functions - (eg v in Stokes). The gradient index is the direction of the - derivative (eg in \Delat u in poisson or \grad u in Stoks). - """ grad_index = None - component_index = None - - # If this is a gradient the last index is the grad_index - if argument.reference_grad: - grad_index = argument.expr.ufl_operands[1][-1]._value - - # If this argument has indices there could be a component_index - if isinstance(argument.expr, uc.Indexed): - # More than two indices not supported - if len(argument.expr.ufl_operands[1]) > 2: - assert False - # For two indices the first is the component_index - if len(argument.expr.ufl_operands[1]) == 2: - assert grad_index is not None - component_index = argument.expr.ufl_operands[1].indices()[0]._value - # For there is no gradient index we should have only one index, the component_index - if not argument.reference_grad: - assert len(argument.expr.ufl_operands[1]) == 1 - component_index = argument.expr.ufl_operands[1].indices()[0]._value - - return component_index, grad_index - - -@backend(interface="accum_insn", name="sumfact") -def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): - # When doing sum factorization we want to split the test function - assert(accterm.argument.expr is not None) - - # Get component and gradient index - component_index, grad_index = _component_gradient_indices(accterm.argument) - - # Do the tree traversal to get a pymbolic expression representing this expression - pymbolic_expr = visitor(accterm.term) - if pymbolic_expr == 0: + if visitor.reference_grad and expr.number() == 0: + if isinstance(expr.ufl_element(), MixedElement): + grad_index = visitor.indices[1] + else: + grad_index = visitor.indices[0] + + return SumfactAccumulationInfo(element=expr.ufl_element(), + element_index=element_index, + restriction=restriction, + inames=inames, + grad_index=grad_index, + ) + + +def _test_generator(expr, visitor): + from dune.perftool.ufl.modified_terminals import extract_modified_arguments + ma = extract_modified_arguments(expr, argnumber=0) + if len(ma) == 0: return - - # Number of basis functions + element = ma[0].argexpr.ufl_element() dim = world_dimension() - mod_arg_expr = accterm.argument.expr - while (not isinstance(mod_arg_expr, uc.FunctionView)) and (not isinstance(mod_arg_expr, uc.Argument)): - mod_arg_expr = mod_arg_expr.ufl_operands[0] - degree = mod_arg_expr.ufl_element()._degree - basis_size = degree + 1 + from dune.perftool.ufl.modified_terminals import Restriction + if visitor.measure == "cell": + restrictions = (Restriction.NONE,) + elif visitor.measure == "exterior_facet": + restrictions = (Restriction.NEGATIVE,) + elif visitor.measure == "interior_facet": + restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE) + for res in restrictions: + for ei in range(element.num_sub_elements() + 1): + for grad in (None,) + tuple(range(dim)): + yield SumfactAccumulationInfo(element_index=ei, + restriction=res, + grad_index=grad) + + +def _trial_generator(expr, visitor): + from dune.perftool.ufl.modified_terminals import extract_modified_arguments + ma = extract_modified_arguments(expr, argnumber=1) + if len(ma) == 0: + yield None + return + element = ma[0].argexpr.ufl_element() + + from dune.perftool.ufl.modified_terminals import Restriction + if visitor.measure == "cell": + restrictions = (Restriction.NONE,) + elif visitor.measure == "exterior_facet": + restrictions = (Restriction.NEGATIVE,) + elif visitor.measure == "interior_facet": + restrictions = (Restriction.NEGATIVE, Restriction.POSITIVE) + for res in restrictions: + for ei in range(element.num_sub_elements() + 1): + yield SumfactAccumulationInfo(element_index=ei, restriction=res) + + +def list_accumulation_infos(expr, visitor): + import itertools + return itertools.product(_test_generator(expr, visitor), _trial_generator(expr, visitor)) - jacobian_inames = tuple() - if accterm.is_jacobian: - jacobian_inames = visitor.inames - # Only accumulate boundary conditions on parts where it is defined +def generate_accumulation_instruction(expr, visitor): + dim = world_dimension() + test_info = visitor.test_info + trial_info = visitor.trial_info + + leaf_element = test_info.element + if leaf_element.num_sub_elements() > 0: + leaf_element = leaf_element.extract_component(test_info.element_index)[1] + basis_size = leaf_element.degree() + 1 + from dune.perftool.pdelab.localoperator import boundary_predicates - predicates = boundary_predicates(accterm.term, measure, subdomain_id, visitor) - - def emit_sumfact_kernel(restriction, insn_dep): - test_lfs = determine_accumulation_space(accterm.argument.expr, 0, measure, idims=(component_index,)) - ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure, idims=(component_index,)) - accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) - - # Construct the matrix sequence for this sum factorization - matrix_sequence = construct_basis_matrix_sequence( - transpose=True, - derivative=grad_index, - facedir=get_facedir(accterm.argument.restriction), - facemod=get_facemod(accterm.argument.restriction), - basis_size=basis_size) - - # TODO: Adapt preferred position for stokes sumfact symdiff - sf = SumfactKernel(matrix_sequence=matrix_sequence, - restriction=(accterm.argument.restriction, restriction), - stage=3, - preferred_position=grad_index, - accumvar=accum, - within_inames=jacobian_inames, - input=AlreadyAssembledInput(index=(component_index,)), - component_index=component_index, - ) + predicates = boundary_predicates(expr, + visitor.measure, + visitor.subdomain_id, + visitor) - from dune.perftool.sumfact.vectorization import attach_vectorization_info - vsf = attach_vectorization_info(sf) - - # Make sure we have a buffer that we can set up the input with - buffer = vsf.buffer - if buffer is None: - buffer = get_counted_variable("buffer") - - vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() - - temp = get_buffer_temporary(buffer, - shape=vsf.quadrature_shape, - dim_tags=vsf.quadrature_dimtags, - name="input_{}".format(buffer), - ) - - # Those input fields, that are padded need to be set to zero - # in order to do a horizontal_add later on - for pad in vsf.padded_indices: - assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), pad) - instruction(assignee=assignee, - expression=0, - forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames), - forced_iname_deps_is_final=True, - tags=frozenset(["quadvec", "gradvec"]), - ) - - # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) - timer_dep = frozenset() - if get_option("instrumentation_level") >= 4: - timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' - post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') - dump_accumulate_timer(timer_name) - if(jacobian_inames): - timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), - within_inames=frozenset(jacobian_inames))}) - - # Determine dependencies - from loopy.match import Or, Writes - from loopy.symbolic import DependencyMapper - from dune.perftool.tools import get_pymbolic_basename - deps = Or(tuple(Writes(get_pymbolic_basename(expr)) for expr in DependencyMapper()(pymbolic_expr))) - - # Issue an instruction in the quadrature loop that fills the buffer - # with the evaluation of the contribution at all quadrature points - assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), - vsf.quadrature_index(sf)) - contrib_dep = instruction(assignee=assignee, - expression=pymbolic_expr, - forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames), - forced_iname_deps_is_final=True, - tags=frozenset({"quadvec"}).union(vectag), - depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})), - ) - - if insn_dep is None: - insn_dep = frozenset({contrib_dep}) - - if get_option("instrumentation_level") >= 4: - insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=insn_dep, - within_inames=frozenset(jacobian_inames))}) - - inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) - for i, mat in enumerate(vsf.matrix_sequence)) - - # Collect the lfs and lfs indices for the accumulate call - test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames), - (basis_size,) * dim, - order="f" - ) - - # In the jacobian case, also determine the space for the ansatz space - if accterm.is_jacobian: - # TODO the next line should get its inames from - # elsewhere. This is *NOT* robust (but works right now) - ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i]) - for i in range(world_dimension())), - (basis_functions_per_direction(),) * dim, - order="f" - ) - - # Add a sum factorization kernel that implements the multiplication - # with the test function (stage 3) - from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) - - # Determine the expression to accumulate with. This depends on the vectorization strategy! - result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) - vecinames = () - - if vsf.vectorized: - iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec") - vecinames = (iname,) - transform(lp.tag_inames, [(iname, "vec")]) - from dune.perftool.tools import maybe_wrap_subscript - result = prim.Call(prim.Variable("horizontal_add"), - (maybe_wrap_subscript(result, prim.Variable(iname)),), - ) - - if not get_option("fastdg"): - rank = 2 if accterm.is_jacobian else 1 - expr = prim.Call(PDELabAccumulationFunction(accum, rank), - (test_lfs.get_args() + - ansatz_lfs.get_args() + - (result,) + insn_dep = None + + from dune.perftool.pdelab.localoperator import determine_accumulation_space + test_lfs = determine_accumulation_space(test_info, 0) + ansatz_lfs = determine_accumulation_space(trial_info, 1) + + if trial_info is None: + trial_info = SumfactAccumulationInfo() + from dune.perftool.pdelab.argument import name_accumulation_variable + accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + + matrix_sequence = construct_basis_matrix_sequence( + transpose=True, + derivative=test_info.grad_index, + facedir=get_facedir(test_info.restriction), + facemod=get_facemod(test_info.restriction), + basis_size=basis_size) + + jacobian_inames = trial_info.inames + priority = test_info.grad_index + if priority is None: + priority = 3 + + sf = SumfactKernel(matrix_sequence=matrix_sequence, + restriction=(test_info.restriction, trial_info.restriction), + stage=3, + position_priority=priority, + accumvar=accumvar, + within_inames=jacobian_inames, + input=AlreadyAssembledInput(index=(test_info.element_index,)), + component_index=test_info.element_index, + ) + + from dune.perftool.sumfact.vectorization import attach_vectorization_info + vsf = attach_vectorization_info(sf) + + # Make sure we have a buffer that we can set up the input with + buffer = vsf.buffer + if buffer is None: + buffer = get_counted_variable("buffer") + + vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() + + temp = get_buffer_temporary(buffer, + shape=vsf.quadrature_shape, + dim_tags=vsf.quadrature_dimtags, + name="input_{}".format(buffer), + ) + + # Those input fields, that are padded need to be set to zero + # in order to do a horizontal_add later on + for pad in vsf.padded_indices: + assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), pad) + instruction(assignee=assignee, + expression=0, + forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames), + forced_iname_deps_is_final=True, + tags=frozenset(["quadvec", "gradvec"]), + ) + + # Write timing stuff for jacobian (for alpha methods it is done at the end of stage 1) + timer_dep = frozenset() + if get_option("instrumentation_level") >= 4: + timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' + post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile') + dump_accumulate_timer(timer_name) + if(jacobian_inames): + timer_dep = frozenset({instruction(code="HP_TIMER_START({});".format(timer_name), + within_inames=frozenset(jacobian_inames))}) + + # Determine dependencies + from loopy.match import Or, Writes + from loopy.symbolic import DependencyMapper + from dune.perftool.tools import get_pymbolic_basename + deps = Or(tuple(Writes(get_pymbolic_basename(e)) for e in DependencyMapper()(expr))) + + # Issue an instruction in the quadrature loop that fills the buffer + # with the evaluation of the contribution at all quadrature points + assignee = prim.Subscript(lp.TaggedVariable(temp, vsf.tag), + vsf.quadrature_index(sf)) + contrib_dep = instruction(assignee=assignee, + expression=expr, + forced_iname_deps=frozenset(quadrature_inames() + jacobian_inames), + forced_iname_deps_is_final=True, + tags=frozenset({"quadvec"}).union(vectag), + depends_on=frozenset({deps}).union(timer_dep).union(frozenset({lp.match.Tagged("sumfact_stage1")})), ) - ) - instruction(assignees=(), - expression=expr, - forced_iname_deps=frozenset(inames + vecinames + jacobian_inames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - predicates=predicates - ) - - # Mark the transformation that moves the quadrature loop - # inside the trialfunction loops for application - if accterm.is_jacobian: - transform(nest_quadrature_loops, jacobian_inames) - - return insn_dep - - # Extract the restrictions on argument-1: - jac_restrictions = frozenset(tuple(ma.restriction for ma in - extract_modified_arguments(accterm.term, - argnumber=1, - do_index=True))) - if not jac_restrictions: - jac_restrictions = frozenset({0}) - insn_dep = None - for restriction in jac_restrictions: - insn_dep = emit_sumfact_kernel(restriction, insn_dep) + if insn_dep is None: + insn_dep = frozenset({contrib_dep}) + + if get_option("instrumentation_level") >= 4: + insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), + depends_on=insn_dep, + within_inames=frozenset(jacobian_inames))}) + + inames = tuple(accum_iname((test_info.restriction, trial_info.restriction), mat.rows, i) + for i, mat in enumerate(vsf.matrix_sequence)) + + # Collect the lfs and lfs indices for the accumulate call + test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames), + (basis_size,) * dim, + order="f" + ) + + # In the jacobian case, also determine the space for the ansatz space + if jacobian_inames: + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right now) + ansatz_lfs.index = flatten_index(tuple(prim.Variable(jacobian_inames[i]) + for i in range(world_dimension())), + (basis_functions_per_direction(),) * dim, + order="f" + ) + + # Add a sum factorization kernel that implements the multiplication + # with the test function (stage 3) + from dune.perftool.sumfact.realization import realize_sum_factorization_kernel + result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) + + # Determine the expression to accumulate with. This depends on the vectorization strategy! + result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) + vecinames = () + + if vsf.vectorized: + iname = accum_iname((test_info.restriction, trial_info.restriction), vsf.vector_width, "vec") + vecinames = (iname,) + transform(lp.tag_inames, [(iname, "vec")]) + from dune.perftool.tools import maybe_wrap_subscript + result = prim.Call(prim.Variable("horizontal_add"), + (maybe_wrap_subscript(result, prim.Variable(iname)),), + ) + + if not get_option("fastdg"): + rank = 2 if jacobian_inames else 1 + expr = prim.Call(PDELabAccumulationFunction(accumvar, rank), + (test_lfs.get_args() + + ansatz_lfs.get_args() + + (result,) + ) + ) + instruction(assignees=(), + expression=expr, + forced_iname_deps=frozenset(inames + vecinames + jacobian_inames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + predicates=predicates + ) + + # Mark the transformation that moves the quadrature loop + # inside the trialfunction loops for application + if jacobian_inames: + transform(nest_quadrature_loops, jacobian_inames) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 7a2a0e2f31902282ddfd6b701a7b456c846aae35..9ea50e08c57b52a15843fc8bf3979547ed4d0389 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -37,12 +37,11 @@ from dune.perftool.sumfact.symbolic import SumfactKernel, SumfactKernelInputBase from dune.perftool.options import get_option from dune.perftool.pdelab.driver import FEM_name_mangling from dune.perftool.pdelab.restriction import restricted_name -from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, lfs_child, name_leaf_lfs +from dune.perftool.pdelab.spaces import name_lfs, name_lfs_bound, name_leaf_lfs from dune.perftool.tools import maybe_wrap_subscript from dune.perftool.pdelab.basis import shape_as_pymbolic from dune.perftool.sumfact.accumulation import sumfact_iname -from ufl.functionview import select_subelement from ufl import VectorElement, TensorElement from pytools import product, ImmutableRecord @@ -55,46 +54,23 @@ import pymbolic.primitives as prim class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): def __init__(self, coeff_func=None, - component_index=None, element=None, - tree_path=None, + element_index=0, restriction=0, ): ImmutableRecord.__init__(self, coeff_func=coeff_func, - component_index=component_index, element=element, - tree_path=tree_path, + element_index=element_index, restriction=restriction, ) def realize(self, sf, index, insn_dep): - lfs = name_lfs(self.element, self.restriction, self.tree_path) - sub_element = select_subelement(self.element, self.tree_path) - shape = sub_element.value_shape() + (self.element.cell().geometric_dimension(),) - - if isinstance(sub_element, (VectorElement, TensorElement)): - # Could be 0 but shouldn't be None - assert self.component_index is not None - - lfs_pym = lfs_child(lfs, - (self.component_index,), - shape=shape_as_pymbolic(shape[:-1]), - symmetry=self.element.symmetry()) - - leaf_element = sub_element - if isinstance(sub_element, (VectorElement, TensorElement)): - leaf_element = sub_element.sub_elements()[0] - - lfs = name_leaf_lfs(leaf_element, self.restriction) + lfs = name_lfs(self.element, self.restriction, self.element_index) basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") container = self.coeff_func(self.restriction) - if isinstance(sub_element, (VectorElement, TensorElement)): - from dune.perftool.pdelab.argument import pymbolic_coefficient as pc - coeff = pc(container, lfs_pym, basisiname) - else: - from dune.perftool.pdelab.argument import pymbolic_coefficient as pc - coeff = pc(container, lfs, basisiname) + from dune.perftool.pdelab.argument import pymbolic_coefficient as pc + coeff = pc(container, lfs, basisiname) # Get the input temporary! name = get_buffer_temporary(sf.buffer, @@ -118,39 +94,25 @@ class LFSSumfactKernelInput(SumfactKernelInputBase, ImmutableRecord): return None -def _basis_functions_per_direction(element, tree_path): - """Number of basis functions per direction of a given tree_path of an element""" - assert len(tree_path.indices()) <= 1 - if len(tree_path.indices()) == 0: - degree = element.degree() - else: - index = tree_path.indices()[0]._value - degree = element.sub_elements()[index].degree() - basis_size = degree + 1 - return basis_size +def _basis_functions_per_direction(element): + """Number of basis functions per direction """ + from ufl import FiniteElement + assert isinstance(element, FiniteElement) + return element.degree() + 1 @kernel_cached -def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, visitor_indices): - rawname = "gradu" + "_".join(str(c) for c in tree_path) - name = restricted_name(rawname, restriction) +def pymbolic_coefficient_gradient(element, restriction, index, coeff_func, visitor_indices): + sub_element = element + grad_index = visitor_indices[0] + if element.num_sub_elements() > 0: + sub_element = element.extract_component(index)[1] + + from ufl import FiniteElement + assert isinstance(sub_element, FiniteElement) # Number of basis functions per direction - basis_size = _basis_functions_per_direction(element, tree_path) - - # Get a temporary for the gradient - from ufl.functionview import select_subelement - sub_element = select_subelement(element, tree_path) - rank = len(sub_element.value_shape()) + 1 - shape = sub_element.value_shape() + (world_dimension(),) - shape_impl = ('arr',) * rank - temporary_variable(name, shape=shape, shape_impl=shape_impl) - - if len(visitor_indices) == 1: - component_index = None - grad_index, = visitor_indices - else: - component_index, grad_index = visitor_indices + basis_size = _basis_functions_per_direction(sub_element) # Construct the matrix sequence for this sum factorization matrix_sequence = construct_basis_matrix_sequence(derivative=grad_index, @@ -160,15 +122,14 @@ def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, v ) inp = LFSSumfactKernelInput(coeff_func=coeff_func, - component_index=component_index, element=element, - tree_path=tree_path, + element_index=index, restriction=restriction, ) # The sum factorization kernel object gathering all relevant information sf = SumfactKernel(matrix_sequence=matrix_sequence, - preferred_position=grad_index, + position_priority=grad_index, input=inp, ) @@ -182,29 +143,30 @@ def pymbolic_coefficient_gradient(element, restriction, tree_path, coeff_func, v @kernel_cached -def pymbolic_coefficient(element, restriction, tree_path, coeff_func, visitor_indices): +def pymbolic_coefficient(element, restriction, index, coeff_func, visitor_indices): + sub_element = element + if element.num_sub_elements() > 0: + sub_element = element.extract_component(index)[1] + from ufl import FiniteElement + assert isinstance(sub_element, FiniteElement) + # Basis functions per direction - basis_size = _basis_functions_per_direction(element, tree_path) + basis_size = _basis_functions_per_direction(sub_element) # Construct the matrix sequence for this sum factorization matrix_sequence = construct_basis_matrix_sequence(facedir=get_facedir(restriction), facemod=get_facemod(restriction), basis_size=basis_size) - component_index = None - if visitor_indices: - assert len(visitor_indices) == 1 - component_index = visitor_indices[0] - inp = LFSSumfactKernelInput(coeff_func=coeff_func, - component_index=component_index, element=element, - tree_path=tree_path, + element_index=index, restriction=restriction, ) sf = SumfactKernel(matrix_sequence=matrix_sequence, input=inp, + position_priority=3, ) from dune.perftool.sumfact.vectorization import attach_vectorization_info @@ -227,9 +189,11 @@ def sumfact_lfs_iname(bound, dim): @backend(interface="lfs_inames", name="sumfact") def lfs_inames(element, restriction, number=1, context=''): - assert number == 1 - dim = world_dimension() - return tuple(sumfact_lfs_iname(basis_functions_per_direction(), d) for d in range(dim)) + if number == 0: + return () + else: + dim = world_dimension() + return tuple(sumfact_lfs_iname(basis_functions_per_direction(), d) for d in range(dim)) @backend(interface="evaluate_basis", name="sumfact") @@ -264,7 +228,10 @@ def evaluate_basis(element, name, restriction): def pymbolic_basis(element, restriction, number): - assert number == 1 + # If this is a test function we omit it! + if number == 0: + return 1 + assert element.num_sub_elements() == 0 name = "phi_{}".format(FEM_name_mangling(element)) @@ -313,7 +280,10 @@ def evaluate_reference_gradient(element, name, restriction, index): def pymbolic_reference_gradient(element, restriction, number, indices): - assert number == 1 + # If this is a test function, we omit it. + if number == 0: + return 1, None + assert len(indices) == 1 index, = indices diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 878e047c613da83ce01c94376a49f5b8b1553191..aeb2a2260c8fd914f19414ab87686e147b029924 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -32,7 +32,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): matrix_sequence=None, buffer=None, stage=1, - preferred_position=None, + position_priority=None, restriction=None, within_inames=(), insn_dep=frozenset(), @@ -92,7 +92,7 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): pre-initialized with the input or you have to provide direct_input (FastDGGridOperator). stage: 1 or 3 - preferred_position: Will be used in the dry run to order kernels + position_priority: Will be used in the dry run to order kernels when doing vectorization e.g. (dx u,dy u,dz u, u). restriction: Restriction for faces values. within_inames: Instructions will be executed within those @@ -112,9 +112,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): assert stage in (1, 3) - if preferred_position is not None: - assert isinstance(preferred_position, int) - if stage == 1: assert isinstance(input, SumfactKernelInputBase) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 7df5f236a26855ecd0cbd9207ba210f7a563dc65..790fa59a1f052078800785cf10ed35fbf988da03 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -90,38 +90,22 @@ def vertical_vectorization_strategy(sumfact, depth): def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1): result = {} - todo = set(sumfacts) - while todo: - position_mapping = {} - available = set(range(width)) - - for sf in todo: - if sf.preferred_position is not None and sf.preferred_position in available: - available.discard(sf.preferred_position) - position_mapping[sf.preferred_position] = sf - - for sf in position_mapping.values(): - todo.discard(sf) - - for pos in available: - if todo: - position_mapping[pos] = todo.pop() - - kernels = [None] * len(position_mapping) - for pos in position_mapping: - kernels[pos] = position_mapping[pos] - kernels = tuple(kernels) + kernels = [] + for sf in sorted(todo, key=lambda s: s.position_priority): + if len(kernels) < width: + kernels.append(sf) + todo.discard(sf) buffer = get_counted_variable("joined_buffer") + if len(kernels) in range(width - allow_padding, width + 1): + for sf in kernels: + result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels), + horizontal_width=width, + buffer=buffer, + ) - for sumf in kernels: - if len(kernels) in range(width - allow_padding, width + 1): - result[sumf] = VectorizedSumfactKernel(kernels=kernels, - horizontal_width=width, - buffer=buffer, - ) return result diff --git a/python/dune/perftool/ufl/execution.py b/python/dune/perftool/ufl/execution.py index 3c3583391f46c1f03a98d12dc839cd377675e5bb..1a0ac249cf4bcb1ebd13312c01a2d5249c515618 100644 --- a/python/dune/perftool/ufl/execution.py +++ b/python/dune/perftool/ufl/execution.py @@ -30,7 +30,7 @@ class Coefficient(ufl.Coefficient): def split(obj): - return ufl.split_functions.split2(obj) + return ufl.split_functions.split(obj) def Coefficients(element): diff --git a/python/dune/perftool/ufl/modified_terminals.py b/python/dune/perftool/ufl/modified_terminals.py index da555529faddd7d3157585afeb5cd73706246c4c..cc818a7b563224c21909e7f86eba7f9420ef50d3 100644 --- a/python/dune/perftool/ufl/modified_terminals.py +++ b/python/dune/perftool/ufl/modified_terminals.py @@ -80,11 +80,11 @@ class ModifiedTerminalTracker(MultiFunction): self.reference_grad = False return ret - def function_view(self, o): - self.tree_path = o.ufl_operands[1] - ret = self.call(o.ufl_operands[0]) - self.tree_path = MultiIndex(()) - return ret +# def function_view(self, o): +# self.tree_path = o.ufl_operands[1] +# ret = self.call(o.ufl_operands[0]) +# self.tree_path = MultiIndex(()) +# return ret def reference_value(self, o): self.reference = True @@ -176,7 +176,7 @@ class _ModifiedArgumentExtractor(MultiFunction): positive_restricted = pass_on negative_restricted = pass_on - function_view = pass_on +# function_view = pass_on reference_value = pass_on def argument(self, o): diff --git a/python/dune/perftool/ufl/preprocess.py b/python/dune/perftool/ufl/preprocess.py index fba501dea4918827af809d89d6fe7782f846cc7b..df756bf742628e46d76cb69de431a383463219c0 100644 --- a/python/dune/perftool/ufl/preprocess.py +++ b/python/dune/perftool/ufl/preprocess.py @@ -30,8 +30,8 @@ def apply_default_transformations(form): from dune.perftool.ufl.transformations.reindexing import reindexing from dune.perftool.ufl.transformations.unroll import unroll_dimension_loops - form = transform_form(form, unroll_dimension_loops) +# form = transform_form(form, unroll_dimension_loops) form = transform_form(form, pushdown_indexed) - form = transform_form(form, reindexing) +# form = transform_form(form, reindexing) return form diff --git a/python/dune/perftool/ufl/transformations/unroll.py b/python/dune/perftool/ufl/transformations/unroll.py index 01088008cd4634f62184b79a42175cc66b2085c3..1a43849a63f194138551d01f471351e9e03cd98e 100644 --- a/python/dune/perftool/ufl/transformations/unroll.py +++ b/python/dune/perftool/ufl/transformations/unroll.py @@ -16,6 +16,8 @@ import ufl.classes as uc class UnrollDimensionLoops(MultiFunction): def __init__(self): self.replace = {} + self._indices_backup = [] + self.indices = None MultiFunction.__init__(self) call = MultiFunction.__call__ @@ -23,24 +25,60 @@ class UnrollDimensionLoops(MultiFunction): def expr(self, o): return self.reuse_if_untouched(o, *tuple(self(op) for op in o.ufl_operands)) + def indexed(self, o): + self._indices_backup.append(self.indices) + self.indices = self.call(o.ufl_operands[1]) + + ret = self.call(o.ufl_operands[0]) + + if self.indices is not None: + ret = uc.Indexed(ret, self.indices) + + self.indices = self._indices_backup.pop() + + return ret + def multi_index(self, o): return uc.MultiIndex(tuple(self.replace.get(i, i) for i in o)) def index_sum(self, o): operands = [] - # TODO: What is the correct way to get the shape here? - for i in range(o.geometric_dimension()): + for i in range(o.dimension()): self.replace[o.ufl_operands[1][0]] = uc.FixedIndex(i) operands.append(self.call(o.ufl_operands[0])) del self.replace[o.ufl_operands[1][0]] return construct_binary_operator(tuple(operands), uc.Sum) + def list_tensor(self, o): + index = self.indices[0] + index = index._value + self.indices = self.indices[1:] + if len(self.indices) == 0: + self.indices = None + return self.call(o.ufl_operands[index]) + + def component_tensor(self, o): + assert len(self.indices) == len(o.ufl_operands[1]) + + # Update the index mapping + for i, ind in enumerate(o.ufl_operands[1]): + self.replace[ind] = self.indices[i] + + self.indices = None + ret = self.call(o.ufl_operands[0]) + + for i, ind in enumerate(o.ufl_operands[1]): + del self.replace[ind] + + return ret + @ufl_transformation(name="unroll") def unroll_dimension_loops(expr): - if get_option("unroll_dimension_loops"): - return UnrollDimensionLoops()(expr) - else: - return expr + return UnrollDimensionLoops()(expr) +# if get_option("unroll_dimension_loops"): +# return UnrollDimensionLoops()(expr) +# else: +# return expr diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py index 3692d7548bac09df0f998bbd01f0d4684ee0c0a9..417b8678d471564a3efb4bf1277e6fb30dc8b6a0 100644 --- a/python/dune/perftool/ufl/visitor.py +++ b/python/dune/perftool/ufl/visitor.py @@ -9,6 +9,7 @@ from dune.perftool.ufl.modified_terminals import (ModifiedTerminalTracker, Restriction, ) from dune.perftool.tools import maybe_wrap_subscript +from dune.perftool.options import get_option from loopy import Reduction from pymbolic.primitives import (Call, @@ -21,7 +22,6 @@ from pymbolic.primitives import (Call, from ufl.algorithms import MultiFunction from ufl.checks import is_cellwise_constant -from ufl.functionview import select_subelement from ufl import (VectorElement, TensorElement, ) @@ -34,21 +34,36 @@ import pymbolic.primitives as prim class UFL2LoopyVisitor(ModifiedTerminalTracker): - def __init__(self, interface, measure, component_indices): + def __init__(self, interface, measure, subdomain_id): self.interface = interface self.interface.visitor = self self.measure = measure - self.component_indices = component_indices + self.subdomain_id = subdomain_id # Call base class constructors super(UFL2LoopyVisitor, self).__init__() def __call__(self, o, do_predicates=False): - # Reset some state variables that are reinitialized for each accumulation term + self.current_info = None + return self._call(o, do_predicates) + + def accumulate(self, o): + for info in self.interface.list_accumulation_infos(o, self): + self.current_info = info + expr = self._call(o, False) + if expr != 0: + self.interface.generate_accumulation_instruction(expr, self) + + def _call(self, o, do_predicates): + # Reset state variables + self.indexmap = {} self.indices = None self._indices_backup = [] + self.test_info = None + self.trial_info = None self.inames = () self.do_predicates = do_predicates + return self.call(o) call = MultiFunction.__call__ @@ -59,39 +74,42 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): # def argument(self, o): + self.interface.initialize_function_spaces(o, self) + # Update the information on where to accumulate this + info = self.interface.get_accumulation_info(o, self) + if o.number() == 0: + if info != self.current_info[0]: + self.indices = None + return 0 + else: + self.test_info = info + elif o.number() == 1: + if info != self.current_info[1]: + self.indices = None + return 0 + else: + self.trial_info = info + # Correct the restriction on boundary integrals restriction = self.restriction if self.measure == 'exterior_facet': restriction = Restriction.NEGATIVE + leaf_element = o.ufl_element() - # Select the correct subtree of the finite element - element = select_subelement(o.ufl_element(), self.tree_path) - leaf_element = element - - # Now treat the case of this being a vector finite element - if element.num_sub_elements() > 0: - # I cannot handle general mixed elements here... - assert isinstance(element, (VectorElement, TensorElement)) - - # If this is a vector element, we need add an additional accumulation loop iname - shape = len(element.value_shape()) - self.indices = self.indices[shape:] - for i in range(len(element.value_shape())): - if self.interface.component_iname(context='arg', count=i) not in self.inames: - self.inames = self.inames + (self.interface.component_iname(context='arg', count=i),) + # Select the correct leaf element in the case of this being a mixed finite element + if o.ufl_element().num_sub_elements() > 0: + index = self.indices[0] + assert isinstance(index, int) + self.indices = self.indices[1:] + if len(self.indices) == 0: + self.indices = None # For the purpose of basis evaluation, we need to take the leaf element - leaf_element = element.sub_elements()[0] + leaf_element = leaf_element.extract_component(index)[1] if self.grad: raise PerftoolUFLError("Gradients should have been transformed to reference gradients!!!") - # Have the issued instruction depend on the iname for this localfunction space - inames = self.interface.lfs_inames(leaf_element, restriction, o.number()) - for iname in inames: - if iname not in self.inames: - self.inames = self.inames + (iname,) - if self.reference_grad: return self.interface.pymbolic_reference_gradient(leaf_element, restriction, o.number()) else: @@ -105,19 +123,29 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): if self.measure == 'exterior_facet': restriction = Restriction.NEGATIVE + self.interface.initialize_function_spaces(o, self) + + index = None + if o.ufl_element().num_sub_elements() > 0: + index = self.indices[0] + assert isinstance(index, int) + self.indices = self.indices[1:] + if len(self.indices) == 0: + self.indices = None + if self.grad: raise PerftoolUFLError("Gradients should have been transformed to reference gradients!!!") if self.reference_grad: if o.count() == 0: - return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, self.tree_path) + return self.interface.pymbolic_trialfunction_gradient(o.ufl_element(), restriction, index) else: - return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, self.tree_path) + return self.interface.pymbolic_apply_function_gradient(o.ufl_element(), restriction, index) else: if o.count() == 0: - return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, self.tree_path) + return self.interface.pymbolic_trialfunction(o.ufl_element(), restriction, index) else: - return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, self.tree_path) + return self.interface.pymbolic_apply_function(o.ufl_element(), restriction, index) # Check if this is a parameter function else: @@ -163,44 +191,30 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): self.indices = self._indices_backup.pop() return maybe_wrap_subscript(aggr, indices) - def index_sum(self, o, additional_inames=()): - # There is three scenarios here: - # * This is a normal IndexSum: - # We should collect the reduction inames, go into recursion and issue - # a reduction operation to a temporary - # * This is part of a nested indexsum: - # We only collect the reduction iname and go into recursion - # * This IndexSum is implicitly handled by the accumulation process - # We only go into recursion - - # Get the iname for the reduction index - ind = o.ufl_operands[1][0] - redinames = additional_inames + (ind,) - shape = o.ufl_operands[0].ufl_index_dimensions[0] - domain(self.interface.name_index(ind), shape) - - # If the left operand is an index sum to, we do it in one reduction - if isinstance(o.ufl_operands[0], IndexSum): - return self.index_sum(o.ufl_operands[0], additional_inames=redinames) - else: - # Recurse to get the summation expression - term = self.call(o.ufl_operands[0]) - redinames = tuple(i for i in redinames if i not in self.component_indices) - if len(redinames) > 0: - ret = Reduction("sum", tuple(self.interface.name_index(ind) for ind in redinames), term) - else: - ret = term + def index_sum(self, o): + # This implementation fully unrolls the given indexed sum. + # This is done for a variety of reasons: + # * It eases handling of the given loopy kernel in terms of schedulability + # * The compiler would unroll these anyway + # * It allows handling of arbitrarily bad nesting of ComponentTensor and + # ListTensor, which otherwise becomes a *nightmare*. + index = o.ufl_operands[1][0] + operands = [] + + for i in range(o.dimension()): + self.indexmap[index] = i + operands.append(self.call(o.ufl_operands[0])) + del self.indexmap[index] - return ret + from pymbolic import flattened_sum + return flattened_sum(tuple(operands)) def _index_or_fixed_index(self, index): if isinstance(index, FixedIndex): return index._value else: - if index in self.component_indices: - if self.component_indices[index] not in self.inames: - self.inames = self.inames + (self.component_indices[index],) - return Variable(self.component_indices[index]) + if index in self.indexmap: + return self.indexmap[index] else: return Variable(self.interface.name_index(index)) @@ -218,6 +232,20 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): else: return self.interface.pymbolic_list_tensor(o) + def component_tensor(self, o): + assert len(self.indices) == len(o.ufl_operands[1]) + # Update the index mapping + for i, ind in enumerate(o.ufl_operands[1]): + self.indexmap[ind] = self.indexmap.get(self.indices[i], self.indices[i]) + + self.indices = None + ret = self.call(o.ufl_operands[0]) + + for i, ind in enumerate(o.ufl_operands[1]): + del self.indexmap[ind] + + return ret + def identity(self, o): return self.interface.pymbolic_identity(o) @@ -227,7 +255,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): # def product(self, o): - return Product(tuple(self.call(op) for op in o.ufl_operands)) + from pymbolic import flattened_product + return flattened_product(tuple(self.call(op) for op in o.ufl_operands)) def float_value(self, o): return o.value() @@ -239,7 +268,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): return Quotient(self.call(o.ufl_operands[0]), self.call(o.ufl_operands[1])) def sum(self, o): - return Sum(tuple(self.call(op) for op in o.ufl_operands)) + from pymbolic import flattened_sum + return flattened_sum(tuple(self.call(op) for op in o.ufl_operands)) def zero(self, o): return 0 @@ -347,8 +377,9 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): assert self.restriction is not Restriction.NONE # Optimize facet normal on axiparallel grids + # TODO move this into the sumfact backend, it is only valid there from dune.perftool.options import get_option - if get_option("diagonal_transformation_matrix"): + if get_option("diagonal_transformation_matrix") and get_option("sumfact"): index, = self.indices from dune.perftool.sumfact.switch import get_facedir if isinstance(index, int) and index != get_facedir(self.restriction): @@ -380,8 +411,9 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): self.indices = None # Implement diagonal jacobians for unrolled matrices! - if isinstance(i, int) and isinstance(j, int) and i != j: - return 0 + if get_option("diagonal_transformation_matrix"): + if isinstance(i, int) and isinstance(j, int) and i != j: + return 0 return self.interface.pymbolic_jacobian_inverse_transposed(i, j, restriction) @@ -413,3 +445,7 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): def __hash__(self): return 0 + + def copy(self): + """ Get a copy of this visitor """ + return type(self)(self.interface, self.measure, self.subdomain_id) diff --git a/python/ufl b/python/ufl index 8b7062528ff99e99c7e928e7d08f0c09c8776978..962d56f65821fb9c50ca4a5a858882c472243431 160000 --- a/python/ufl +++ b/python/ufl @@ -1 +1 @@ -Subproject commit 8b7062528ff99e99c7e928e7d08f0c09c8776978 +Subproject commit 962d56f65821fb9c50ca4a5a858882c472243431 diff --git a/test/stokes/CMakeLists.txt b/test/stokes/CMakeLists.txt index dc0e4d36d6d3fb94f6fbaf0643c428850c36e4ec..2a274367c3d7dc652ccfadb580b68571b4d97e96 100644 --- a/test/stokes/CMakeLists.txt +++ b/test/stokes/CMakeLists.txt @@ -27,11 +27,11 @@ dune_add_formcompiler_system_test(UFLFILE stokes_dg_quadrilateral.ufl INIFILE stokes_dg_quadrilateral.mini ) -dune_add_formcompiler_system_test(UFLFILE stokes_stress.ufl - BASENAME stokes_stress - INIFILE stokes_stress.mini - ) - +#dune_add_formcompiler_system_test(UFLFILE stokes_stress.ufl +# BASENAME stokes_stress +# INIFILE stokes_stress.mini +# ) +# # Do not test stokes_stress_sym until the function_view project # has been fully implemented. #