diff --git a/python/dune/codegen/pdelab/argument.py b/python/dune/codegen/pdelab/argument.py index ce16cd1121458ab27325312a727fd74c98d708cc..6bf800d59cc6ddeb683873793a4e72f9d0b11f0b 100644 --- a/python/dune/codegen/pdelab/argument.py +++ b/python/dune/codegen/pdelab/argument.py @@ -143,7 +143,10 @@ def pymbolic_coefficient(container, lfs, index): if not isinstance(lfs, Expression): lfs = Variable(lfs) - return Call(CoefficientAccess(container), (lfs, Variable(index),)) + if isinstance(index, str): + index = Variable(index) + + return Call(CoefficientAccess(container), (lfs, index,)) def type_coefficientcontainer(): diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py index e4512dc199f5eacb533e57cafbe9b8eac2bf91d1..34599f55ad26f5859bfc5de4ce4996d6fd5e4098 100644 --- a/python/dune/codegen/sumfact/accumulation.py +++ b/python/dune/codegen/sumfact/accumulation.py @@ -15,6 +15,7 @@ from dune.codegen.generation import (backend, instruction, post_include, kernel_cached, + silenced_warning, temporary_variable, transform, valuearg @@ -30,7 +31,8 @@ from dune.codegen.pdelab.restriction import restricted_name from dune.codegen.pdelab.signatures import assembler_routine_name from dune.codegen.pdelab.geometry import world_dimension from dune.codegen.pdelab.spaces import name_lfs -from dune.codegen.sumfact.permutation import (permute_forward, +from dune.codegen.sumfact.permutation import (permute_backward, + permute_forward, sumfact_cost_permutation_strategy, sumfact_quadrature_permutation_strategy, ) @@ -239,6 +241,25 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): tags=frozenset({"sumfact_stage3"}), **args)}) + def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l): + # TODO: This should happen in stage 2 and not in stage 3 + shape = permute_backward(shape, self.cost_permutation) + inames = permute_backward(inames, self.cost_permutation) + + # Get a temporary that interprets the base storage of the input + # as a column-major matrix. In later iteration of the matrix loop + # this reinterprets the output of the previous iteration. + inp = buffer.get_temporary("buff_step{}_in".format(l), + shape=shape + vec_shape, + dim_tags=ftags, + ) + + # The input temporary will only be read from, so we need to silence + # the loopy warning + silenced_warning('read_no_write({})'.format(inp)) + + return prim.Subscript(prim.Variable(inp), inames + vec_iname) + @property def function_name_suffix(self): if get_form_option("fastdg"): diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py index ccc3440698263643f67da433afa0ab53bb436b37..1db2b4bfc117161b261339942b4e2ecd065553ed 100644 --- a/python/dune/codegen/sumfact/basis.py +++ b/python/dune/codegen/sumfact/basis.py @@ -14,8 +14,10 @@ from dune.codegen.generation import (backend, iname, instruction, kernel_cached, + silenced_warning, temporary_variable, ) +from dune.codegen.loopy.flatten import flatten_index from dune.codegen.loopy.target import type_floatingpoint from dune.codegen.sumfact.tabulation import (basis_functions_per_direction, construct_basis_matrix_sequence, @@ -107,24 +109,50 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): def direct_is_possible(self): return get_form_option("fastdg") - def realize(self, sf, insn_dep, index=0): + def setup_input(self, sf, insn_dep, index=0): + """Setup input for a sum factorization kernel function + + Write the coefficients into an array that can be passed to a sum + factorization kernel function (necessary if direct input "fastdg" is + not possible). + + index: Vectorization index + """ + # Inames for interating over the coefficients. We take them from the + # cost permuted matrix sequence. In order to get the inames in order + # x,y,... we need to take the permutation back. + shape_cost_permuted = tuple(mat.basis_size for mat in sf.matrix_sequence_cost_permuted) + shape_ordered = permute_backward(shape_cost_permuted, self.cost_permutation) + shape_ordered = permute_backward(shape_ordered, self.quadrature_permutation) + inames_cost_permuted = tuple(sumfact_iname(length, "setup_inames_" + str(k)) for k, length in enumerate(shape_cost_permuted)) + inames_ordered = permute_backward(inames_cost_permuted, self.cost_permutation) + inames_ordered = permute_backward(inames_ordered, self.quadrature_permutation) + + # The coefficient needs to be accessed with a flat index of inames ordered x,y,... + flat_index = flatten_index(tuple(prim.Variable(i) for i in inames_ordered), + shape_ordered, + order="f") + + # Get the coefficient container lfs = name_lfs(self.element, self.restriction, self.element_index) - basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") container = self.coeff_func(self.restriction) from dune.codegen.pdelab.argument import pymbolic_coefficient as pc - coeff = pc(container, lfs, basisiname) + coeff = pc(container, lfs, flat_index) - # Get the input temporary! + # The array that will be passed to the sum factorization kernel + # function should contain the coefficients in the cost permuted order! from dune.codegen.sumfact.realization import name_buffer_storage name = "input_{}".format(sf.buffer) + ftags = ",".join(["f"] * (sf.length + 1)) temporary_variable(name, - shape=(product(mat.basis_size for mat in sf.matrix_sequence_quadrature_permuted), sf.vector_width), + shape=(sf.vector_width,) + shape_cost_permuted, custom_base_storage=name_buffer_storage(sf.buffer, 0), managed=True, + dim_tags=ftags, ) - assignee = prim.Subscript(prim.Variable(name), - (prim.Variable(basisiname),) + (index,)) + (index,) + tuple(prim.Variable(i) for i in inames_cost_permuted)) + insn = instruction(assignee=assignee, expression=coeff, depends_on=sf.insn_dep.union(insn_dep), @@ -134,6 +162,16 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): return insn_dep.union(frozenset({insn})) def realize_direct(self, shape, inames, which=0): + # If the input comes directly from a global data structure inames are + # ordered x,y,z,... + # + # The inames and shape passed to this method come from the cost + # permuted matrix sequence so we need to permute them back + shape = permute_backward(shape, self.cost_permutation) + shape = permute_backward(shape, self.quadrature_permutation) + inames = permute_backward(inames, self.cost_permutation) + inames = permute_backward(inames, self.quadrature_permutation) + arg = "fastdg{}".format(which) from dune.codegen.sumfact.accumulation import _dof_offset @@ -145,16 +183,23 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): return prim.Subscript(prim.Variable(arg), inames) - def realize_input(self, shape, inames, which=0): - if self.direct_is_possible: - shape = permute_backward(shape, self.cost_permutation) - shape = permute_backward(shape, self.quadrature_permutation) - inames = permute_backward(inames, self.cost_permutation) - inames = permute_backward(inames, self.quadrature_permutation) + def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l): + # Note: Here we do not need to reverse any permutation since this is + # already done in the setup_input method above! - return self.realize_direct(shape, inames) - else: - raise NotImplementedError("TODO") + # Get a temporary that interprets the base storage of the input + # as a column-major matrix. In later iteration of the matrix loop + # this reinterprets the output of the previous iteration. + inp = buffer.get_temporary("buff_step{}_in".format(l), + shape=shape + vec_shape, + dim_tags=ftags, + ) + + # The input temporary will only be read from, so we need to silence + # the loopy warning + silenced_warning('read_no_write({})'.format(inp)) + + return prim.Subscript(prim.Variable(inp), inames + vec_iname) @property def function_name_suffix(self): diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py index ce2922a89739ac7077683f6caba9014dc0357e28..8652822725a24c53816ca7fe5cc20752ceb37bf3 100644 --- a/python/dune/codegen/sumfact/geometry.py +++ b/python/dune/codegen/sumfact/geometry.py @@ -15,6 +15,7 @@ from dune.codegen.generation import (backend, globalarg, valuearg, ) +from dune.codegen.loopy.flatten import flatten_index from dune.codegen.options import get_option from dune.codegen.pdelab.geometry import (local_dimension, world_dimension, @@ -26,9 +27,10 @@ from dune.codegen.pdelab.localoperator import (name_ansatz_gfs_constructor_param lop_template_range_field, ) from dune.codegen.pdelab.restriction import restricted_name -from dune.codegen.sumfact.accumulation import basis_sf_kernels +from dune.codegen.sumfact.accumulation import basis_sf_kernels, sumfact_iname from dune.codegen.sumfact.basis import construct_basis_matrix_sequence -from dune.codegen.sumfact.permutation import (permute_forward, +from dune.codegen.sumfact.permutation import (permute_backward, + permute_forward, sumfact_cost_permutation_strategy, sumfact_quadrature_permutation_strategy, ) @@ -113,15 +115,36 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord): def direct_is_possible(self): return False - def realize(self, sf, insn_dep, index=0): - # Note: world_dimension, since we only do evaluation of cell geometry mappings + def setup_input(self, sf, insn_dep, index=0): + # Inames for interating over the coefficients (in this case the + # coordinate of the component 'sefl.direction' of the corners). We take + # them from the cost permuted matrix sequence. In order to get the + # inames in order x,y,... we need to take the permutation back. + shape_cost_permuted = tuple(mat.basis_size for mat in sf.matrix_sequence_cost_permuted) + shape_ordered = permute_backward(shape_cost_permuted, self.cost_permutation) + shape_ordered = permute_backward(shape_ordered, self.quadrature_permutation) + inames_cost_permuted = tuple(sumfact_iname(length, "corner_setup_inames_" + str(k)) for k, length in enumerate(shape_cost_permuted)) + inames_ordered = permute_backward(inames_cost_permuted, self.cost_permutation) + inames_ordered = permute_backward(inames_ordered, self.quadrature_permutation) + + # Flat indices needed to access pdelab corner method + flat_index_ordered = flatten_index(tuple(prim.Variable(i) for i in inames_ordered), + shape_ordered, + order="f") + flat_index_cost_permuted = flatten_index(tuple(prim.Variable(i) for i in inames_cost_permuted), + shape_cost_permuted, + order="f") + + # The array that will be passed to the sum factorization kernel + # function should contain the coefficients in the cost permuted order! name = "input_{}".format(sf.buffer) + ftags = ",".join(["f"] * (sf.length + 1)) temporary_variable(name, - shape=(2 ** world_dimension(), sf.vector_width), + shape=(sf.vector_width,) + shape_cost_permuted, custom_base_storage=name_buffer_storage(sf.buffer, 0), managed=True, + dim_tags=ftags, ) - ciname = global_corner_iname(self.restriction) if self.restriction == 0: geo = name_geometry() @@ -132,23 +155,38 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord): # method does return a non-scalar, which does not fit into the current # loopy philosophy for function calls. This problem will be solved once # #11 is resolved. Admittedly, the code looks *really* ugly until that happens. - code = "{}[{}*{}+{}] = {}.corner({})[{}];".format(name, - sf.vector_width, - ciname, - index, - geo, - ciname, - self.direction, - ) + code = "{}[{}*({})+{}] = {}.corner({})[{}];".format(name, + sf.vector_width, + str(flat_index_cost_permuted), + index, + geo, + str(flat_index_ordered), + self.direction, + ) insn = instruction(code=code, - within_inames=frozenset({ciname}), + within_inames=frozenset(inames_cost_permuted), assignees=(name,), tags=frozenset({"sumfact_stage{}".format(sf.stage)}), ) return insn_dep.union(frozenset({insn})) + def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l): + # Get a temporary that interprets the base storage of the input + # as a column-major matrix. In later iteration of the matrix loop + # this reinterprets the output of the previous iteration. + inp = buffer.get_temporary("buff_step{}_in".format(l), + shape=shape + vec_shape, + dim_tags=ftags, + ) + + # The input temporary will only be read from, so we need to silence + # the loopy warning + silenced_warning('read_no_write({})'.format(inp)) + + return prim.Subscript(prim.Variable(inp), inames + vec_iname) + @backend(interface="spatial_coordinate", name="default") def pymbolic_spatial_coordinate_multilinear(do_predicates, visitor): diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py index 7207dd3829a4fc4e192278b2b43b5c9cbeaae8f0..daf872452a4a3c7db992e2831a3ae8f00451f927 100644 --- a/python/dune/codegen/sumfact/realization.py +++ b/python/dune/codegen/sumfact/realization.py @@ -75,8 +75,8 @@ def _realize_sum_factorization_kernel(sf): for buf in buffers: # Determine the necessary size of the buffer. We assume that we do not # underintegrate the form!!! - size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width, - product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width) + size = max(product(m.quadrature_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width, + product(m.basis_size for m in sf.matrix_sequence_cost_permuted) * sf.vector_width) temporary_variable("{}_dummy".format(buf), shape=(size,), custom_base_storage=buf, @@ -85,7 +85,7 @@ def _realize_sum_factorization_kernel(sf): # Realize the input if it is not direct if sf.stage == 1 and not sf.interface.direct_is_possible: - insn_dep = insn_dep.union(sf.interface.realize(sf, insn_dep)) + insn_dep = insn_dep.union(sf.interface.setup_input(sf, insn_dep)) # Trigger generation of the sum factorization kernel function qp = quadrature_points_per_direction() @@ -190,24 +190,20 @@ def realize_sumfact_kernel_function(sf): # * a value from a global data structure, broadcasted to a vector type # (vectorized + FastDGGridOperator) input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible: - input_summand = sf.interface.realize_input(inp_shape, input_inames) + input_summand = sf.interface.realize_direct(inp_shape, input_inames) + elif l == 0: + # TODO: Simplify arguments! + input_summand = sf.interface.realize_input(inp_shape, + input_inames, + vec_shape, + vec_iname, + buffer, + ftags, + l, + ) else: - # If we did permute the order of a matrices above we also - # permuted the order of out_inames. Unfortunately the - # order of our input is from 0 to d-1. This means we need - # to permute _back_ to get the right coefficients. - if l == 0: - inp_shape = permute_backward(inp_shape, sf.cost_permutation) - input_inames = permute_backward(input_inames, sf.cost_permutation) - if sf.stage == 1: - # In the unstructured case the sf.matrix_sequence_quadrature_permuted could - # already be permuted according to - # sf.interface.quadrature_permutation. We also need to reverse this - # permutation to get the input from 0 to d-1. - inp_shape = permute_backward(inp_shape, sf.interface.quadrature_permutation) - input_inames = permute_backward(input_inames, sf.interface.quadrature_permutation) - # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. @@ -237,6 +233,7 @@ def realize_sumfact_kernel_function(sf): # end and reverse permutation output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) if l == len(matrix_sequence) - 1: + # TODO: Move permutations to interface! output_inames = permute_backward(output_inames, sf.cost_permutation) if sf.stage == 3: output_inames = permute_backward(output_inames, sf.interface.quadrature_permutation) @@ -262,6 +259,7 @@ def realize_sumfact_kernel_function(sf): # If we are in the last step we reverse the permutation. output_shape = tuple(out_shape[1:]) + (out_shape[0],) if l == len(matrix_sequence) - 1: + # TODO: Move permutations to interface output_shape = permute_backward(output_shape, sf.cost_permutation) if sf.stage == 3: output_shape = permute_backward(output_shape, sf.interface.quadrature_permutation) diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py index 0b303c640952a98d40278e3b76fe6958e44a5d91..e51606dccad2fed8f4c5e736717b14ed6e057b0b 100644 --- a/python/dune/codegen/sumfact/symbolic.py +++ b/python/dune/codegen/sumfact/symbolic.py @@ -2,6 +2,7 @@ from dune.codegen.options import get_form_option, get_option from dune.codegen.generation import (get_counted_variable, + silenced_warning, subst_rule, transform, ) @@ -99,9 +100,10 @@ class SumfactKernelInterfaceBase(object): class VectorSumfactKernelInput(SumfactKernelInterfaceBase): - def __init__(self, interfaces): + def __init__(self, interfaces, perm): assert(isinstance(interfaces, tuple)) self.interfaces = interfaces + self.vector_cost_permutation = perm def __repr__(self): return "_".join(repr(i) for i in self.interfaces) @@ -124,7 +126,7 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): for i in self.interfaces: assert i.cost_permutation == cost_permutation - return cost_permutation + return vector_cost_permutation @property def stage(self): @@ -134,12 +136,14 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): def direct_is_possible(self): return all(i.direct_is_possible for i in self.interfaces) - def realize(self, sf, dep): + def setup_input(self, sf, dep): for i, inp in enumerate(self.interfaces): - dep = dep.union(inp.realize(sf, dep, index=i)) + dep = dep.union(inp.setup_input(sf, dep, index=i)) return dep def realize_direct(self, shape, inames): + # TODO: vector_cost_permutation not used! + # Check whether the input exhibits a favorable structure # (whether we can broadcast scalar values into SIMD registers) total = set(self.interfaces) @@ -166,16 +170,22 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): # need to load scalars into the SIMD vector. raise NotImplementedError("SIMD loads from scalars not implemented!") - def realize_input(self, shape, inames): - if self.direct_is_possible: - shape = permute_backward(shape, self.cost_permutation) - shape = permute_backward(shape, self.quadrature_permutation) - inames = permute_backward(inames, self.cost_permutation) - inames = permute_backward(inames, self.quadrature_permutation) + def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l): + # TODO: vector_cost_permutation not used! - return self.realize_direct(shape, inames) - else: - raise NotImplementedError("TODO") + # Get a temporary that interprets the base storage of the input + # as a column-major matrix. In later iteration of the matrix loop + # this reinterprets the output of the previous iteration. + inp = buffer.get_temporary("buff_step{}_in".format(l), + shape=shape + vec_shape, + dim_tags=ftags, + ) + + # The input temporary will only be read from, so we need to silence + # the loopy warning + silenced_warning('read_no_write({})'.format(inp)) + + return prim.Subscript(prim.Variable(inp), inames + vec_iname) @property def function_args(self): @@ -198,8 +208,9 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): - def __init__(self, interfaces): + def __init__(self, interfaces, perm): self.interfaces = interfaces + self.vector_cost_permutation = perm def __repr__(self): return "_".join(repr(o) for o in self.interfaces) @@ -231,6 +242,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return prim.Call(prim.Variable(hadd_function), (result,)) def realize(self, sf, result, insn_dep): + # TODO: vector_cost_permutation not used! + outputs = set(self.interfaces) trial_element, = set(o.trial_element for o in self.interfaces) @@ -250,6 +263,8 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return deps def realize_direct(self, result, inames, shape, **args): + # TODO: vector_cost_permutation not used! + outputs = set(self.interfaces) # If multiple horizontal_add's are to be performed with 'result' @@ -268,6 +283,25 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return deps + def realize_input(self, shape, inames, vec_shape, vec_iname, buffer, ftags, l): + # TODO: Include permutations of scalar kernels as soon as they could be different + shape = permute_backward(shape, self.vector_cost_permutation) + inames = permute_backward(inames, self.vector_cost_permutation) + + # Get a temporary that interprets the base storage of the input + # as a column-major matrix. In later iteration of the matrix loop + # this reinterprets the output of the previous iteration. + inp = buffer.get_temporary("buff_step{}_in".format(l), + shape=shape + vec_shape, + dim_tags=ftags, + ) + + # The input temporary will only be read from, so we need to silence + # the loopy warning + silenced_warning('read_no_write({})'.format(inp)) + + return prim.Subscript(prim.Variable(inp), inames + vec_iname) + @property def function_args(self): if get_form_option("fastdg"): @@ -759,6 +793,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def matrix_sequence_quadrature_permuted(self): + # TODO: This should be turned into a RuntimeError return tuple(BasisTabulationMatrixArray(tuple(k.matrix_sequence_quadrature_permuted[i] for k in self.kernels), width=self.vector_width, ) @@ -819,10 +854,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def interface(self): + perm = self.cost_permutation if self.stage == 1: - return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels)) + return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels), perm) else: - return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels)) + return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels), perm) @property def cache_key(self):