diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py index e360c84775a05fdbd30591dc9f8d7c4b9b26d068..516da6f08b6c2825f616c925448175b61d454a43 100644 --- a/python/dune/codegen/sumfact/accumulation.py +++ b/python/dune/codegen/sumfact/accumulation.py @@ -135,7 +135,7 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): return ImmutableRecord.__repr__(self) def get_keyword_arguments(self): - """Get dictionary of keyword arguments needed to initialize this classIFIERS + """Get dictionary of keyword arguments needed to initialize this class Extract keyword arguments from the ImmutableRecord and modify accordingly. You need to set the correct matrix sequence before using @@ -172,7 +172,10 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): return lfs_inames(get_leaf(self.trial_element, self.trial_element_index), self.restriction) def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): - # TODO: This should happen in stage 2 and not in stage 3 + # The result of stage 2 has the correct quadrature permutation but no + # cost permutation is applied. The inames for this method are + # quadrature and cost permuted. This means we need to reverse the cost + # permutation to access the result of stage 2 in the correct way. shape = permute_backward(shape, self.cost_permutation) inames = permute_backward(inames, self.cost_permutation) @@ -258,9 +261,8 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord): return frozenset({dep}) - def realize_direct_output(self, result, inames, shape, which=0, reverse_cost_permutation=True, **args): - if reverse_cost_permutation: - inames = permute_backward(inames, self.cost_permutation) + def realize_direct_output(self, result, inames, shape, which=0, **args): + inames = permute_backward(inames, self.cost_permutation) inames = permute_backward(inames, self.quadrature_permutation) direct_output = "fastdg{}".format(which) diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py index 8ab14a7d2f060bf0898bc76d58c1d3efc1e953c1..049e0be9f5adfae7d48ef9adc40a951adcc55924 100644 --- a/python/dune/codegen/sumfact/basis.py +++ b/python/dune/codegen/sumfact/basis.py @@ -95,7 +95,7 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord): return repr(self) def get_keyword_arguments(self): - """Get dictionary of keyword arguments needed to initialize this classIFIERS + """Get dictionary of keyword arguments needed to initialize this class Extract keyword arguments from the ImmutableRecord and modify accordingly. You need to set the correct matrix sequence before using diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py index 7cd4bf0245f297ee52c6ee52bc02575f4b5f5839..b95b348c43d5820aea9e7e6b2d32491f1fc03023 100644 --- a/python/dune/codegen/sumfact/geometry.py +++ b/python/dune/codegen/sumfact/geometry.py @@ -104,7 +104,7 @@ class GeoCornersInput(SumfactKernelInterfaceBase, ImmutableRecord): return repr(self) def get_keyword_arguments(self): - """Get dictionary of keyword arguments needed to initialize this classIFIERS + """Get dictionary of keyword arguments needed to initialize this class Extract keyword arguments from the ImmutableRecord and modify accordingly. You need to set the correct matrix sequence before using diff --git a/python/dune/codegen/sumfact/realization.py b/python/dune/codegen/sumfact/realization.py index a9bd0b8baddfbc7c269d92e360bb8ad3bd194910..f14599e5e7009997da267bfd16402d1371e6a6ec 100644 --- a/python/dune/codegen/sumfact/realization.py +++ b/python/dune/codegen/sumfact/realization.py @@ -194,7 +194,6 @@ def realize_sumfact_kernel_function(sf): if l == 0 and sf.stage == 1 and sf.interface.direct_is_possible: input_summand = sf.interface.realize_direct_input(input_inames, inp_shape) elif l == 0: - # TODO: Simplify arguments! input_summand = sf.interface.realize_input(input_inames, inp_shape, vec_iname, diff --git a/python/dune/codegen/sumfact/symbolic.py b/python/dune/codegen/sumfact/symbolic.py index 2b91aafa7ea2597e5b547d99671be3ac85949a81..b8b5b857728ea063edec3812b2132d210ffe93b3 100644 --- a/python/dune/codegen/sumfact/symbolic.py +++ b/python/dune/codegen/sumfact/symbolic.py @@ -31,8 +31,19 @@ import inspect class SumfactKernelInterfaceBase(object): - """ A base class for the input/output of a sum factorization kernel + """A base class for the input/output of a sum factorization kernel In stage 1, this represents the input object, in stage 3 the output object. + + Notes about permutations: + - setup_input: handle cost and quadrature permutation + - realize_input stage 1: no permutations + - realize_input stage 3: only cost permutation + - realize_direct_input: cost and quadrature permutation + - accumulate_output: no permutation + - realize_direct_output: cost and quadrature permutation + + In the vectorized case most permutation handling is forwarded to the scalar + kernels. """ def setup_input(self, sf, insn_dep, index=0): """Create and fill an input array for a stage 1 sumfact kernel function (non fastdg) @@ -57,7 +68,9 @@ class SumfactKernelInterfaceBase(object): Stage 1 : Input is already permuted the right way in setup_input. - Stage 3 : TODO -> Check permutation in accumulation.py + Stage 3 : The inames are cost and quadrature permuted but the input is + only quadrature permuted. This means we need to reverse the cost + permutation on the inames. Parameters ---------- @@ -118,12 +131,12 @@ class SumfactKernelInterfaceBase(object): """ raise NotImplementedError - def realize_direct_output(self, result, iname, shape, which=0, reverse_cost_permutation=True, **kwargs): + def realize_direct_output(self, result, iname, shape, which=0, **kwargs): """Accumulate results directly in the sumfact kernel function (fastdg) This happens inside the sumfact kernel function. - TODO: Add note about permutation + Needs to handle cost and quadrature permutation. Parameters ---------- @@ -133,8 +146,6 @@ class SumfactKernelInterfaceBase(object): shape : tuple of ints which : int TODO Doc me! - reverse_cost_permutation : tuple of ints - TODO Doc me! **kwargs : Key word arguments passed to loopy instruction """ @@ -209,33 +220,30 @@ class SumfactKernelInterfaceBase(object): class VectorSumfactKernelInput(SumfactKernelInterfaceBase): - def __init__(self, interfaces, perm): + def __init__(self, interfaces): assert(isinstance(interfaces, tuple)) self.interfaces = interfaces - self.vector_cost_permutation = perm def __repr__(self): return "_".join(repr(i) for i in self.interfaces) @property def quadrature_permutation(self): - # TODO: For now we assure that all kerneles have the same quadrature_permutation + # TODO: For now we only vectorize sumfact kernels with the same + # quadrature permutation. This should be extended. for i in self.interfaces: assert i.quadrature_permutation == self.interfaces[0].quadrature_permutation return self.interfaces[0].quadrature_permutation @property def cost_permutation(self): - # The cost_permutation of the underlying scalar SumfactKernel can be - # different for each kernel. - # raise RuntimeError("cost_permutation should not be called on VectorSumfactKernelInput") - - # TODO! - cost_permutation = self.interfaces[0].cost_permutation + # This should hold true due to the choice of quadrature + # permutation. For both structured and unstructured grids the order of + # the global directions should be the same leading to the same cost + # permutation for all those sum factorization kernels. for i in self.interfaces: - assert i.cost_permutation == cost_permutation - - return self.vector_cost_permutation + assert i.cost_permutation == self.interfaces[0].cost_permutation + return self.interfaces[0].cost_permutation @property def stage(self): @@ -251,8 +259,6 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): return dep def realize_input(self, inames, shape, vec_iname, vec_shape, buf, ftags): - # TODO: vector_cost_permutation not used! - # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the matrix loop # this reinterprets the output of the previous iteration. @@ -268,8 +274,6 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): return prim.Subscript(prim.Variable(inp), inames + vec_iname) def realize_direct_input(self, inames, shape): - # TODO: vector_cost_permutation not used! - # Check whether the input exhibits a favorable structure # (whether we can broadcast scalar values into SIMD registers) total = set(self.interfaces) @@ -317,25 +321,26 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase): class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): - def __init__(self, interfaces, perm): + def __init__(self, interfaces): self.interfaces = interfaces - self._cost_permutation = perm def __repr__(self): return "_".join(repr(o) for o in self.interfaces) @property def cost_permutation(self): - return self._cost_permutation + # This should hold true due to the choice of quadrature + # permutation. For both structured and unstructured grids the order of + # the global directions should be the same leading to the same cost + # permutation for all those sum factorization kernels. + for i in self.interfaces: + assert i.cost_permutation == self.interfaces[0].cost_permutation + return self.interfaces[0].cost_permutation @property def quadrature_permutation(self): - # TODO: This should be turned into an error. - # - # - # The quadrature permutation could be different for different scalar kernels! - # raise RuntimeError('quadrature_permutation should not be called on VectorSumfactKernelOutput') - # + # TODO: For now we only vectorize sumfact kernels with the same + # quadrature permutation. This should be extended . for i in self.interfaces: assert i.quadrature_permutation == self.interfaces[0].quadrature_permutation return self.interfaces[0].quadrature_permutation @@ -381,10 +386,6 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): return prim.Subscript(prim.Variable(inp), inames + vec_iname) def realize_direct_output(self, result, inames, shape, **args): - # TODO: Find out what needs to happen here - inames = permute_backward(inames, self.cost_permutation) - shape = permute_backward(shape, self.cost_permutation) - outputs = set(self.interfaces) # If multiple horizontal_add's are to be performed with 'result' @@ -403,7 +404,6 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase): inames, shape, which=which, - reverse_cost_permutation=False, **args)) return deps @@ -610,7 +610,6 @@ class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): # quadrature_permutation. This should be handled like upper/lower # vectorization return tuple(m.quadrature_size for m in self.matrix_sequence_quadrature_permuted) + tuple(m.basis_size for m in self.matrix_sequence_quadrature_permuted) + (self.stage, self.buffer, self.interface.within_inames) + (self.interface.direct_is_possible, self.interface.quadrature_permutation) - # return tuple(m.quadrature_size for m in self.matrix_sequence_quadrature_permuted) + tuple(m.basis_size for m in self.matrix_sequence_quadrature_permuted) + (self.stage, self.buffer, self.interface.within_inames) + (self.interface.direct_is_possible,) @property def cache_key(self): @@ -951,7 +950,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def cost_permutation(self): - return self.kernels[0].cost_permutation + raise RuntimeError("cost_permutation should not be used on VectorizedSumfactKernel.") @property def stage(self): @@ -1001,11 +1000,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def interface(self): - perm = self.cost_permutation if self.stage == 1: - return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels), perm) + return VectorSumfactKernelInput(tuple(k.interface for k in self.kernels)) else: - return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels), perm) + return VectorSumfactKernelOutput(tuple(k.interface for k in self.kernels)) @property def cache_key(self):