diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 48e441cb819a07ac1acce36a01edd8174e9d7d0a..ac6b481e4888ef07b93d72bd61c7051ef1f85b25 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -116,7 +116,7 @@ class DuneCExpressionToCodeMapper(CExpressionToCodeMapper): x = self.rec(expr.numerator, PREC_PRODUCT) return "{} & {}".format(x, n - 1) else: - return CExpressionToCodeMapper.map_remainder(expr, enclosing_prec) + return CExpressionToCodeMapper.map_remainder(self, expr, enclosing_prec) class DuneASTBuilder(CASTBuilder): diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 2e84ceac0dfa509046b2435140e2a0ab6d608848..49e154ec0143a599d2b8977aeb8ddf614b9a99d4 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -67,7 +67,7 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v # if the positioning within a SIMD vectors coincides with the index! direct_indexing_is_possible = True - buffers = [] + expressions = [] insn_dep = frozenset() for i in range(world_dimension()): # Construct the matrix sequence for this sum factorization @@ -97,7 +97,7 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v from dune.perftool.sumfact.realization import realize_sum_factorization_kernel var, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep))) - buffers.append(var) + expressions.append(prim.Subscript(var, vsf.quadrature_index(sf))) # Check whether we want to return early with something that has the indexing # already handled! This happens with vectorization when the index coincides @@ -107,12 +107,11 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v return maybe_wrap_subscript(var, vsf.quadrature_index(sf, visitor.indices)), None # TODO this should be quite conditional!!! - for i, buf in enumerate(buffers): + for i, expr in enumerate(expressions): # Write solution from sumfactorization to gradient variable assignee = prim.Subscript(prim.Variable(name), i) - expression = prim.Subscript(buf, vsf.quadrature_index(sf)) instruction(assignee=assignee, - expression=expression, + expression=expr, forced_iname_deps=frozenset(get_backend("quad_inames")()), forced_iname_deps_is_final=True, ) diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index 3bd7fe09e1b2dd1c070e53c1d0eed32be48eb6f8..9a3a1f5666336e33a00b7bd0a0fbbcd756451c5c 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -412,7 +412,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def padded_indices(self): - indices = set(range(self.horizontal_width)) - set(range(len(self.kernels))) + indices = set(range(self.vector_width)) + for h in range(len(self.kernels)): + for v in range(self.vertical_width): + indices.discard(h * self.horizontal_width + v) return tuple(self.kernels[0].quadrature_index(None) + (i,) for i in indices) @property @@ -439,10 +442,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) return True def horizontal_index(self, sf): - try: - return self.kernels.index(sf) - except ValueError: - return 0 + key = tuple(mat.derivative for mat in sf.matrix_sequence) + for i, k in enumerate(self.kernels): + if tuple(mat.derivative for mat in k.matrix_sequence) == key: + return i + return 0 def _sliced_expr(self): quad_inames = quadrature_inames() @@ -486,7 +490,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) sliced = prim.Variable(quad_inames[i]) i = i + 1 - return self.horizontal_index(sf) * self.vertical_width + sliced % self.vertical_width + return self.horizontal_index(sf) + prim.Remainder(sliced, self.vertical_width) def vec_index_tuple(self, sf): return (self.vec_index(sf),) diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 7cef74265d02262fef810b50a3eec2fb68cd010a..b856ccc76afc649e8af310f9fdfebaf5df59f4d7 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -47,37 +47,51 @@ def no_vectorization(sumfacts): def vertical_vectorization_strategy(sumfact, depth): # Assert that this is not already sliced assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) + result = {} # Determine which of the matrices in the kernel should be sliced - def determine_slice_direction(): - for i, mat in enumerate(sumfact.matrix_sequence): + def determine_slice_direction(sf): + for i, mat in enumerate(sf.matrix_sequence): if mat.quadrature_size % depth == 0: return i elif mat.quadrature_size != 1: raise PerftoolError("Vertical vectorization is not possible!") - sliced = determine_slice_direction() - - kernels = [] - oldtab = sumfact.matrix_sequence[sliced] - for i in range(depth): - seq = list(sumfact.matrix_sequence) - seq[sliced] = oldtab.copy(slice_size=depth, - slice_index=i) - kernels.append(sumfact.copy(matrix_sequence=tuple(seq))) - - buffer = get_counted_variable("vertical_buffer") - input = get_counted_variable("vertical_input") + if isinstance(sumfact, SumfactKernel): + kernels = [sumfact] + else: + assert isinstance(sumfact, VectorizedSumfactKernel) + kernels = sumfact.kernels + + newkernels = [] + for sf in kernels: + sliced = determine_slice_direction(sf) + oldtab = sf.matrix_sequence[sliced] + for i in range(depth): + seq = list(sf.matrix_sequence) + seq[sliced] = oldtab.copy(slice_size=depth, + slice_index=i) + newkernels.append(sf.copy(matrix_sequence=tuple(seq))) + + if isinstance(sumfact, SumfactKernel): + buffer = get_counted_variable("vertical_buffer") + input = get_counted_variable("vertical_input") + result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels), + buffer=buffer, + input=input, + vertical_width=depth, + ) + else: + assert isinstance(sumfact, VectorizedSumfactKernel) + for sf in kernels: + result[sf] = sumfact.copy(kernels=tuple(newkernels), + vertical_width=depth, + ) - vsf = VectorizedSumfactKernel(kernels=tuple(kernels), - buffer=buffer, - input=input, - vertical_width=depth, - ) - return {sumfact: vsf} + return result -def horizontal_vectorization_strategy(sumfacts, width): +def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1): result = {} todo = set(sumfacts) @@ -107,20 +121,31 @@ def horizontal_vectorization_strategy(sumfacts, width): input = get_counted_variable("joined_input") for sumf in kernels: - if len(kernels) in (width, width - 1): + if len(kernels) in range(width - allow_padding, width + 1): result[sumf] = VectorizedSumfactKernel(kernels=kernels, horizontal_width=width, buffer=buffer, input=input, ) - else: - result[sumf] = no_vec(sumf) - return result -def diagonal_vectorization_stratget(sumfacts, width): - return horizontal_vectorization_strategy(sumfacts, width) +def diagonal_vectorization_strategy(sumfacts, width): + if width == 4: + horizontal, vertical = 2, 2 + else: + raise NotImplementedError + + result = {} + + horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0) + + for sf in horizontal_kernels: + vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width) + for k in vert: + result[k] = vert[k] + + return result def decide_vectorization_strategy(): @@ -145,11 +170,14 @@ def decide_vectorization_strategy(): width = get_vcl_type_size(np.float64) sfdict.update(**vertical_vectorization_strategy(sumfact, width)) elif get_option("vectorize_diagonal"): - width = get_vcl_type_size(np.float64) - sfdict.update(**diagonal_vectorization_stragegy(sumfact, width)) + inputkeys = set(sf.input_key for sf in sumfacts) + for inputkey in inputkeys: + width = get_vcl_type_size(np.float64) + sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey] + sfdict.update(**diagonal_vectorization_strategy(sumfact_filter, width)) else: sfdict.update(**no_vectorization(sumfacts)) # Register the results - for old, new in sfdict.items(): - _cache_vectorization_info(old, new) + for sf in sumfacts: + _cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf)))