diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index d77f8145a16cece8a7cc877886e24a478da9d709..d09a91b7b27e5248867c268d4136509d357754fa 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -203,7 +203,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): vecinames = () # TODO: evaluate whether the following line would be okay with vsf.vectorized if vsf.vec_index(sf) is not None: - iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec") + iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec") vecinames = (iname,) transform(lp.tag_inames, [(iname, "vec")]) from dune.perftool.tools import maybe_wrap_subscript diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index e2e375a29d993f3fb8890615ce6c932562816dcd..583f8f4d18b7a9d5fcae8e614e8cc158016fa7d7 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -95,7 +95,7 @@ def _realize_sum_factorization_kernel(sf): if sf.vectorized: ftags = ftags + ",vec" ctags = ctags + ",vec" - vec_shape = (4,) + vec_shape = (sf.vector_width,) # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index a9a0dd4c9e12be9e9ca3961f03274554eabf7e05..37fa5e3fc7b453192a8a3dbe974b1e6193659500 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -344,6 +344,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) transpose=self.kernels[0].matrix_sequence[i].transpose, face=self.kernels[0].matrix_sequence[i].face, derivative=tuple(k.matrix_sequence[i].derivative for k in self.kernels), + width=self.vector_width, ) for i in range(self.length)) @@ -419,14 +420,14 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def flat_input_shape(self): - return (product(mat.cols for mat in self.matrix_sequence), 4) + return (product(mat.cols for mat in self.matrix_sequence), self.vector_width) @property def quadrature_shape(self): if self.transposed: - return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (4,) + return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,) else: - return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (4,) + return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,) @property def quadrature_dimtags(self): @@ -436,7 +437,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable) @property def dof_shape(self): - return tuple(mat.rows for mat in self.matrix_sequence) + (4,) + return tuple(mat.rows for mat in self.matrix_sequence) + (self.vector_width,) @property def dof_dimtags(self): diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index c6fa99e3b28a1a9f1a499d6559461a5ec1af7cc2..e64659851c51eea8be01dd3e69f8f258c799f0ad 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -59,7 +59,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase): class BasisTabulationMatrixArray(BasisTabulationMatrixBase): - def __init__(self, rows, cols, transpose, derivative, face): + def __init__(self, rows, cols, transpose, derivative, face, width): assert isinstance(derivative, tuple) BasisTabulationMatrixBase.__init__(self, rows=rows, @@ -67,6 +67,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): transpose=transpose, derivative=derivative, face=face, + width=width, ) @property @@ -77,13 +78,22 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): quadrature_points_per_direction(), ) for i, d in enumerate(self.derivative): - define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,)) + define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,), width=self.width) + + # Apply padding to those fields not used. This is necessary because you may get memory + # initialized with NaN and those NaNs will screw the horizontal_add. + i = theta_iname("i", self.rows) + j = theta_iname("j", self.cols) + for index in set(range(self.width)) - set(range(len(self.derivative))): + instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j), index)), + expression=0, + kernel="operator") return loopy_class_member(name, classtag="operator", dtype=numpy.float64, dim_tags="f,f,vec", - shape=(self.rows, self.cols, 4), + shape=(self.rows, self.cols, self.width), potentially_vectorized=True, managed=True, ) @@ -214,7 +224,7 @@ def polynomial_lookup_mangler(target, func, dtypes): return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64))) -def define_theta(name, shape, transpose, derivative, face=None, additional_indices=()): +def define_theta(name, shape, transpose, derivative, face=None, additional_indices=(), width=None): qp = name_oned_quadrature_points() qw = name_oned_quadrature_weights() sort_quadrature_points_weights(qp, qw) @@ -223,7 +233,7 @@ def define_theta(name, shape, transpose, derivative, face=None, additional_indic dim_tags = "f,f" if additional_indices: dim_tags = dim_tags + ",c" - shape = shape + (4,) + shape = shape + (width,) loopy_class_member(name, dtype=numpy.float64, diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 84c7e6150e9d2d40176f0b8a1817d16752e0d32e..460ae1e9fb77ffac0b26e7fb733b4ffa4e10adb8 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -1,5 +1,6 @@ """ Sum factorization vectorization """ +from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.generation import (generator_factory, get_counted_variable, @@ -13,6 +14,7 @@ from dune.perftool.error import PerftoolError from dune.perftool.options import get_option import loopy as lp +import numpy as np @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o) @@ -40,10 +42,16 @@ def no_vectorization(sumfacts): def horizontal_vectorization_strategy(sumfacts): - if len(sumfacts) in (3, 4): + width = get_vcl_type_size(np.float64) + # We currently heuristically allow horizontal vectorization if the number + # of kernels matches the maximum vector width or is one below that. There + # is a lot of TODOs with that procedure: + # * You might have *more* kernels than vector width + # * You might want to combine horizontal and vertical strategies + if len(sumfacts) in (width - 1, width): # Map the sum factorization to their position in the joint kernel position_mapping = {} - available = set(range(4)) + available = set(range(width)) for sf in sumfacts: if sf.preferred_position is not None: # This asserts that no two kernels want to take the same position @@ -69,7 +77,7 @@ def horizontal_vectorization_strategy(sumfacts): for sumf in sumfacts: _cache_vectorization_info(sumf, VectorizedSumfactKernel(kernels=kernels, - vector_width=4, + vector_width=width, buffer=buffer, input=input, )