diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py index a0a7de0172f6310067ddd7a72160a595889fdcaa..a592c4610f87de0d38bb7da242f3ec8f798b7974 100644 --- a/python/dune/perftool/loopy/symbolic.py +++ b/python/dune/perftool/loopy/symbolic.py @@ -3,7 +3,7 @@ Use this module to insert pymbolic nodes and the likes. """ from dune.perftool.error import PerftoolError -from dune.perftool.sumfact.symbolic import SumfactKernel +from dune.perftool.sumfact.symbolic import SumfactKernel, VectorizedSumfactKernel from pymbolic.mapper.substitutor import make_subst_func import loopy as lp @@ -111,6 +111,15 @@ lp.symbolic.DependencyMapper.map_sumfact_kernel = dependency_map_sumfact_kernel lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_sumfact_kernel = needs_resolution lp.type_inference.TypeInferenceMapper.map_sumfact_kernel = needs_resolution +# VectorizedSumfactKernel node +lp.symbolic.IdentityMapper.map_vectorized_sumfact_kernel = identity_map_sumfact_kernel +lp.symbolic.SubstitutionMapper.map_vectorized_sumfact_kernel = lp.symbolic.SubstitutionMapper.map_variable +lp.symbolic.WalkMapper.map_vectorized_sumfact_kernel = walk_map_sumfact_kernel +lp.symbolic.StringifyMapper.map_vectorized_sumfact_kernel = stringify_map_sumfact_kernel +lp.symbolic.DependencyMapper.map_vectorized_sumfact_kernel = dependency_map_sumfact_kernel +lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_vectorized_sumfact_kernel = needs_resolution +lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_resolution + # FusedMultiplyAdd node lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 56ad22cba11a0a501c1c8a285240311d4ddc18c3..d09a91b7b27e5248867c268d4136509d357754fa 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -103,24 +103,24 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ) from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) # Make sure we have a buffer that we can set up the input with - buffer = sf.buffer + buffer = vsf.buffer if buffer is None: buffer = get_counted_variable("buffer") - vectag = frozenset({"gradvec"}) if sf.index is not None else frozenset() + vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() temp = get_buffer_temporary(buffer, - shape=sf.quadrature_shape, - dim_tags=sf.quadrature_dimtags, - name=sf.input, + shape=vsf.quadrature_shape, + dim_tags=vsf.quadrature_dimtags, + name=vsf.input, ) # Those input fields, that are padded need to be set to zero # in order to do a horizontal_add later on - for pad in sf.padding: + for pad in vsf.padding: assignee = prim.Subscript(prim.Variable(temp), tuple(prim.Variable(i) for i in quadrature_inames()) + (pad,)) instruction(assignee=assignee, @@ -156,7 +156,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Issue an instruction in the quadrature loop that fills the buffer # with the evaluation of the contribution at all quadrature points assignee = prim.Subscript(prim.Variable(temp), - tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index) + tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf)) contrib_dep = instruction(assignee=assignee, expression=expression, forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), @@ -174,7 +174,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): within_inames=frozenset(visitor.inames))}) inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) - for i, mat in enumerate(sf.matrix_sequence)) + for i, mat in enumerate(vsf.matrix_sequence)) # Collect the lfs and lfs indices for the accumulate call test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames), @@ -196,13 +196,14 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - result, insn_dep = realize_sum_factorization_kernel(sf.copy(insn_dep=insn_dep)) + result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=insn_dep)) # Determine the expression to accumulate with. This depends on the vectorization strategy! result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) vecinames = () - if sf.index is not None: - iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec") + # TODO: evaluate whether the following line would be okay with vsf.vectorized + if vsf.vec_index(sf) is not None: + iname = accum_iname((accterm.argument.restriction, restriction), vsf.vector_width, "vec") vecinames = (iname,) transform(lp.tag_inames, [(iname, "vec")]) from dune.perftool.tools import maybe_wrap_subscript diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index b4861eebf81923c6253475cfdb7c55cd553d08d4..3630d9f2750fa29874f7ee7e6bf722e581dfa0fc 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -85,16 +85,16 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v ) from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) - if i != sf.index: + if i != vsf.vec_index(sf): direct_indexing_is_possible = False # Add a sum factorization kernel that implements the # evaluation of the gradients of basis functions at quadrature # points (stage 1) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - var, insn_dep = realize_sum_factorization_kernel(sf) + var, insn_dep = realize_sum_factorization_kernel(vsf) buffers.append(var) @@ -132,17 +132,16 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor): component=component, ) - # TODO: Move this away! from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) # Add a sum factorization kernel that implements the evaluation of # the basis functions at quadrature points (stage 1) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - var, _ = realize_sum_factorization_kernel(sf) + var, _ = realize_sum_factorization_kernel(vsf) return prim.Subscript(var, - tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index + tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf) ), visitor.indices diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index dcc06af6b0ee4121934fc4ecc312493342abc1da..583f8f4d18b7a9d5fcae8e614e8cc158016fa7d7 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -37,11 +37,13 @@ def realize_sum_factorization_kernel(sf, **kwargs): if get_global_context_value("dry_run", False): return sf, sf.insn_dep else: - _realize_input(sf) return _realize_sum_factorization_kernel(sf, **kwargs) -def _realize_input(sf): +@generator_factory(item_tags=("sumfactkernel",), + context_tags=("kernel",), + cache_key_generator=lambda s, **kw: s.cache_key) +def _realize_sum_factorization_kernel(sf): # Set up the input for stage 1 if sf.stage == 1 and not get_option("fastdg"): assert sf.coeff_func @@ -53,22 +55,24 @@ def _realize_input(sf): name=sf.input ) - # Write initial coefficients into buffer - lfs = name_lfs(sf.element, sf.restriction, sf.component) - basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") - container = sf.coeff_func(sf.restriction) - coeff = pymbolic_coefficient(container, lfs, basisiname) - assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index) - instruction(assignee=assignee, - expression=coeff, - depends_on=sf.insn_dep, - ) - + def _write_input(inputsf): + # Write initial coefficients into buffer + lfs = name_lfs(inputsf.element, inputsf.restriction, inputsf.component) + basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") + container = inputsf.coeff_func(inputsf.restriction) + coeff = pymbolic_coefficient(container, lfs, basisiname) + assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index_tuple(inputsf)) + instruction(assignee=assignee, + expression=coeff, + depends_on=inputsf.insn_dep, + ) + + if sf.vectorized: + for inputsf in sf.kernels: + _write_input(inputsf) + else: + _write_input(sf) -@generator_factory(item_tags=("sumfactkernel",), - context_tags=("kernel",), - cache_key_generator=lambda s, **kw: s.cache_key) -def _realize_sum_factorization_kernel(sf): # Add a dependency on the input variable insn_dep = sf.insn_dep if sf.input: @@ -91,7 +95,7 @@ def _realize_sum_factorization_kernel(sf): if sf.vectorized: ftags = ftags + ",vec" ctags = ctags + ",vec" - vec_shape = (4,) + vec_shape = (sf.vector_width,) # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index fb03a88d5f423fcd079a94598663ffc69de4937e..37fa5e3fc7b453192a8a3dbe974b1e6193659500 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -1,13 +1,22 @@ """ A pymbolic node representing a sum factorization kernel """ -from pytools import ImmutableRecord +from dune.perftool.options import get_option +from dune.perftool.generation import get_counted_variable +from dune.perftool.sumfact.tabulation import BasisTabulationMatrixArray + +from pytools import ImmutableRecord, product import pymbolic.primitives as prim import loopy as lp +import frozendict import inspect -class SumfactKernel(ImmutableRecord, prim.Variable): +class SumfactKernelBase(object): + pass + + +class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def __init__(self, matrix_sequence=None, buffer=None, @@ -16,8 +25,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): restriction=0, within_inames=(), input=None, - padding=frozenset(), - index=None, insn_dep=frozenset(), coeff_func=None, element=None, @@ -92,7 +99,7 @@ class SumfactKernel(ImmutableRecord, prim.Variable): component: The treepath to the correct component of above element accumvar: The accumulation variable to accumulate into """ - # Check the input and apply defaults where necessary + # Assert the inputs! assert isinstance(matrix_sequence, tuple) from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence) @@ -163,31 +170,25 @@ class SumfactKernel(ImmutableRecord, prim.Variable): @property def vectorized(self): - return next(iter(self.matrix_sequence)).vectorized + return False @property def transposed(self): return next(iter(self.matrix_sequence)).transpose - @property - def vec_index(self): - """ A tuple with the vector index + def vec_index(self, sf): + """ Map an unvectorized sumfact kernel object to its position + in the vectorized kernel + """ + return None - Defaults to the empty tuple to allow addition to any - existing index tuple """ - if self.index is not None: - return (self.index,) - else: - return () + def vec_index_tuple(self, sf): + return () @property def flat_input_shape(self): """ The 'flat' input tensor shape """ - from pytools import product - shape = (product(mat.cols for mat in self.matrix_sequence),) - if self.vectorized: - shape = shape + (4,) - return shape + return (product(mat.cols for mat in self.matrix_sequence),) @property def quadrature_shape(self): @@ -196,12 +197,9 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account the lower dimensionality of faces and vectorization. """ if self.transposed: - shape = tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) else: - shape = tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) - if self.vectorized: - shape = shape + (4,) - return shape + return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) @property def quadrature_dimtags(self): @@ -210,8 +208,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account the lower dimensionality of faces and vectorization. """ tags = ["f"] * len(self.quadrature_shape) - if self.vectorized: - tags[-1] = 'c' return ",".join(tags) @property @@ -221,8 +217,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account vectorization. """ shape = tuple(mat.rows for mat in self.matrix_sequence) - if self.vectorized: - shape = shape + (4,) return shape @property @@ -232,8 +226,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account vectorization. """ tags = ["f"] * len(self.dof_shape) - if self.vectorized: - tags[-1] = 'vec' return ",".join(tags) @property @@ -251,13 +243,221 @@ class SumfactKernel(ImmutableRecord, prim.Variable): return self.dof_dimtags def output_to_pymbolic(self, name): - if self.vectorized: - return lp.TaggedVariable(name, "vector") - else: - return lp.TaggedVariable(name, "sumfac") + return lp.TaggedVariable(name, "sumfac") + + # + # Define properties for conformity with the interface of VectorizedSumfactKernel + # + + @property + def padding(self): + return set() # Extract the argument list and store it on the class. This needs to be done # outside of the class because the SumfactKernel class object needs to be fully # initialized in order to extract the information from __init__. SumfactKernel.init_arg_names = tuple(inspect.getargspec(SumfactKernel.__init__)[0][1:]) + + +class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): + def __init__(self, + kernels=None, + vector_width=get_option('max_vector_width'), + buffer=None, + input=None, + insn_dep=frozenset(), + ): + # Assert the input data structure + assert isinstance(kernels, tuple) + assert all(isinstance(k, SumfactKernel) for k in kernels) + + # Assert all the properties that need to be the same across all subkernels + assert len(set(k.stage for k in kernels)) == 1 + assert len(set(k.length for k in kernels)) == 1 + assert len(set(k.restriction for k in kernels)) == 1 + assert len(set(k.within_inames for k in kernels)) == 1 + + # Assert properties of the matrix sequence of the underlying kernels + for i in range(kernels[0].length): + assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1 + + # Join the instruction dependencies of all subkernels + insn_dep = insn_dep.union(k.insn_dep for k in kernels) + + # We currently assume that all subkernels are consecutive, 0-based within the vector + assert None not in kernels + + ImmutableRecord.__init__(self, + kernels=kernels, + vector_width=vector_width, + buffer=buffer, + input=input, + insn_dep=insn_dep, + ) + + prim.Variable.__init__(self, "VecSUMFAC") + + def __getinitargs__(self): + return (self.kernels, self.vector_width, self.buffer, self.input, self.insn_dep) + + def stringifier(self): + return lp.symbolic.StringifyMapper + + mapper_method = "map_vectorized_sumfact_kernel" + + init_arg_names = ("kernels", "vector_width", "buffer", "input", "insn_dep") + + # + # Some cache key definitions + # Watch out for the documentation to see which key is used unter what circumstances + # + + @property + def cache_key(self): + """ The cache key that can be used in generation magic + Any two sum factorization kernels having the same cache_key + are realized simulatenously! + """ + return (self.matrix_sequence, self.restriction, self.stage, self.buffer) + + @property + def input_key(self): + """ A cache key for the input coefficients + Any two sum factorization kernels having the same input_key + work on the same input coefficient (and are suitable for simultaneous + treatment because of that) + """ + return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar) + + # + # Deduce all data fields of normal sum factorization kernels from the underlying kernels + # + + @property + def matrix_sequence(self): + return tuple(BasisTabulationMatrixArray(rows=self.kernels[0].matrix_sequence[i].rows, + cols=self.kernels[0].matrix_sequence[i].cols, + transpose=self.kernels[0].matrix_sequence[i].transpose, + face=self.kernels[0].matrix_sequence[i].face, + derivative=tuple(k.matrix_sequence[i].derivative for k in self.kernels), + width=self.vector_width, + ) + for i in range(self.length)) + + @property + def stage(self): + return self.kernels[0].stage + + @property + def restriction(self): + return self.kernels[0].restriction + + @property + def within_inames(self): + return self.kernels[0].within_inames + + @property + def coeff_func(self): + assert len(set(k.coeff_func for k in self.kernels)) == 1 + return self.kernels[0].coeff_func + + @property + def element(self): + assert len(set(k.element for k in self.kernels)) == 1 + return self.kernels[0].element + + @property + def component(self): + assert len(set(k.component for k in self.kernels)) == 1 + return self.kernels[0].component + + @property + def accumvar(self): + assert len(set(k.accumvar for k in self.kernels)) == 1 + return self.kernels[0].accumvar + + # + # Define some properties only needed for this one + # + + @property + def padding(self): + return set(range(self.vector_width)) - set(range(len(self.kernels))) + + # + # Define the same properties the normal SumfactKernel defines + # + + @property + def cache_key(self): + return tuple(k.cache_key for k in self.kernels) + + @property + def input_key(self): + return tuple(k.input_key for k in self.kernels) + + @property + def length(self): + return self.kernels[0].length + + @property + def vectorized(self): + return True + + @property + def transposed(self): + return self.kernels[0].transposed + + def vec_index(self, sf): + return self.kernels.index(sf) + + def vec_index_tuple(self, sf): + return (self.vec_index(sf),) + + @property + def flat_input_shape(self): + return (product(mat.cols for mat in self.matrix_sequence), self.vector_width) + + @property + def quadrature_shape(self): + if self.transposed: + return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,) + else: + return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (self.vector_width,) + + @property + def quadrature_dimtags(self): + tags = ["f"] * len(self.quadrature_shape) + tags[-1] = 'c' + return ",".join(tags) + + @property + def dof_shape(self): + return tuple(mat.rows for mat in self.matrix_sequence) + (self.vector_width,) + + @property + def dof_dimtags(self): + tags = ["f"] * len(self.dof_shape) + tags[-1] = 'vec' + return ",".join(tags) + + @property + def output_shape(self): + if self.stage == 1: + return self.quadrature_shape + else: + return self.dof_shape + + @property + def output_dimtags(self): + if self.stage == 1: + return self.quadrature_dimtags + else: + return self.dof_dimtags + + def output_to_pymbolic(self, name): + return lp.TaggedVariable(name, "vector") diff --git a/python/dune/perftool/sumfact/tabulation.py b/python/dune/perftool/sumfact/tabulation.py index c6fa99e3b28a1a9f1a499d6559461a5ec1af7cc2..e64659851c51eea8be01dd3e69f8f258c799f0ad 100644 --- a/python/dune/perftool/sumfact/tabulation.py +++ b/python/dune/perftool/sumfact/tabulation.py @@ -59,7 +59,7 @@ class BasisTabulationMatrix(BasisTabulationMatrixBase): class BasisTabulationMatrixArray(BasisTabulationMatrixBase): - def __init__(self, rows, cols, transpose, derivative, face): + def __init__(self, rows, cols, transpose, derivative, face, width): assert isinstance(derivative, tuple) BasisTabulationMatrixBase.__init__(self, rows=rows, @@ -67,6 +67,7 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): transpose=transpose, derivative=derivative, face=face, + width=width, ) @property @@ -77,13 +78,22 @@ class BasisTabulationMatrixArray(BasisTabulationMatrixBase): quadrature_points_per_direction(), ) for i, d in enumerate(self.derivative): - define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,)) + define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,), width=self.width) + + # Apply padding to those fields not used. This is necessary because you may get memory + # initialized with NaN and those NaNs will screw the horizontal_add. + i = theta_iname("i", self.rows) + j = theta_iname("j", self.cols) + for index in set(range(self.width)) - set(range(len(self.derivative))): + instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j), index)), + expression=0, + kernel="operator") return loopy_class_member(name, classtag="operator", dtype=numpy.float64, dim_tags="f,f,vec", - shape=(self.rows, self.cols, 4), + shape=(self.rows, self.cols, self.width), potentially_vectorized=True, managed=True, ) @@ -214,7 +224,7 @@ def polynomial_lookup_mangler(target, func, dtypes): return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64))) -def define_theta(name, shape, transpose, derivative, face=None, additional_indices=()): +def define_theta(name, shape, transpose, derivative, face=None, additional_indices=(), width=None): qp = name_oned_quadrature_points() qw = name_oned_quadrature_weights() sort_quadrature_points_weights(qp, qw) @@ -223,7 +233,7 @@ def define_theta(name, shape, transpose, derivative, face=None, additional_indic dim_tags = "f,f" if additional_indices: dim_tags = dim_tags + ",c" - shape = shape + (4,) + shape = shape + (width,) loopy_class_member(name, dtype=numpy.float64, diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index ea0584641d81440497d98f7101c8c82a03fbd46f..460ae1e9fb77ffac0b26e7fb733b4ffa4e10adb8 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -1,6 +1,7 @@ """ Sum factorization vectorization """ -from dune.perftool.loopy.symbolic import SumfactKernel +from dune.perftool.loopy.vcl import get_vcl_type_size +from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.generation import (generator_factory, get_counted_variable, get_global_context_value, @@ -13,6 +14,7 @@ from dune.perftool.error import PerftoolError from dune.perftool.options import get_option import loopy as lp +import numpy as np @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o) @@ -40,10 +42,16 @@ def no_vectorization(sumfacts): def horizontal_vectorization_strategy(sumfacts): - if len(sumfacts) in (3, 4): + width = get_vcl_type_size(np.float64) + # We currently heuristically allow horizontal vectorization if the number + # of kernels matches the maximum vector width or is one below that. There + # is a lot of TODOs with that procedure: + # * You might have *more* kernels than vector width + # * You might want to combine horizontal and vertical strategies + if len(sumfacts) in (width - 1, width): # Map the sum factorization to their position in the joint kernel position_mapping = {} - available = set(range(4)) + available = set(range(width)) for sf in sumfacts: if sf.preferred_position is not None: # This asserts that no two kernels want to take the same position @@ -57,39 +65,22 @@ def horizontal_vectorization_strategy(sumfacts): if sumf.preferred_position is None: position_mapping[sumf] = available.pop() - # Enable vectorization strategy: - inp = get_counted_variable("joined_input") - buf = get_counted_variable("joined_buffer") - - # Collect the large matrices! - large_matrix_sequence = [] - for i in range(len(next(iter(sumfacts)).matrix_sequence)): - # Assert that the matrices of all sum factorizations have the same size - assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1 - assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1 - - # Collect the derivative information - derivative = [False] * 4 - for sf in sumfacts: - derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative - - large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows, - cols=next(iter(sumfacts)).matrix_sequence[i].cols, - transpose=next(iter(sumfacts)).matrix_sequence[i].transpose, - derivative=tuple(derivative), - face=next(iter(sumfacts)).matrix_sequence[i].face, - ) - large_matrix_sequence.append(large) + # Store the kernels as tuple according to their positions + sorting = [None] * len(position_mapping) + for sf, pos in position_mapping.items(): + sorting[pos] = sf + kernels = tuple(sorting) + + buffer = get_counted_variable("joined_buffer") + input = get_counted_variable("joined_input") for sumf in sumfacts: _cache_vectorization_info(sumf, - sumf.copy(matrix_sequence=tuple(large_matrix_sequence), - buffer=buf, - input=inp, - index=position_mapping[sumf], - padding=frozenset(available), - insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts), - ) + VectorizedSumfactKernel(kernels=kernels, + vector_width=width, + buffer=buffer, + input=input, + ) ) else: # Disable vectorization strategy diff --git a/python/setup.py b/python/setup.py index 00f6f75e8ec4070b2da6205ed89359e1a9a62fd2..cfc85288c50aa3af535d53ce2a5c22d7f898d13a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -39,7 +39,7 @@ setup(name='dune.perftool', 'dune.perftool.ufl', 'dune.perftool.ufl.transformations', ], - install_requires=['dune.testtools', 'sympy'], + install_requires=['dune.testtools', 'sympy', 'frozendict'], tests_require=['pytest'], cmdclass={'test': PyTest}, entry_points = {