From ea799b0932e3db9dfcb4a0648fc09da8a7b38e85 Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Tue, 4 Apr 2017 14:18:20 +0200 Subject: [PATCH] Refactor treatment of vectorized sumfact kernels --- python/dune/perftool/loopy/symbolic.py | 11 +- python/dune/perftool/sumfact/accumulation.py | 23 +- python/dune/perftool/sumfact/basis.py | 13 +- python/dune/perftool/sumfact/realization.py | 38 +-- python/dune/perftool/sumfact/symbolic.py | 265 +++++++++++++++--- python/dune/perftool/sumfact/vectorization.py | 45 +-- python/setup.py | 2 +- 7 files changed, 294 insertions(+), 103 deletions(-) diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py index a0a7de01..a592c461 100644 --- a/python/dune/perftool/loopy/symbolic.py +++ b/python/dune/perftool/loopy/symbolic.py @@ -3,7 +3,7 @@ Use this module to insert pymbolic nodes and the likes. """ from dune.perftool.error import PerftoolError -from dune.perftool.sumfact.symbolic import SumfactKernel +from dune.perftool.sumfact.symbolic import SumfactKernel, VectorizedSumfactKernel from pymbolic.mapper.substitutor import make_subst_func import loopy as lp @@ -111,6 +111,15 @@ lp.symbolic.DependencyMapper.map_sumfact_kernel = dependency_map_sumfact_kernel lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_sumfact_kernel = needs_resolution lp.type_inference.TypeInferenceMapper.map_sumfact_kernel = needs_resolution +# VectorizedSumfactKernel node +lp.symbolic.IdentityMapper.map_vectorized_sumfact_kernel = identity_map_sumfact_kernel +lp.symbolic.SubstitutionMapper.map_vectorized_sumfact_kernel = lp.symbolic.SubstitutionMapper.map_variable +lp.symbolic.WalkMapper.map_vectorized_sumfact_kernel = walk_map_sumfact_kernel +lp.symbolic.StringifyMapper.map_vectorized_sumfact_kernel = stringify_map_sumfact_kernel +lp.symbolic.DependencyMapper.map_vectorized_sumfact_kernel = dependency_map_sumfact_kernel +lp.target.c.codegen.expression.ExpressionToCExpressionMapper.map_vectorized_sumfact_kernel = needs_resolution +lp.type_inference.TypeInferenceMapper.map_vectorized_sumfact_kernel = needs_resolution + # FusedMultiplyAdd node lp.symbolic.IdentityMapper.map_fused_multiply_add = identity_map_fused_multiply_add lp.symbolic.WalkMapper.map_fused_multiply_add = walk_map_fused_multiply_add diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py index 56ad22cb..d77f8145 100644 --- a/python/dune/perftool/sumfact/accumulation.py +++ b/python/dune/perftool/sumfact/accumulation.py @@ -103,24 +103,24 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ) from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) # Make sure we have a buffer that we can set up the input with - buffer = sf.buffer + buffer = vsf.buffer if buffer is None: buffer = get_counted_variable("buffer") - vectag = frozenset({"gradvec"}) if sf.index is not None else frozenset() + vectag = frozenset({"gradvec"}) if vsf.vectorized else frozenset() temp = get_buffer_temporary(buffer, - shape=sf.quadrature_shape, - dim_tags=sf.quadrature_dimtags, - name=sf.input, + shape=vsf.quadrature_shape, + dim_tags=vsf.quadrature_dimtags, + name=vsf.input, ) # Those input fields, that are padded need to be set to zero # in order to do a horizontal_add later on - for pad in sf.padding: + for pad in vsf.padding: assignee = prim.Subscript(prim.Variable(temp), tuple(prim.Variable(i) for i in quadrature_inames()) + (pad,)) instruction(assignee=assignee, @@ -156,7 +156,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Issue an instruction in the quadrature loop that fills the buffer # with the evaluation of the contribution at all quadrature points assignee = prim.Subscript(prim.Variable(temp), - tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index) + tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf)) contrib_dep = instruction(assignee=assignee, expression=expression, forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), @@ -174,7 +174,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): within_inames=frozenset(visitor.inames))}) inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) - for i, mat in enumerate(sf.matrix_sequence)) + for i, mat in enumerate(vsf.matrix_sequence)) # Collect the lfs and lfs indices for the accumulate call test_lfs.index = flatten_index(tuple(prim.Variable(i) for i in inames), @@ -196,12 +196,13 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Add a sum factorization kernel that implements the multiplication # with the test function (stage 3) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - result, insn_dep = realize_sum_factorization_kernel(sf.copy(insn_dep=insn_dep)) + result, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=insn_dep)) # Determine the expression to accumulate with. This depends on the vectorization strategy! result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) vecinames = () - if sf.index is not None: + # TODO: evaluate whether the following line would be okay with vsf.vectorized + if vsf.vec_index(sf) is not None: iname = accum_iname((accterm.argument.restriction, restriction), 4, "vec") vecinames = (iname,) transform(lp.tag_inames, [(iname, "vec")]) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index b4861eeb..3630d9f2 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -85,16 +85,16 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v ) from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) - if i != sf.index: + if i != vsf.vec_index(sf): direct_indexing_is_possible = False # Add a sum factorization kernel that implements the # evaluation of the gradients of basis functions at quadrature # points (stage 1) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - var, insn_dep = realize_sum_factorization_kernel(sf) + var, insn_dep = realize_sum_factorization_kernel(vsf) buffers.append(var) @@ -132,17 +132,16 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor): component=component, ) - # TODO: Move this away! from dune.perftool.sumfact.vectorization import attach_vectorization_info - sf = attach_vectorization_info(sf) + vsf = attach_vectorization_info(sf) # Add a sum factorization kernel that implements the evaluation of # the basis functions at quadrature points (stage 1) from dune.perftool.sumfact.realization import realize_sum_factorization_kernel - var, _ = realize_sum_factorization_kernel(sf) + var, _ = realize_sum_factorization_kernel(vsf) return prim.Subscript(var, - tuple(prim.Variable(i) for i in quadrature_inames()) + sf.vec_index + tuple(prim.Variable(i) for i in quadrature_inames()) + vsf.vec_index_tuple(sf) ), visitor.indices diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index dcc06af6..e2e375a2 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -37,11 +37,13 @@ def realize_sum_factorization_kernel(sf, **kwargs): if get_global_context_value("dry_run", False): return sf, sf.insn_dep else: - _realize_input(sf) return _realize_sum_factorization_kernel(sf, **kwargs) -def _realize_input(sf): +@generator_factory(item_tags=("sumfactkernel",), + context_tags=("kernel",), + cache_key_generator=lambda s, **kw: s.cache_key) +def _realize_sum_factorization_kernel(sf): # Set up the input for stage 1 if sf.stage == 1 and not get_option("fastdg"): assert sf.coeff_func @@ -53,22 +55,24 @@ def _realize_input(sf): name=sf.input ) - # Write initial coefficients into buffer - lfs = name_lfs(sf.element, sf.restriction, sf.component) - basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") - container = sf.coeff_func(sf.restriction) - coeff = pymbolic_coefficient(container, lfs, basisiname) - assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index) - instruction(assignee=assignee, - expression=coeff, - depends_on=sf.insn_dep, - ) - + def _write_input(inputsf): + # Write initial coefficients into buffer + lfs = name_lfs(inputsf.element, inputsf.restriction, inputsf.component) + basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") + container = inputsf.coeff_func(inputsf.restriction) + coeff = pymbolic_coefficient(container, lfs, basisiname) + assignee = prim.Subscript(prim.Variable(input_setup), (prim.Variable(basisiname),) + sf.vec_index_tuple(inputsf)) + instruction(assignee=assignee, + expression=coeff, + depends_on=inputsf.insn_dep, + ) + + if sf.vectorized: + for inputsf in sf.kernels: + _write_input(inputsf) + else: + _write_input(sf) -@generator_factory(item_tags=("sumfactkernel",), - context_tags=("kernel",), - cache_key_generator=lambda s, **kw: s.cache_key) -def _realize_sum_factorization_kernel(sf): # Add a dependency on the input variable insn_dep = sf.insn_dep if sf.input: diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index fb03a88d..e0f862b0 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -1,13 +1,22 @@ """ A pymbolic node representing a sum factorization kernel """ -from pytools import ImmutableRecord +from dune.perftool.options import get_option +from dune.perftool.generation import get_counted_variable +from dune.perftool.sumfact.tabulation import BasisTabulationMatrixArray + +from pytools import ImmutableRecord, product import pymbolic.primitives as prim import loopy as lp +import frozendict import inspect -class SumfactKernel(ImmutableRecord, prim.Variable): +class SumfactKernelBase(object): + pass + + +class SumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): def __init__(self, matrix_sequence=None, buffer=None, @@ -16,8 +25,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): restriction=0, within_inames=(), input=None, - padding=frozenset(), - index=None, insn_dep=frozenset(), coeff_func=None, element=None, @@ -92,7 +99,7 @@ class SumfactKernel(ImmutableRecord, prim.Variable): component: The treepath to the correct component of above element accumvar: The accumulation variable to accumulate into """ - # Check the input and apply defaults where necessary + # Assert the inputs! assert isinstance(matrix_sequence, tuple) from dune.perftool.sumfact.tabulation import BasisTabulationMatrixBase assert all(isinstance(m, BasisTabulationMatrixBase) for m in matrix_sequence) @@ -163,31 +170,25 @@ class SumfactKernel(ImmutableRecord, prim.Variable): @property def vectorized(self): - return next(iter(self.matrix_sequence)).vectorized + return False @property def transposed(self): return next(iter(self.matrix_sequence)).transpose - @property - def vec_index(self): - """ A tuple with the vector index + def vec_index(self, sf): + """ Map an unvectorized sumfact kernel object to its position + in the vectorized kernel + """ + return None - Defaults to the empty tuple to allow addition to any - existing index tuple """ - if self.index is not None: - return (self.index,) - else: - return () + def vec_index_tuple(self, sf): + return () @property def flat_input_shape(self): """ The 'flat' input tensor shape """ - from pytools import product - shape = (product(mat.cols for mat in self.matrix_sequence),) - if self.vectorized: - shape = shape + (4,) - return shape + return (product(mat.cols for mat in self.matrix_sequence),) @property def quadrature_shape(self): @@ -196,12 +197,9 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account the lower dimensionality of faces and vectorization. """ if self.transposed: - shape = tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) else: - shape = tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) - if self.vectorized: - shape = shape + (4,) - return shape + return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) @property def quadrature_dimtags(self): @@ -210,8 +208,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account the lower dimensionality of faces and vectorization. """ tags = ["f"] * len(self.quadrature_shape) - if self.vectorized: - tags[-1] = 'c' return ",".join(tags) @property @@ -221,8 +217,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account vectorization. """ shape = tuple(mat.rows for mat in self.matrix_sequence) - if self.vectorized: - shape = shape + (4,) return shape @property @@ -232,8 +226,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): Takes into account vectorization. """ tags = ["f"] * len(self.dof_shape) - if self.vectorized: - tags[-1] = 'vec' return ",".join(tags) @property @@ -251,13 +243,216 @@ class SumfactKernel(ImmutableRecord, prim.Variable): return self.dof_dimtags def output_to_pymbolic(self, name): - if self.vectorized: - return lp.TaggedVariable(name, "vector") - else: - return lp.TaggedVariable(name, "sumfac") + return lp.TaggedVariable(name, "sumfac") + + # + # Define properties for conformity with the interface of VectorizedSumfactKernel + # + + @property + def padding(self): + return set() # Extract the argument list and store it on the class. This needs to be done # outside of the class because the SumfactKernel class object needs to be fully # initialized in order to extract the information from __init__. SumfactKernel.init_arg_names = tuple(inspect.getargspec(SumfactKernel.__init__)[0][1:]) + + +class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable): + def __init__(self, + kernels=None, + vector_width=get_option('max_vector_width'), + buffer=None, + input=None, + insn_dep=frozenset(), + ): + # Assert the input data structure + assert isinstance(kernels, tuple) + assert all(isinstance(k, SumfactKernel) for k in kernels) + + # Assert all the properties that need to be the same across all subkernels + assert len(set(k.stage for k in kernels)) == 1 + assert len(set(k.length for k in kernels)) == 1 + assert len(set(k.restriction for k in kernels)) == 1 + assert len(set(k.within_inames for k in kernels)) == 1 + + # Assert properties of the matrix sequence of the underlying kernels + for i in range(kernels[0].length): + assert len(set(tuple(k.matrix_sequence[i].rows for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].cols for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].face for k in kernels))) == 1 + assert len(set(tuple(k.matrix_sequence[i].transpose for k in kernels))) == 1 + + # Join the instruction dependencies of all subkernels + insn_dep = insn_dep.union(k.insn_dep for k in kernels) + + # We currently assume that all subkernels are consecutive, 0-based within the vector + assert None not in kernels + + ImmutableRecord.__init__(self, + kernels=kernels, + vector_width=vector_width, + buffer=buffer, + input=input, + insn_dep=insn_dep, + ) + + prim.Variable.__init__(self, "VecSUMFAC") + + def __getinitargs__(self): + return (self.kernels, self.vector_width, self.buffer, self.input, self.insn_dep) + + def stringifier(self): + return lp.symbolic.StringifyMapper + + mapper_method = "map_vectorized_sumfact_kernel" + + init_arg_names = ("kernels", "vector_width", "buffer", "input", "insn_dep") + + # + # Some cache key definitions + # Watch out for the documentation to see which key is used unter what circumstances + # + + @property + def cache_key(self): + """ The cache key that can be used in generation magic + Any two sum factorization kernels having the same cache_key + are realized simulatenously! + """ + return (self.matrix_sequence, self.restriction, self.stage, self.buffer) + + @property + def input_key(self): + """ A cache key for the input coefficients + Any two sum factorization kernels having the same input_key + work on the same input coefficient (and are suitable for simultaneous + treatment because of that) + """ + return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar) + + # + # Deduce all data fields of normal sum factorization kernels from the underlying kernels + # + + @property + def matrix_sequence(self): + return tuple(BasisTabulationMatrixArray(rows=self.kernels[0].matrix_sequence[i].rows, + cols=self.kernels[0].matrix_sequence[i].cols, + transpose=self.kernels[0].matrix_sequence[i].transpose, + face=self.kernels[0].matrix_sequence[i].face, + derivative=tuple(k.matrix_sequence[i].derivative for k in self.kernels), + ) + for i in range(self.length)) + + @property + def stage(self): + return self.kernels[0].stage + + @property + def restriction(self): + return self.kernels[0].restriction + + @property + def within_inames(self): + return self.kernels[0].within_inames + + @property + def coeff_func(self): + return tuple(k.coeff_func for k in self.kernels) + + @property + def element(self): + return tuple(k.element for k in self.kernels) + + @property + def component(self): + return tuple(k.component for k in self.kernels) + + @property + def accumvar(self): + return tuple(k.accumvar for k in self.kernels) + + # + # Define some properties only needed for this one + # + + @property + def padding(self): + return set(range(self.vector_width)) - set(range(len(self.kernels))) + + # + # Define the same properties the normal SumfactKernel defines + # + + @property + def cache_key(self): + return tuple(k.cache_key for k in self.kernels) + + @property + def input_key(self): + return tuple(k.input_key for k in self.kernels) + + @property + def length(self): + return self.kernels[0].length + + @property + def vectorized(self): + return True + + @property + def transposed(self): + return self.kernels[0].transposed + + def vec_index(self, sf): + return self.kernels.index(sf) + + def vec_index_tuple(self, sf): + return (self.vec_index(sf),) + + @property + def flat_input_shape(self): + return (product(mat.cols for mat in self.matrix_sequence), 4) + + @property + def quadrature_shape(self): + if self.transposed: + return tuple(mat.cols for mat in self.matrix_sequence if mat.face is None) + (4,) + else: + return tuple(mat.rows for mat in self.matrix_sequence if mat.face is None) + (4,) + + @property + def quadrature_dimtags(self): + tags = ["f"] * len(self.quadrature_shape) + tags[-1] = 'c' + return ",".join(tags) + + @property + def dof_shape(self): + return tuple(mat.rows for mat in self.matrix_sequence) + (4,) + + @property + def dof_dimtags(self): + tags = ["f"] * len(self.dof_shape) + tags[-1] = 'vec' + return ",".join(tags) + + @property + def output_shape(self): + if self.stage == 1: + return self.quadrature_shape + else: + return self.dof_shape + + @property + def output_dimtags(self): + if self.stage == 1: + return self.quadrature_dimtags + else: + return self.dof_dimtags + + def output_to_pymbolic(self, name): + return lp.TaggedVariable(name, "vector") diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index ea058464..84c7e615 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -1,6 +1,6 @@ """ Sum factorization vectorization """ -from dune.perftool.loopy.symbolic import SumfactKernel +from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.generation import (generator_factory, get_counted_variable, get_global_context_value, @@ -57,39 +57,22 @@ def horizontal_vectorization_strategy(sumfacts): if sumf.preferred_position is None: position_mapping[sumf] = available.pop() - # Enable vectorization strategy: - inp = get_counted_variable("joined_input") - buf = get_counted_variable("joined_buffer") - - # Collect the large matrices! - large_matrix_sequence = [] - for i in range(len(next(iter(sumfacts)).matrix_sequence)): - # Assert that the matrices of all sum factorizations have the same size - assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1 - assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1 - - # Collect the derivative information - derivative = [False] * 4 - for sf in sumfacts: - derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative - - large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows, - cols=next(iter(sumfacts)).matrix_sequence[i].cols, - transpose=next(iter(sumfacts)).matrix_sequence[i].transpose, - derivative=tuple(derivative), - face=next(iter(sumfacts)).matrix_sequence[i].face, - ) - large_matrix_sequence.append(large) + # Store the kernels as tuple according to their positions + sorting = [None] * len(position_mapping) + for sf, pos in position_mapping.items(): + sorting[pos] = sf + kernels = tuple(sorting) + + buffer = get_counted_variable("joined_buffer") + input = get_counted_variable("joined_input") for sumf in sumfacts: _cache_vectorization_info(sumf, - sumf.copy(matrix_sequence=tuple(large_matrix_sequence), - buffer=buf, - input=inp, - index=position_mapping[sumf], - padding=frozenset(available), - insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts), - ) + VectorizedSumfactKernel(kernels=kernels, + vector_width=4, + buffer=buffer, + input=input, + ) ) else: # Disable vectorization strategy diff --git a/python/setup.py b/python/setup.py index 00f6f75e..cfc85288 100644 --- a/python/setup.py +++ b/python/setup.py @@ -39,7 +39,7 @@ setup(name='dune.perftool', 'dune.perftool.ufl', 'dune.perftool.ufl.transformations', ], - install_requires=['dune.testtools', 'sympy'], + install_requires=['dune.testtools', 'sympy', 'frozendict'], tests_require=['pytest'], cmdclass={'test': PyTest}, entry_points = { -- GitLab