From 2c9cdfc6fc19bfb688030af871f806425550b68e Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Mon, 3 Apr 2017 14:31:14 +0200 Subject: [PATCH] First improvement of vectorization strategy --- python/dune/perftool/sumfact/symbolic.py | 31 +++++++++---- python/dune/perftool/sumfact/vectorization.py | 46 +++++++++---------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py index e0f1c658..f44cecc3 100644 --- a/python/dune/perftool/sumfact/symbolic.py +++ b/python/dune/perftool/sumfact/symbolic.py @@ -130,12 +130,35 @@ class SumfactKernel(ImmutableRecord, prim.Variable): mapper_method = "map_sumfact_kernel" + # + # Some cache key definitions + # Watch out for the documentation to see which key is used under what circumstances + # + + @property + def cache_key(self): + """ The cache key that can be used in generation magic + Any two sum factorization kernels having the same cache_key + are realized simultaneously! 
+ """ + return (self.matrix_sequence, self.restriction, self.stage, self.buffer) + + @property + def input_key(self): + """ A cache key for the input coefficients + Any two sum factorization kernels having the same input_key + work on the same input coefficient (and are suitable for simultaneous + treatment because of that) + """ + return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar) + # # Some convenience methods to extract information about the sum factorization kernel # @property def length(self): + """ The number of matrices to apply """ return len(self.matrix_sequence) @property @@ -146,14 +169,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): def transposed(self): return next(iter(self.matrix_sequence)).transpose - @property - def cache_key(self): - """ The cache key that can be used in generation magic, - Any two sum factorization kernels having the same cache_key - are realized simulatneously! - """ - return hash((self.matrix_sequence, self.restriction, self.stage, self.buffer)) - @property def vec_index(self): """ A tuple with the vector index diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 8ca28d17..ea058464 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -39,13 +39,12 @@ def no_vectorization(sumfacts): input=get_counted_variable("input"))) -def decide_stage_vectorization_strategy(sumfacts, stage, restriction): - stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction]) - if len(stage_sumfacts) in (3, 4): +def horizontal_vectorization_strategy(sumfacts): + if len(sumfacts) in (3, 4): # Map the sum factorization to their position in the joint kernel position_mapping = {} available = set(range(4)) - for sf in stage_sumfacts: + for sf in sumfacts: if sf.preferred_position is not None: # This asserts that no two kernels want to 
take the same position # Later on, more complicated stuff might be necessary here. @@ -54,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): position_mapping[sf] = sf.preferred_position # Choose a position for those that have no preferred one! - for sumf in stage_sumfacts: + for sumf in sumfacts: if sumf.preferred_position is None: position_mapping[sumf] = available.pop() @@ -64,37 +63,37 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): # Collect the large matrices! large_matrix_sequence = [] - for i in range(len(next(iter(stage_sumfacts)).matrix_sequence)): + for i in range(len(next(iter(sumfacts)).matrix_sequence)): # Assert that the matrices of all sum factorizations have the same size - assert len(set(tuple(sf.matrix_sequence[i].rows for sf in stage_sumfacts))) == 1 - assert len(set(tuple(sf.matrix_sequence[i].cols for sf in stage_sumfacts))) == 1 + assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1 + assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1 # Collect the derivative information derivative = [False] * 4 - for sf in stage_sumfacts: + for sf in sumfacts: derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative - large = BasisTabulationMatrixArray(rows=next(iter(stage_sumfacts)).matrix_sequence[i].rows, - cols=next(iter(stage_sumfacts)).matrix_sequence[i].cols, - transpose=next(iter(stage_sumfacts)).matrix_sequence[i].transpose, + large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows, + cols=next(iter(sumfacts)).matrix_sequence[i].cols, + transpose=next(iter(sumfacts)).matrix_sequence[i].transpose, derivative=tuple(derivative), - face=next(iter(stage_sumfacts)).matrix_sequence[i].face, + face=next(iter(sumfacts)).matrix_sequence[i].face, ) large_matrix_sequence.append(large) - for sumf in stage_sumfacts: + for sumf in sumfacts: _cache_vectorization_info(sumf, 
sumf.copy(matrix_sequence=tuple(large_matrix_sequence), buffer=buf, input=inp, index=position_mapping[sumf], padding=frozenset(available), - insn_dep=frozenset().union(sf.insn_dep for sf in stage_sumfacts), + insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts), ) ) else: # Disable vectorization strategy - no_vectorization(stage_sumfacts) + no_vectorization(sumfacts) def decide_vectorization_strategy(): @@ -108,12 +107,9 @@ def decide_vectorization_strategy(): if not get_option("vectorize_grads"): no_vectorization(sumfacts) else: - res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE) - # Stage 1 kernels - for restriction in res: - decide_stage_vectorization_strategy(sumfacts, 1, restriction) - - # Stage 3 kernels - import itertools as it - for restriction in it.product(res, res): - decide_stage_vectorization_strategy(sumfacts, 3, restriction) + # Currently we base our idea here on the fact that we only group sum + # factorization kernels with the same input. + inputkeys = set(sf.input_key for sf in sumfacts) + for inputkey in inputkeys: + sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey] + horizontal_vectorization_strategy(sumfact_filter) -- GitLab