Skip to content
Snippets Groups Projects
Commit 2c9cdfc6 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

First improvement of vectorization strategy

parent f84650ac
No related branches found
No related tags found
No related merge requests found
...@@ -130,12 +130,35 @@ class SumfactKernel(ImmutableRecord, prim.Variable): ...@@ -130,12 +130,35 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
mapper_method = "map_sumfact_kernel" mapper_method = "map_sumfact_kernel"
#
# Some cache key definitions
# Watch out for the documentation to see which key is used under what circumstances
#
@property
def cache_key(self):
    """The cache key that can be used in generation magic.

    Any two sum factorization kernels having the same cache_key
    are realized simultaneously!
    """
    return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
@property
def input_key(self):
    """A cache key for the input coefficients.

    Any two sum factorization kernels having the same input_key
    work on the same input coefficient (and are suitable for simultaneous
    treatment because of that).
    """
    return (self.restriction, self.stage, self.coeff_func, self.element, self.component, self.accumvar)
# #
# Some convenience methods to extract information about the sum factorization kernel # Some convenience methods to extract information about the sum factorization kernel
# #
@property @property
def length(self): def length(self):
""" The number of matrices to apply """
return len(self.matrix_sequence) return len(self.matrix_sequence)
@property @property
...@@ -146,14 +169,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable): ...@@ -146,14 +169,6 @@ class SumfactKernel(ImmutableRecord, prim.Variable):
def transposed(self): def transposed(self):
return next(iter(self.matrix_sequence)).transpose return next(iter(self.matrix_sequence)).transpose
@property
def cache_key(self):
    """The cache key that can be used in generation magic.

    Any two sum factorization kernels having the same cache_key
    are realized simultaneously!

    NOTE(review): the key is the tuple itself, not ``hash(tuple)``.
    Hashing here would allow two *different* kernels to collide on the
    same integer and be merged incorrectly, and the value would not be
    stable across interpreter runs (hash randomization). Tuple equality
    is exact; hashing happens implicitly when the key is used in a
    dict/set.
    """
    return (self.matrix_sequence, self.restriction, self.stage, self.buffer)
@property @property
def vec_index(self): def vec_index(self):
""" A tuple with the vector index """ A tuple with the vector index
......
...@@ -39,13 +39,12 @@ def no_vectorization(sumfacts): ...@@ -39,13 +39,12 @@ def no_vectorization(sumfacts):
input=get_counted_variable("input"))) input=get_counted_variable("input")))
def decide_stage_vectorization_strategy(sumfacts, stage, restriction): def horizontal_vectorization_strategy(sumfacts):
stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction]) if len(sumfacts) in (3, 4):
if len(stage_sumfacts) in (3, 4):
# Map the sum factorization to their position in the joint kernel # Map the sum factorization to their position in the joint kernel
position_mapping = {} position_mapping = {}
available = set(range(4)) available = set(range(4))
for sf in stage_sumfacts: for sf in sumfacts:
if sf.preferred_position is not None: if sf.preferred_position is not None:
# This asserts that no two kernels want to take the same position # This asserts that no two kernels want to take the same position
# Later on, more complicated stuff might be necessary here. # Later on, more complicated stuff might be necessary here.
...@@ -54,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): ...@@ -54,7 +53,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
position_mapping[sf] = sf.preferred_position position_mapping[sf] = sf.preferred_position
# Choose a position for those that have no preferred one! # Choose a position for those that have no preferred one!
for sumf in stage_sumfacts: for sumf in sumfacts:
if sumf.preferred_position is None: if sumf.preferred_position is None:
position_mapping[sumf] = available.pop() position_mapping[sumf] = available.pop()
...@@ -64,37 +63,37 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): ...@@ -64,37 +63,37 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
# Collect the large matrices! # Collect the large matrices!
large_matrix_sequence = [] large_matrix_sequence = []
for i in range(len(next(iter(stage_sumfacts)).matrix_sequence)): for i in range(len(next(iter(sumfacts)).matrix_sequence)):
# Assert that the matrices of all sum factorizations have the same size # Assert that the matrices of all sum factorizations have the same size
assert len(set(tuple(sf.matrix_sequence[i].rows for sf in stage_sumfacts))) == 1 assert len(set(tuple(sf.matrix_sequence[i].rows for sf in sumfacts))) == 1
assert len(set(tuple(sf.matrix_sequence[i].cols for sf in stage_sumfacts))) == 1 assert len(set(tuple(sf.matrix_sequence[i].cols for sf in sumfacts))) == 1
# Collect the derivative information # Collect the derivative information
derivative = [False] * 4 derivative = [False] * 4
for sf in stage_sumfacts: for sf in sumfacts:
derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative derivative[position_mapping[sf]] = sf.matrix_sequence[i].derivative
large = BasisTabulationMatrixArray(rows=next(iter(stage_sumfacts)).matrix_sequence[i].rows, large = BasisTabulationMatrixArray(rows=next(iter(sumfacts)).matrix_sequence[i].rows,
cols=next(iter(stage_sumfacts)).matrix_sequence[i].cols, cols=next(iter(sumfacts)).matrix_sequence[i].cols,
transpose=next(iter(stage_sumfacts)).matrix_sequence[i].transpose, transpose=next(iter(sumfacts)).matrix_sequence[i].transpose,
derivative=tuple(derivative), derivative=tuple(derivative),
face=next(iter(stage_sumfacts)).matrix_sequence[i].face, face=next(iter(sumfacts)).matrix_sequence[i].face,
) )
large_matrix_sequence.append(large) large_matrix_sequence.append(large)
for sumf in stage_sumfacts: for sumf in sumfacts:
_cache_vectorization_info(sumf, _cache_vectorization_info(sumf,
sumf.copy(matrix_sequence=tuple(large_matrix_sequence), sumf.copy(matrix_sequence=tuple(large_matrix_sequence),
buffer=buf, buffer=buf,
input=inp, input=inp,
index=position_mapping[sumf], index=position_mapping[sumf],
padding=frozenset(available), padding=frozenset(available),
insn_dep=frozenset().union(sf.insn_dep for sf in stage_sumfacts), insn_dep=frozenset().union(sf.insn_dep for sf in sumfacts),
) )
) )
else: else:
# Disable vectorization strategy # Disable vectorization strategy
no_vectorization(stage_sumfacts) no_vectorization(sumfacts)
def decide_vectorization_strategy(): def decide_vectorization_strategy():
...@@ -108,12 +107,9 @@ def decide_vectorization_strategy(): ...@@ -108,12 +107,9 @@ def decide_vectorization_strategy():
if not get_option("vectorize_grads"): if not get_option("vectorize_grads"):
no_vectorization(sumfacts) no_vectorization(sumfacts)
else: else:
res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE) # Currently we base our idea here on the fact that we only group sum
# Stage 1 kernels # factorization kernels with the same input.
for restriction in res: inputkeys = set(sf.input_key for sf in sumfacts)
decide_stage_vectorization_strategy(sumfacts, 1, restriction) for inputkey in inputkeys:
sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
# Stage 3 kernels horizontal_vectorization_strategy(sumfact_filter)
import itertools as it
for restriction in it.product(res, res):
decide_stage_vectorization_strategy(sumfacts, 3, restriction)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment