Skip to content
Snippets Groups Projects
Commit 0e8b4ea5 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Implement a greedy vectorization strategy

It always aims for maximally horizontal vectorization
where possible.
parent 475df2c0
No related branches found
No related tags found
No related merge requests found
...@@ -38,7 +38,7 @@ extension = vtu ...@@ -38,7 +38,7 @@ extension = vtu
fastdg = 1 fastdg = 1
sumfact = 1 sumfact = 1
vectorize_quad = 1 vectorize_quad = 1
vectorize_grads = 1 vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
...@@ -48,4 +48,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval ...@@ -48,4 +48,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
cell = hexahedron cell = hexahedron
degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree = 1, 3, 5, 7, 9 | expand
...@@ -39,7 +39,7 @@ extension = vtu ...@@ -39,7 +39,7 @@ extension = vtu
fastdg = 1 fastdg = 1
sumfact = 1 sumfact = 1
vectorize_quad = 1 vectorize_quad = 1
vectorize_grads = 1 vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
...@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval ...@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
cell = hexahedron cell = hexahedron
v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree v_degree = 1, 3, 5, 7, 9 | expand degree
p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9 | expand degree p_degree = 0, 2, 4, 6, 8 | expand degree
...@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord): ...@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord):
vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
......
...@@ -51,6 +51,13 @@ def no_vectorization(sumfacts): ...@@ -51,6 +51,13 @@ def no_vectorization(sumfacts):
def vertical_vectorization_strategy(sumfact, depth): def vertical_vectorization_strategy(sumfact, depth):
# If depth is 1, there is nothing do
if depth == 1:
if isinstance(sumfact, SumfactKernel):
return {sumfact: sumfact}
else:
return {k: sumfact for k in sumfact.kernels}
# Assert that this is not already sliced # Assert that this is not already sliced
assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
result = {} result = {}
...@@ -139,6 +146,33 @@ def diagonal_vectorization_strategy(sumfacts, width): ...@@ -139,6 +146,33 @@ def diagonal_vectorization_strategy(sumfacts, width):
return result return result
def greedy_vectorization_strategy(sumfacts, width):
sumfacts = set(sumfacts)
horizontal = width
vertical = 1
result = {}
while horizontal > 0:
if horizontal > 1:
horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0)
else:
horizontal_kernels = {sf: sf for sf in sumfacts}
for sf in horizontal_kernels:
if horizontal_kernels[sf].horizontal_width == horizontal:
vert = vertical_vectorization_strategy(horizontal_kernels[sf],
vertical)
for k in vert:
result[k] = vert[k]
sumfacts.discard(sf)
horizontal = horizontal / 2
vertical = vertical * 2
return result
def decide_vectorization_strategy(): def decide_vectorization_strategy():
""" Decide how to vectorize! """ Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this, Note that the vectorization of the quadrature loop is independent of this,
...@@ -188,6 +222,13 @@ def decide_vectorization_strategy(): ...@@ -188,6 +222,13 @@ def decide_vectorization_strategy():
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new sfdict[old] = new
elif get_option("vectorize_greedy"):
inputkeys = set(sf.input_key for sf in active_sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
else: else:
for old, new in no_vectorization(active_sumfacts).items(): for old, new in no_vectorization(active_sumfacts).items():
sfdict[old] = new sfdict[old] = new
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment