Skip to content
Snippets Groups Projects
Commit 62a75918 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/paper-vectorization' into 'master'

Heuristic vectorization strategy

See merge request !190
parents 475df2c0 a75470f1
No related branches found
No related tags found
No related merge requests found
...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount ...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters # Input parameters
dim = 3 dim = 3
mbperrank = 100 mbperrank = 100
ranks = 16 ranks = 32
floatingbytes = 8 floatingbytes = 8
# Metaini Calculations # Metaini Calculations
...@@ -38,7 +38,7 @@ extension = vtu ...@@ -38,7 +38,7 @@ extension = vtu
fastdg = 1 fastdg = 1
sumfact = 1 sumfact = 1
vectorize_quad = 1 vectorize_quad = 1
vectorize_grads = 1 vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
...@@ -46,4 +46,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval ...@@ -46,4 +46,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
cell = hexahedron cell = hexahedron
degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree = 3, 7 | expand
...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount ...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters # Input parameters
dim = 3 dim = 3
mbperrank = 100 mbperrank = 100
ranks = 16 ranks = 32
floatingbytes = 8 floatingbytes = 8
# Metaini Calculations # Metaini Calculations
...@@ -38,12 +38,10 @@ extension = vtu ...@@ -38,12 +38,10 @@ extension = vtu
fastdg = 1 fastdg = 1
sumfact = 1 sumfact = 1
vectorize_quad = 1 vectorize_quad = 1
vectorize_grads = 1 vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
exact_solution_expression = g
compare_l2errorsquared = 1e-6
quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
......
...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount ...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters # Input parameters
dim = 3 dim = 3
mbperrank = 100 mbperrank = 100
ranks = 16 ranks = 32
floatingbytes = 8 floatingbytes = 8
# Metaini Calculations # Metaini Calculations
...@@ -42,8 +42,6 @@ vectorize_slice = 1 ...@@ -42,8 +42,6 @@ vectorize_slice = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
exact_solution_expression = g
compare_l2errorsquared = 1e-6
quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
......
...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount ...@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters # Input parameters
dim = 3 dim = 3
mbperrank = 100 mbperrank = 100
ranks = 16 ranks = 32
floatingbytes = 8 floatingbytes = 8
# Metaini Calculations # Metaini Calculations
...@@ -39,7 +39,7 @@ extension = vtu ...@@ -39,7 +39,7 @@ extension = vtu
fastdg = 1 fastdg = 1
sumfact = 1 sumfact = 1
vectorize_quad = 1 vectorize_quad = 1
vectorize_grads = 1 vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount time_opcounter = 0, 1 | expand opcount
...@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval ...@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
[formcompiler.ufl_variants] [formcompiler.ufl_variants]
cell = hexahedron cell = hexahedron
v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree v_degree = 3, 7 | expand degree
p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9 | expand degree p_degree = 2, 6 | expand degree
cell = hexahedron cell = hexahedron
x = SpatialCoordinate(cell) x = SpatialCoordinate(cell)
g_v = as_vector((16.*x[1]*(1.-x[1])*x[2]*(1.-x[2]), 0.0, 0.0)) g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0))
bctype = conditional(x[0] < 1. - 1e-8, 1, 0) bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
V = VectorElement("DG", cell, v_degree) V = VectorElement("DG", cell, v_degree)
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
# Load modules # Load modules
ml gcc/6.2 ml gcc/6.2
ml tbb
ml intelmpi ml intelmpi
ml openblas ml openblas
ml metis ml metis
...@@ -15,7 +16,7 @@ ml suitesparse ...@@ -15,7 +16,7 @@ ml suitesparse
#SBATCH -J poisson_dg #SBATCH -J poisson_dg
# Number of processes # Number of processes
#SBATCH -n 16 #SBATCH -n 32
# Choose the SLURM partition (sinfo for overview) # Choose the SLURM partition (sinfo for overview)
#SBATCH -p haswell16c #SBATCH -p haswell16c
...@@ -28,7 +29,7 @@ ml suitesparse ...@@ -28,7 +29,7 @@ ml suitesparse
SRUNOPT="--cpu_bind=verbose,core" SRUNOPT="--cpu_bind=verbose,core"
# Search for runnable executables # Search for runnable executables
FILES=$(ls *.ini) FILES=$(ls *.ini | grep -v '^verify')
for inifile in $FILES for inifile in $FILES
do do
line=$(grep ^"opcounter = " $inifile) line=$(grep ^"opcounter = " $inifile)
......
...@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord): ...@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord):
vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
......
...@@ -282,7 +282,7 @@ def _realize_sum_factorization_kernel(sf): ...@@ -282,7 +282,7 @@ def _realize_sum_factorization_kernel(sf):
# Measure times and count operations in c++ code # Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4: if get_option("instrumentation_level") >= 4:
stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
depends_on=frozenset({tag}), depends_on=frozenset({lp.match.Tagged(tag)}),
within_inames=frozenset(sf.within_inames))}) within_inames=frozenset(sf.within_inames))})
if sf.stage == 1: if sf.stage == 1:
qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
......
...@@ -51,6 +51,13 @@ def no_vectorization(sumfacts): ...@@ -51,6 +51,13 @@ def no_vectorization(sumfacts):
def vertical_vectorization_strategy(sumfact, depth): def vertical_vectorization_strategy(sumfact, depth):
# If depth is 1, there is nothing to do
if depth == 1:
if isinstance(sumfact, SumfactKernel):
return {sumfact: sumfact}
else:
return {k: sumfact for k in sumfact.kernels}
# Assert that this is not already sliced # Assert that this is not already sliced
assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
result = {} result = {}
...@@ -139,6 +146,36 @@ def diagonal_vectorization_strategy(sumfacts, width): ...@@ -139,6 +146,36 @@ def diagonal_vectorization_strategy(sumfacts, width):
return result return result
def greedy_vectorization_strategy(sumfacts, width):
    """ Greedily combine horizontal and vertical vectorization.

    The search starts with a horizontal width of ``width`` and a vertical
    depth of 1. After every round the horizontal width is halved and the
    vertical depth is doubled, until the horizontal width reaches zero.

    In each round the remaining kernels are handed to
    ``horizontal_vectorization_strategy`` (skipped once the horizontal width
    is 1, where every kernel simply maps to itself). Every resulting kernel
    whose ``horizontal_width`` equals the current horizontal width is passed
    on to ``vertical_vectorization_strategy``; its result is merged into the
    returned dictionary and the originating kernel is removed from the pool.

    :arg sumfacts: iterable of hashable kernel objects
        (presumably SumfactKernel instances - confirm against callers)
    :arg width: horizontal width to start with
        (the visible caller passes ``get_vcl_type_size(np.float64)``)
    :returns: dictionary accumulated from the mappings returned by
        ``vertical_vectorization_strategy``
    """
    remaining = set(sumfacts)
    mapping = {}
    hwidth, depth = width, 1
    # We heuristically allow padding only on the full SIMD width, so this
    # is reset to 0 at the end of the first round.
    padding = 1

    while hwidth > 0:
        if hwidth <= 1:
            # Nothing to combine horizontally: identity mapping
            candidates = {sf: sf for sf in remaining}
        else:
            candidates = horizontal_vectorization_strategy(remaining, hwidth, allow_padding=padding)

        for sf, kernel in candidates.items():
            if kernel.horizontal_width != hwidth:
                continue
            mapping.update(vertical_vectorization_strategy(kernel, depth))
            remaining.discard(sf)

        hwidth //= 2
        depth *= 2
        padding = 0

    return mapping
def decide_vectorization_strategy(): def decide_vectorization_strategy():
""" Decide how to vectorize! """ Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this, Note that the vectorization of the quadrature loop is independent of this,
...@@ -188,6 +225,13 @@ def decide_vectorization_strategy(): ...@@ -188,6 +225,13 @@ def decide_vectorization_strategy():
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new sfdict[old] = new
elif get_option("vectorize_greedy"):
inputkeys = set(sf.input_key for sf in active_sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
else: else:
for old, new in no_vectorization(active_sumfacts).items(): for old, new in no_vectorization(active_sumfacts).items():
sfdict[old] = new sfdict[old] = new
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment