Skip to content
Snippets Groups Projects
Commit 62a75918 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/paper-vectorization' into 'master'

Heuristic vectorization strategy

See merge request !190
parents 475df2c0 a75470f1
No related branches found
No related tags found
No related merge requests found
......@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters
dim = 3
mbperrank = 100
ranks = 16
ranks = 32
floatingbytes = 8
# Metaini Calculations
......@@ -38,7 +38,7 @@ extension = vtu
fastdg = 1
sumfact = 1
vectorize_quad = 1
vectorize_grads = 1
vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount
......@@ -46,4 +46,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants]
cell = hexahedron
degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand
degree = 3, 7 | expand
......@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters
dim = 3
mbperrank = 100
ranks = 16
ranks = 32
floatingbytes = 8
# Metaini Calculations
......@@ -38,12 +38,10 @@ extension = vtu
fastdg = 1
sumfact = 1
vectorize_quad = 1
vectorize_grads = 1
vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount
exact_solution_expression = g
compare_l2errorsquared = 1e-6
quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants]
......
......@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters
dim = 3
mbperrank = 100
ranks = 16
ranks = 32
floatingbytes = 8
# Metaini Calculations
......@@ -42,8 +42,6 @@ vectorize_slice = 1
instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount
exact_solution_expression = g
compare_l2errorsquared = 1e-6
quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
[formcompiler.ufl_variants]
......
......@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
# Input parameters
dim = 3
mbperrank = 100
ranks = 16
ranks = 32
floatingbytes = 8
# Metaini Calculations
......@@ -39,7 +39,7 @@ extension = vtu
fastdg = 1
sumfact = 1
vectorize_quad = 1
vectorize_grads = 1
vectorize_greedy = 1
instrumentation_level = 2, 3, 4 | expand
opcounter = 1, 0 | expand opcount
time_opcounter = 0, 1 | expand opcount
......@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
[formcompiler.ufl_variants]
cell = hexahedron
v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree
p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9 | expand degree
v_degree = 3, 7 | expand degree
p_degree = 2, 6 | expand degree
cell = hexahedron
x = SpatialCoordinate(cell)
g_v = as_vector((16.*x[1]*(1.-x[1])*x[2]*(1.-x[2]), 0.0, 0.0))
g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0))
bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
V = VectorElement("DG", cell, v_degree)
......
......@@ -6,6 +6,7 @@
# Load modules
ml gcc/6.2
ml tbb
ml intelmpi
ml openblas
ml metis
......@@ -15,7 +16,7 @@ ml suitesparse
#SBATCH -J poisson_dg
# Number of processes
#SBATCH -n 16
#SBATCH -n 32
# Choose the SLURM partition (sinfo for overview)
#SBATCH -p haswell16c
......@@ -28,7 +29,7 @@ ml suitesparse
SRUNOPT="--cpu_bind=verbose,core"
# Search for runnable executables
FILES=$(ls *.ini)
FILES=$(ls *.ini | grep -v '^verify')
for inifile in $FILES
do
line=$(grep ^"opcounter = " $inifile)
......
......@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord):
vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
......
......@@ -282,7 +282,7 @@ def _realize_sum_factorization_kernel(sf):
# Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4:
stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
depends_on=frozenset({tag}),
depends_on=frozenset({lp.match.Tagged(tag)}),
within_inames=frozenset(sf.within_inames))})
if sf.stage == 1:
qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'
......
......@@ -51,6 +51,13 @@ def no_vectorization(sumfacts):
def vertical_vectorization_strategy(sumfact, depth):
# If depth is 1, there is nothing to do
if depth == 1:
if isinstance(sumfact, SumfactKernel):
return {sumfact: sumfact}
else:
return {k: sumfact for k in sumfact.kernels}
# Assert that this is not already sliced
assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
result = {}
......@@ -139,6 +146,36 @@ def diagonal_vectorization_strategy(sumfacts, width):
return result
def greedy_vectorization_strategy(sumfacts, width):
    """Greedily map sum factorization kernels to vectorized replacements.

    The loop starts with a horizontal width of ``width`` and a vertical
    depth of 1. In each round it

    * asks ``horizontal_vectorization_strategy`` for a mapping of the
      still unassigned kernels at the current horizontal width (once the
      horizontal width has dropped to 1, every remaining kernel is simply
      mapped to itself),
    * keeps only those entries whose ``horizontal_width`` equals the
      current horizontal width, passes them to
      ``vertical_vectorization_strategy`` with the current depth and
      merges the returned mapping into the result,
    * removes the handled kernels from the working set,
    * halves the horizontal width and doubles the vertical depth.

    Padding (``allow_padding=1``) is only requested in the first round.

    :arg sumfacts: iterable of kernels to process; copied into a set.
    :arg width: the horizontal width used in the first round.
    :returns: dict with the entries collected from
        ``vertical_vectorization_strategy`` over all rounds.

    NOTE(review): the indentation of this function was lost in the
    extracted diff; the nesting below is reconstructed and should be
    confirmed against the repository.
    """
    # Work on a private set so that discarding below never touches the
    # caller's container
    sumfacts = set(sumfacts)
    horizontal = width
    vertical = 1
    allowed_padding = 1
    result = {}
    # Integer halving eventually drives horizontal to 0, ending the loop
    while horizontal > 0:
        if horizontal > 1:
            horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding)
        else:
            # Width 1: no horizontal grouping, each kernel stands for itself
            horizontal_kernels = {sf: sf for sf in sumfacts}
        for sf in horizontal_kernels:
            # Only accept mappings that actually reached the current width;
            # everything else stays in sumfacts for a later round
            if horizontal_kernels[sf].horizontal_width == horizontal:
                vert = vertical_vectorization_strategy(horizontal_kernels[sf],
                                                       vertical)
                for k in vert:
                    result[k] = vert[k]
                sumfacts.discard(sf)
        horizontal = horizontal // 2
        vertical = vertical * 2
        # We heuristically allow padding only on the full SIMD width
        allowed_padding = 0
    return result
def decide_vectorization_strategy():
""" Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this,
......@@ -188,6 +225,13 @@ def decide_vectorization_strategy():
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
elif get_option("vectorize_greedy"):
inputkeys = set(sf.input_key for sf in active_sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
else:
for old, new in no_vectorization(active_sumfacts).items():
sfdict[old] = new
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment