Skip to content
Snippets Groups Projects
Commit 94e8e7df authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Implement everything through a costmodel and adapt the ini options

parent 7c27277d
No related branches found
No related tags found
No related merge requests found
...@@ -54,15 +54,12 @@ class PerftoolOptionsArray(ImmutableRecord): ...@@ -54,15 +54,12 @@ class PerftoolOptionsArray(ImmutableRecord):
project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project") project_basedir = PerftoolOption(helpstr="The base (build) directory of the dune-perftool project")
fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.") fastdg = PerftoolOption(default=False, helpstr="Use FastDGGridOperator from PDELab.")
sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization") sumfact = PerftoolOption(default=False, helpstr="Use sumfactorization")
vectorize_quad = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_quadloop = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_strategy = PerftoolOption(default="none", helpstr="The identifier of the vectorization cost model")
vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization read by the 'explicit' strategy")
vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorization_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization read by the 'explicit' strategy")
vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)") vectorization_padding = PerftoolOption(default=None, helpstr="an explicit value for the allowed padding in vectorization")
vectorize_horizontal = PerftoolOption(default=None, helpstr="an explicit value for horizontal vectorization") vectorization_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
vectorize_vertical = PerftoolOption(default=None, helpstr="an explicit value for vertical vectorization")
vectorize_padding = PerftoolOption(default=None, helpstr="an explicit value for padding in vectorization")
vectorize_allow_quadrature_changes = PerftoolOption(default=False, helpstr="whether the vectorization strategy is allowed to alter quadrature point numbers")
turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")
grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.") grid_offset = PerftoolOption(default=False, helpstr="Set to true if you want a yasp grid where the lower left corner is not in the origin.")
......
...@@ -549,7 +549,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): ...@@ -549,7 +549,7 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
kernel = heuristic_duplication(kernel) kernel = heuristic_duplication(kernel)
# Maybe apply vectorization strategies # Maybe apply vectorization strategies
if get_option("vectorize_quad"): if get_option("vectorization_quadloop"):
if get_option("sumfact"): if get_option("sumfact"):
from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop from dune.perftool.loopy.transformations.vectorize_quad import vectorize_quadrature_loop
kernel = vectorize_quadrature_loop(kernel) kernel = vectorize_quadrature_loop(kernel)
......
...@@ -4,7 +4,9 @@ import logging ...@@ -4,7 +4,9 @@ import logging
from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.loopy.vcl import get_vcl_type_size
from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
from dune.perftool.generation import (generator_factory, from dune.perftool.generation import (backend,
generator_factory,
get_backend,
get_counted_variable, get_counted_variable,
get_global_context_value, get_global_context_value,
) )
...@@ -48,165 +50,38 @@ def attach_vectorization_info(sf): ...@@ -48,165 +50,38 @@ def attach_vectorization_info(sf):
return _cache_vectorization_info(sf, None) return _cache_vectorization_info(sf, None)
def no_vec(sf): @backend(interface="vectorization_costfunction", name="greedy")
return sf.copy(buffer=get_counted_variable("buffer")) def greedy_costfunction(sf):
return 1
def no_vectorization(sumfacts): @backend(interface="vectorization_costfunction", name="explicit")
return {sf: no_vec(sf) for sf in sumfacts} def explicit_costfunction(sf):
# Read the explicitly set values for horizontal and vertical vectorization
width = get_vcl_type_size(np.float64)
horizontal = int(get_option("vectorization_horizontal", width))
vertical = int(get_option("vectorization_vertical", 1))
if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
def vertical_vectorization_strategy(sumfact, depth): return 1
# If depth is 1, there is nothing do
if depth == 1:
if isinstance(sumfact, SumfactKernel):
return {sumfact: sumfact}
else:
return {k: sumfact for k in sumfact.kernels}
# Assert that this is not already sliced
assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
result = {}
# Determine which of the matrices in the kernel should be sliced
def determine_slice_direction(sf):
for i, mat in enumerate(sf.matrix_sequence):
if mat.quadrature_size % depth == 0:
return i
elif get_option("vectorize_allow_quadrature_changes") and mat.quadrature_size != 1:
quad = list(quadrature_points_per_direction())
quad[i] = round_to_multiple(quad[i], depth)
set_quadrature_points(tuple(quad))
return i
elif mat.quadrature_size != 1:
raise PerftoolError("Vertical vectorization is not possible!")
if isinstance(sumfact, SumfactKernel):
kernels = [sumfact]
else: else:
assert isinstance(sumfact, VectorizedSumfactKernel) return 2
kernels = sumfact.kernels
newkernels = []
for sf in kernels:
sliced = determine_slice_direction(sf)
oldtab = sf.matrix_sequence[sliced]
for i in range(depth):
seq = list(sf.matrix_sequence)
seq[sliced] = oldtab.copy(slice_size=depth,
slice_index=i)
newkernels.append(sf.copy(matrix_sequence=tuple(seq)))
if isinstance(sumfact, SumfactKernel):
buffer = get_counted_variable("vertical_buffer")
result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels),
buffer=buffer,
vertical_width=depth,
)
else:
assert isinstance(sumfact, VectorizedSumfactKernel)
for sf in kernels:
result[sf] = sumfact.copy(kernels=tuple(newkernels),
vertical_width=depth,
)
return result
def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
result = {}
todo = set(sumfacts)
while todo:
kernels = []
for sf in sorted(todo, key=lambda s: s.position_priority):
if len(kernels) < width:
kernels.append(sf)
todo.discard(sf)
buffer = get_counted_variable("joined_buffer")
if len(kernels) in range(width - allow_padding, width + 1):
for sf in kernels:
result[sf] = VectorizedSumfactKernel(kernels=tuple(kernels),
horizontal_width=width,
buffer=buffer,
)
return result
def diagonal_vectorization_strategy(sumfacts, width):
# Read explicitly set values
horizontal = get_option("vectorize_horizontal")
vertical = get_option("vectorize_vertical")
padding = get_option("vectorize_padding")
if width == 4:
if horizontal is None:
horizontal = 2
if vertical is None:
vertical = 2
if padding is None:
padding = 0
elif width == 8:
if horizontal is None:
horizontal = 4
if vertical is None:
vertical = 2
if padding is None:
padding = 1
else:
raise NotImplementedError
horizontal = int(horizontal)
vertical = int(vertical)
padding = int(padding)
result = {}
horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=padding)
for sf in horizontal_kernels:
vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width)
for k in vert:
result[k] = vert[k]
return result
def strategy_cost(strategy):
def greedy_vectorization_strategy(sumfacts, width): qp, strategy = strategy
sumfacts = set(sumfacts) set_quadrature_points(qp)
horizontal = width func = get_backend(interface="vectorization_costfunction",
vertical = 1 name=get_option("vectorization_strategy"))
allowed_padding = 1 return sum(float(func(sf)) for sf in strategy.values())
result = {}
while horizontal > 0:
if horizontal > 1:
horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding)
else:
horizontal_kernels = {sf: sf for sf in sumfacts}
for sf in horizontal_kernels:
if horizontal_kernels[sf].horizontal_width == horizontal:
vert = vertical_vectorization_strategy(horizontal_kernels[sf],
vertical)
for k in vert:
result[k] = vert[k]
sumfacts.discard(sf)
horizontal = horizontal // 2
vertical = vertical * 2
# We heuristically allow padding only on the full SIMD width
allowed_padding = 0
return result
def print_vectorization_strategy(strategy): def stringify_vectorization_strategy(strategy):
result = []
qp, strategy = strategy qp, strategy = strategy
print "\nPrinting potential vectorization strategy:"
print "Quadrature point tuple: {}".format(qp) result.append["Printing potential vectorization strategy:"]
result.append["Quadrature point tuple: {}".format(qp)]
# Look for all realizations in the strategy and iterate over them # Look for all realizations in the strategy and iterate over them
cache_keys = frozenset(v.cache_key for v in strategy.values()) cache_keys = frozenset(v.cache_key for v in strategy.values())
...@@ -214,12 +89,12 @@ def print_vectorization_strategy(strategy): ...@@ -214,12 +89,12 @@ def print_vectorization_strategy(strategy):
# Filter all the kernels that are realized by this and print # Filter all the kernels that are realized by this and print
for key in strategy: for key in strategy:
if strategy[key].cache_key == ck: if strategy[key].cache_key == ck:
print "{}:".format(key) result.append["{}:".format(key)]
# Find one representative to print # Find one representative to print
for val in strategy.values(): for val in strategy.values():
if val.cache_key == ck: if val.cache_key == ck:
print " {}".format(val) result.append[" {}".format(val)]
break break
...@@ -243,45 +118,29 @@ def decide_vectorization_strategy(): ...@@ -243,45 +118,29 @@ def decide_vectorization_strategy():
# All sum factorization kernels that get used # All sum factorization kernels that get used
active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts] active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
# We map inacitve sum factorizatino kernels to 0 # If no vectorization is needed, abort now
sfdict = {} if get_option("vectorization_strategy") == "none":
for sf in inactive_sumfacts: for sf in all_sumfacts:
sfdict[sf] = 0 _cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
return
logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes" logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
.format(len(active_sumfacts))) .format(len(active_sumfacts)))
if get_option("vectorize_grads"): # Find the best vectorization strategy by using a costmodel
# Currently we base our idea here on the fact that we only group sum width = get_vcl_type_size(np.float64)
# factorization kernels with the same input. strategy = min(vectorization_opportunity_generator(active_sumfacts, width),
inputkeys = set(sf.input_key for sf in active_sumfacts) key=strategy_cost)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64) # Treat the quadrature points
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] qp, sfdict = strategy
for old, new in horizontal_vectorization_strategy(sumfact_filter, width).items(): set_quadrature_points(qp)
sfdict[old] = new
elif get_option("vectorize_slice"): logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
for sumfact in active_sumfacts: "\n".join(stringify_vectorization_strategy(strategy)))
width = get_vcl_type_size(np.float64)
for old, new in vertical_vectorization_strategy(sumfact, width).items(): # We map inactive sum factorization kernels to 0
sfdict[old] = new sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
elif get_option("vectorize_diagonal"):
inputkeys = set(sf.input_key for sf in active_sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
elif get_option("vectorize_greedy"):
inputkeys = set(sf.input_key for sf in active_sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
sfdict[old] = new
else:
for old, new in no_vectorization(active_sumfacts).items():
sfdict[old] = new
# Register the results # Register the results
for sf in all_sumfacts: for sf in all_sumfacts:
...@@ -297,7 +156,7 @@ def vectorization_opportunity_generator(sumfacts, width): ...@@ -297,7 +156,7 @@ def vectorization_opportunity_generator(sumfacts, width):
# #
quad_points = [quadrature_points_per_direction()] quad_points = [quadrature_points_per_direction()]
if True or get_option("vectorize_allow_quadrature_changes"): if True or get_option("vectorization_allow_quadrature_changes"):
sf = next(iter(sumfacts)) sf = next(iter(sumfacts))
depth = 1 depth = 1
while depth <= width: while depth <= width:
...@@ -331,7 +190,9 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already= ...@@ -331,7 +190,9 @@ def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=
for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}), for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
width, width,
qp, qp,
add_to_frozendict(already, {sf_to_decide: sf_to_decide}), add_to_frozendict(already,
{sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
),
): ):
yield opp yield opp
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment