Skip to content
Snippets Groups Projects
Commit 9087f6c9 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Implement a generator for *all* vectorization opportunities

parent d09eebf6
No related branches found
No related tags found
No related merge requests found
""" Implementation of the cost model used for vectorization
"""
def ilp_heuristic(sf):
""" A heuristic measure for the ILP capabilities of the kernel
Return is the throttle of insufficient ILP, generally in (0., 1.0].
1.0 indicates perfect instruction level parallelism.
"""
return 1.0
def vectorization_costfunction(sf):
# kernel characteristics necessary
traffic = sf.memory_traffic
intensity = sf.operations / sf.memory_traffic
# hardware characteristics necessary
bandwidth = 100000000000000000.0
peakperformance = 100.0
roofline = min(bandwidth * intensity, ilp_heuristics(sf) * peakperformance)
return (traffic * intensity) / roofline
...@@ -17,8 +17,10 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray, ...@@ -17,8 +17,10 @@ from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
) )
from dune.perftool.error import PerftoolError from dune.perftool.error import PerftoolError
from dune.perftool.options import get_option from dune.perftool.options import get_option
from dune.perftool.tools import round_to_multiple from dune.perftool.tools import add_to_frozendict,round_to_multiple
from frozendict import frozendict
import itertools as it
import loopy as lp import loopy as lp
import numpy as np import numpy as np
...@@ -264,3 +266,120 @@ def decide_vectorization_strategy(): ...@@ -264,3 +266,120 @@ def decide_vectorization_strategy():
# Register the results # Register the results
for sf in all_sumfacts: for sf in all_sumfacts:
_cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf))) _cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf)))
def vectorization_opportunity_generator(sumfacts, width):
""" Generator that yields all vectorization opportunities for the given
sum factorization kernels as tuples of quadrature point tuple and vectorization
dictionary """
#
# Find all the possible quadrature point tuples
#
quad_points = [quadrature_points_per_direction()]
if True or get_option("vectorize_allow_quadrature_changes"):
sf = next(iter(sumfacts))
depth = 1
while depth <= width:
i = 0 if sf.matrix_sequence[0].face is None else 1
quad = list(quadrature_points_per_direction())
quad[i] = round_to_multiple(quad[i], depth)
quad_points.append(tuple(quad))
depth = depth * 2
quad_points = list(set(quad_points))
for qp in quad_points:
#
# Determine vectorization opportunities given a fixed quadrature point number
#
for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp):
yield qp, opp
def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
# We have gone into recursion deep enough to have all sum factorization nodes
# assigned their vectorized counterpart. We can yield the result now!
yield already
return
# Otherwise we pick a random sum factorization kernel and construct all the vectorization
# opportunities realizing this particular kernel and go into recursion.
sf_to_decide = next(iter(sumfacts))
# Have "unvectorized" as an option, although it is not good
for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
width,
qp,
add_to_frozendict(already, {sf_to_decide: sf_to_decide}),
):
yield opp
# Find all the sum factorization kernels that the chosen kernel can be parallelized with
#
# TODO: Right now we check for same input, which is not actually needed in order
# to be a suitable candidate! We should relax this concept at some point!
candidates = filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts)
horizontal = 1
while horizontal <= width:
# Iterate over the possible combinations of sum factorization kernels
# taking into account all the permutations of kernels. This also includes
# combinations which use a padding of 1.
for combo in it.chain(it.permutations(candidates, horizontal),
it.permutations(candidates, horizontal - 1)):
# The chosen kernels must be part of the kernels for recursion
# to work correctly
if sf_to_decide not in combo:
continue
# Set up the vectorization dict for this combo
vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
width,
qp,
add_to_frozendict(already, vecdict),
):
yield opp
horizontal = horizontal * 2
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
# Enhance the list of sumfact nodes by adding vertical splittings
kernels = []
for sf in sumfacts:
# No slicing needed in the pure horizontal case
if vertical == 1:
kernels.append(sf)
continue
# Determine the slicing direction
slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
if qp[slice_direction] % vertical != 0:
return None
# Split the basis tabulation matrices
oldtab = sf.matrix_sequence[slice_direction]
for i in range(vertical):
seq = list(sf.matrix_sequence)
seq[slice_direction] = oldtab.copy(slice_size=vertical,
slice_index=i)
kernels.append(sf.copy(matrix_sequence=tuple(seq)))
# Join the new kernels into a sum factorization node
buffer = get_counted_variable("joined_buffer")
return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
horizontal_width=horizontal,
vertical_width=vertical,
buffer=buffer,
) for sf in sumfacts}
...@@ -3,6 +3,7 @@ from __future__ import absolute_import ...@@ -3,6 +3,7 @@ from __future__ import absolute_import
import loopy as lp import loopy as lp
import pymbolic.primitives as prim import pymbolic.primitives as prim
import frozendict
def get_pymbolic_basename(expr): def get_pymbolic_basename(expr):
...@@ -67,3 +68,9 @@ def ceildiv(a, b): ...@@ -67,3 +68,9 @@ def ceildiv(a, b):
def round_to_multiple(x, n): def round_to_multiple(x, n):
return n * ceildiv(x, n) return n * ceildiv(x, n)
def add_to_frozendict(fd, valdict):
t = dict(fd)
t.update(valdict)
return frozendict.frozendict(t)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment