Newer
Older
""" Sum factorization vectorization """
from dune.perftool.loopy.vcl import get_vcl_type_size
from dune.perftool.loopy.symbolic import SumfactKernel, VectorizedSumfactKernel
from dune.perftool.generation import (backend,
generator_factory,
get_backend,
get_counted_variable,
from dune.perftool.pdelab.restriction import (Restriction,
restricted_name,
)
from dune.perftool.sumfact.tabulation import (BasisTabulationMatrixArray,
quadrature_points_per_direction,
set_quadrature_points,
from dune.perftool.error import PerftoolError
from dune.perftool.options import get_option
from dune.perftool.tools import add_to_frozendict,round_to_multiple
from frozendict import frozendict
import itertools as it
import loopy as lp
import numpy as np
@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
def _cache_vectorization_info(old, new):
if new is None:
raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
return new
_collect_sumfact_nodes = generator_factory(item_tags=("sumfactnodes", "dryrundata"), context_tags="kernel", no_deco=True)
def get_all_sumfact_nodes():
from dune.perftool.generation import retrieve_cache_items
return [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
def attach_vectorization_info(sf):
assert isinstance(sf, SumfactKernel)
if get_global_context_value("dry_run"):
@backend(interface="vectorization_costfunction", name="greedy")
def greedy_costfunction(sf):
return 1
@backend(interface="vectorization_costfunction", name="explicit")
def explicit_costfunction(sf):
# Read the explicitly set values for horizontal and vertical vectorization
width = get_vcl_type_size(np.float64)
horizontal = int(get_option("vectorization_horizontal", width))
vertical = int(get_option("vectorization_vertical", 1))
if sf.horizontal_width == horizontal and sf.vertical_width == vertical:
return 1
return 2
def strategy_cost(strategy):
qp, strategy = strategy
set_quadrature_points(qp)
func = get_backend(interface="vectorization_costfunction",
name=get_option("vectorization_strategy"))
return sum(float(func(sf)) for sf in strategy.values())
def stringify_vectorization_strategy(strategy):
result = []
qp, strategy = strategy
result.append["Printing potential vectorization strategy:"]
result.append["Quadrature point tuple: {}".format(qp)]
# Look for all realizations in the strategy and iterate over them
cache_keys = frozenset(v.cache_key for v in strategy.values())
for ck in cache_keys:
# Filter all the kernels that are realized by this and print
for key in strategy:
if strategy[key].cache_key == ck:
result.append["{}:".format(key)]
# Find one representative to print
for val in strategy.values():
if val.cache_key == ck:
result.append[" {}".format(val)]
def decide_vectorization_strategy():
""" Decide how to vectorize!
Note that the vectorization of the quadrature loop is independent of this,
as it is implemented through a post-processing (== loopy transformation) step.
"""
logger = logging.getLogger(__name__)
# Retrieve all sum factorization kernels for stage 1 and 3
from dune.perftool.generation import retrieve_cache_items
all_sumfacts = [i for i in retrieve_cache_items("kernel_default and sumfactnodes")]
# Stage 1 sumfactorizations that were actually used
basis_sumfacts = [i for i in retrieve_cache_items('kernel_default and basis_sf_kernels')]
# This means we can have sum factorizations that will not get used
inactive_sumfacts = [i for i in all_sumfacts if i.stage == 1 and i not in basis_sumfacts]
# All sum factorization kernels that get used
active_sumfacts = [i for i in all_sumfacts if i.stage == 3 or i in basis_sumfacts]
# If no vectorization is needed, abort now
if get_option("vectorization_strategy") == "none":
for sf in all_sumfacts:
_cache_vectorization_info(sf, sf.copy(buffer=get_counted_variable("buffer")))
return
logger.debug("decide_vectorization_strategy: Found {} active sum factorization nodes"
.format(len(active_sumfacts)))
# Find the best vectorization strategy by using a costmodel
width = get_vcl_type_size(np.float64)
strategy = min(vectorization_opportunity_generator(active_sumfacts, width),
key=strategy_cost)
# Treat the quadrature points
qp, sfdict = strategy
set_quadrature_points(qp)
logger.debug("decide_vectorization_strategy: Decided for the following strategy:"
"\n".join(stringify_vectorization_strategy(strategy)))
# We map inactive sum factorization kernels to 0
sfdict = add_to_frozendict(sfdict, {sf: 0 for sf in inactive_sumfacts})
for sf in all_sumfacts:
def vectorization_opportunity_generator(sumfacts, width):
""" Generator that yields all vectorization opportunities for the given
sum factorization kernels as tuples of quadrature point tuple and vectorization
dictionary """
#
# Find all the possible quadrature point tuples
#
quad_points = [quadrature_points_per_direction()]
if True or get_option("vectorization_allow_quadrature_changes"):
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
sf = next(iter(sumfacts))
depth = 1
while depth <= width:
i = 0 if sf.matrix_sequence[0].face is None else 1
quad = list(quadrature_points_per_direction())
quad[i] = round_to_multiple(quad[i], depth)
quad_points.append(tuple(quad))
depth = depth * 2
quad_points = list(set(quad_points))
for qp in quad_points:
#
# Determine vectorization opportunities given a fixed quadrature point number
#
for opp in fixed_quad_vectorization_opportunity_generator(frozenset(sumfacts), width, qp):
yield qp, opp
def fixed_quad_vectorization_opportunity_generator(sumfacts, width, qp, already=frozendict()):
if len(sumfacts) == 0:
# We have gone into recursion deep enough to have all sum factorization nodes
# assigned their vectorized counterpart. We can yield the result now!
yield already
return
# Otherwise we pick a random sum factorization kernel and construct all the vectorization
# opportunities realizing this particular kernel and go into recursion.
sf_to_decide = next(iter(sumfacts))
# Have "unvectorized" as an option, although it is not good
for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference({sf_to_decide}),
width,
qp,
add_to_frozendict(already,
{sf_to_decide: sf_to_decide.copy(buffer=get_counted_variable("buffer"))}
),
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
):
yield opp
# Find all the sum factorization kernels that the chosen kernel can be parallelized with
#
# TODO: Right now we check for same input, which is not actually needed in order
# to be a suitable candidate! We should relax this concept at some point!
candidates = filter(lambda sf: sf.input_key == sf_to_decide.input_key, sumfacts)
horizontal = 1
while horizontal <= width:
# Iterate over the possible combinations of sum factorization kernels
# taking into account all the permutations of kernels. This also includes
# combinations which use a padding of 1.
for combo in it.chain(it.permutations(candidates, horizontal),
it.permutations(candidates, horizontal - 1)):
# The chosen kernels must be part of the kernels for recursion
# to work correctly
if sf_to_decide not in combo:
continue
# Set up the vectorization dict for this combo
vecdict = get_vectorization_dict(combo, width // horizontal, horizontal, qp)
if vecdict is None:
# This particular choice was rejected for some reason.
# Possible reasons:
# * the quadrature point tuple not being suitable
# for this vectorization strategy
continue
# Go into recursion to also vectorize all kernels not in this combo
for opp in fixed_quad_vectorization_opportunity_generator(sumfacts.difference(combo),
width,
qp,
add_to_frozendict(already, vecdict),
):
yield opp
horizontal = horizontal * 2
def get_vectorization_dict(sumfacts, vertical, horizontal, qp):
# Enhance the list of sumfact nodes by adding vertical splittings
kernels = []
for sf in sumfacts:
# No slicing needed in the pure horizontal case
if vertical == 1:
kernels.append(sf)
continue
# Determine the slicing direction
slice_direction = 0 if sf.matrix_sequence[0].face is None else 1
if qp[slice_direction] % vertical != 0:
return None
# Split the basis tabulation matrices
oldtab = sf.matrix_sequence[slice_direction]
for i in range(vertical):
seq = list(sf.matrix_sequence)
seq[slice_direction] = oldtab.copy(slice_size=vertical,
slice_index=i)
kernels.append(sf.copy(matrix_sequence=tuple(seq)))
# Join the new kernels into a sum factorization node
buffer = get_counted_variable("joined_buffer")
return {sf: VectorizedSumfactKernel(kernels=tuple(kernels),
horizontal_width=horizontal,
vertical_width=vertical,
buffer=buffer,
) for sf in sumfacts}