Skip to content
Snippets Groups Projects
Commit 624dcf0b authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/sf-face-permutation' into 'master'

Permutation in sumfactorization

See merge request !102
parents 9d68f1bc 33ae4ed2
No related branches found
No related tags found
No related merge requests found
import copy import copy
import itertools
from dune.perftool.loopy.symbolic import substitute from dune.perftool.loopy.symbolic import substitute
from dune.perftool.pdelab.argument import (name_accumulation_variable, from dune.perftool.pdelab.argument import (name_accumulation_variable,
...@@ -328,6 +329,85 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -328,6 +329,85 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
insn_dep = emit_sumfact_kernel(None, restriction, insn_dep) insn_dep = emit_sumfact_kernel(None, restriction, insn_dep)
def _sf_permutation_heuristic(permutations, stage):
"""Heuristic to choose a permutation
- Stage 1: Pick the permutation where in permutations[1:] most
elements are ordered by size
- Stage 3: Pick the permutation where in permutations[:-1] most
elements are ordered by size
"""
def cost(perm, stage):
cost = 0
for i in range(0,len(perm)-2):
if stage==1:
if perm[i+1]>perm[i+2]:
cost += 1
if stage==3:
if perm[0]>perm[i+1]:
cost += 1
return cost
perm = min(permutations, key=lambda i:cost(i,stage))
return perm
def _sf_flop_cost(a_matrices):
"""Computational cost of sumfactorization with this list of a_matrices
"""
cost = 0;
for l in range(len(a_matrices)):
cost_m = 1
cost_n = 1
for i in range(l+1):
cost_m *= a_matrices[i].rows
for i in range(l,len(a_matrices)):
cost_n *= a_matrices[i].cols
cost += cost_m * cost_n
return cost
def _sf_permutation_strategy(a_matrices, stage):
"""Choose permutation of a_matices list based on computational cost
Note: If there are multiple permutations with the same cost a
heuristic is used to pick one.
"""
# Combine permutation and a_matrices list
perm = [i for i, _ in enumerate(a_matrices)]
perm_a_matrices = zip(perm, a_matrices)
# Find cost for all possible permutations of a_matrices list
perm_cost = []
for permutation in itertools.permutations(perm_a_matrices):
perm, series = zip(*permutation)
cost = _sf_flop_cost(series)
perm_cost.append((perm,cost))
# Find minimal cost and all permutations with that cost
_, costs = zip(*perm_cost)
minimal_cost = min(costs)
minimal_cost_permutations = [p[0] for p in perm_cost if p[1]==minimal_cost]
# Use heuristic to pic one of the minimal cost permutations
perm = _sf_permutation_heuristic(minimal_cost_permutations, stage)
return perm
def _permute_forward(t, perm):
tmp = []
for pos in perm:
tmp.append(t[pos])
return tuple(tmp)
def _permute_backward(t, perm):
tmp = [None]*len(t)
for i, pos in enumerate(perm):
tmp[pos] = t[i]
return tuple(tmp)
@generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0))) @generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0)))
def sum_factorization_kernel(a_matrices, def sum_factorization_kernel(a_matrices,
buf, buf,
...@@ -364,13 +444,22 @@ def sum_factorization_kernel(a_matrices, ...@@ -364,13 +444,22 @@ def sum_factorization_kernel(a_matrices,
A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
So the multiplication with A_l is reduction over one index and the So the multiplication with A_l is a reduction over one index and
transformation brings the next reduction index in the fastest the transformation brings the next reduction index in the fastest
position. position.
Note: In the code below the transformation step is directly done Note: In the code below the transformation step is directly done
in the reduction instruction by adapting the assignee! in the reduction instruction by adapting the assignee!
It can make sense to permute the order of directions. If you have
a small m_l (e.g. stage 1 on faces) it is better to do direction l
first. This can be done permuting:
- The order of the A matrices.
- Permuting the input tensor.
- Permuting the output tensor (this assures that the directions of
the output tensor are again ordered from 0 to d-1).
Arguments: Arguments:
---------- ----------
a_matrices: An iterable of AMatrix instances a_matrices: An iterable of AMatrix instances
...@@ -391,7 +480,6 @@ def sum_factorization_kernel(a_matrices, ...@@ -391,7 +480,6 @@ def sum_factorization_kernel(a_matrices,
restriction: Restriction for faces values. restriction: Restriction for faces values.
direct_input: Global data structure containing input for direct_input: Global data structure containing input for
sumfactorization (e.g. when using FastDGGridOperator). sumfactorization (e.g. when using FastDGGridOperator).
""" """
if get_global_context_value("dry_run", False): if get_global_context_value("dry_run", False):
return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset() return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
...@@ -418,6 +506,18 @@ def sum_factorization_kernel(a_matrices, ...@@ -418,6 +506,18 @@ def sum_factorization_kernel(a_matrices,
within_inames=additional_inames, within_inames=additional_inames,
)}) )})
# Decide in which order we want to process directions in the
# sumfactorization. A clever ordering can lead to a reduced
# complexity. This will e.g. happen at faces where we only have
# one quadratue point m_l=1 if l is the normal direction of the
# face.
#
# Rule of thumb: small m's early and large n's late.
perm = _sf_permutation_strategy(a_matrices, stage)
# Permute a_matrices
a_matrices = _permute_forward(a_matrices, perm)
# Product of all matrices # Product of all matrices
for l, a_matrix in enumerate(a_matrices): for l, a_matrix in enumerate(a_matrices):
# Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
...@@ -446,16 +546,29 @@ def sum_factorization_kernel(a_matrices, ...@@ -446,16 +546,29 @@ def sum_factorization_kernel(a_matrices,
# * an input temporary (default) # * an input temporary (default)
# * a global data structure (if FastDGGridOperator is in use) # * a global data structure (if FastDGGridOperator is in use)
# * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator) # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])
if l == 0 and direct_input is not None: if l == 0 and direct_input is not None:
# See comment bellow
input_inames = _permute_backward(input_inames, perm)
inp_shape = _permute_backward(inp_shape, perm)
globalarg(direct_input, dtype=np.float64, shape=inp_shape) globalarg(direct_input, dtype=np.float64, shape=inp_shape)
if a_matrix.vectorized: if a_matrix.vectorized:
input_summand = prim.Call(prim.Variable("Vec4d"), input_summand = prim.Call(prim.Variable("Vec4d"),
(prim.Subscript(prim.Variable(direct_input), (prim.Subscript(prim.Variable(direct_input),
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])),)) input_inames),))
else: else:
input_summand = prim.Subscript(prim.Variable(direct_input), input_summand = prim.Subscript(prim.Variable(direct_input),
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname) input_inames + vec_iname)
else: else:
# If we did permute the order of a matrices above we also
# permuted the order of out_inames. Unfortunately the
# order of our input is from 0 to d-1. This means we need
# to permute _back_ to get the right coefficients.
if l == 0:
inp_shape = _permute_backward(inp_shape, perm)
input_inames = _permute_backward(input_inames, perm)
# Get a temporary that interprets the base storage of the input # Get a temporary that interprets the base storage of the input
# as a column-major matrix. In later iteration of the amatrix loop # as a column-major matrix. In later iteration of the amatrix loop
# this reinterprets the output of the previous iteration. # this reinterprets the output of the previous iteration.
...@@ -467,7 +580,7 @@ def sum_factorization_kernel(a_matrices, ...@@ -467,7 +580,7 @@ def sum_factorization_kernel(a_matrices,
silenced_warning('read_no_write({})'.format(inp)) silenced_warning('read_no_write({})'.format(inp))
input_summand = prim.Subscript(prim.Variable(inp), input_summand = prim.Subscript(prim.Variable(inp),
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname) input_inames + vec_iname)
switch_base_storage(buf) switch_base_storage(buf)
...@@ -478,14 +591,16 @@ def sum_factorization_kernel(a_matrices, ...@@ -478,14 +591,16 @@ def sum_factorization_kernel(a_matrices,
# corresponding shape (out_shape[0]) goes to the end (slowest # corresponding shape (out_shape[0]) goes to the end (slowest
# direction) and everything stays column major (ftags->fortran # direction) and everything stays column major (ftags->fortran
# style). # style).
#
# If we are in the last step we reverse the permutation.
output_shape = tuple(out_shape[1:]) + (out_shape[0],)
if l == len(a_matrices)-1:
output_shape = _permute_backward(output_shape, perm)
out = get_buffer_temporary(buf, out = get_buffer_temporary(buf,
shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape, shape=output_shape + vec_shape,
dim_tags=ftags) dim_tags=ftags)
# Write the matrix-matrix multiplication expression # Write the matrix-matrix multiplication expression
# matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
# (prim.Variable(i), k_expr) + vec_iname),
# input_summand))
matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
(prim.Variable(out_inames[0]), k_expr) + vec_iname), (prim.Variable(out_inames[0]), k_expr) + vec_iname),
input_summand)) input_summand))
...@@ -494,8 +609,12 @@ def sum_factorization_kernel(a_matrices, ...@@ -494,8 +609,12 @@ def sum_factorization_kernel(a_matrices,
if a_matrix.cols != 1: if a_matrix.cols != 1:
matprod = lp.Reduction("sum", k, matprod) matprod = lp.Reduction("sum", k, matprod)
# Here we also move the new direction (out_inames[0]) to the end # Here we also move the new direction (out_inames[0]) to the
assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname) # end and reverse permutation
output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
if l == len(a_matrices)-1:
output_inames = _permute_backward(output_inames, perm)
assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
# Issue the reduction instruction that implements the multiplication # Issue the reduction instruction that implements the multiplication
# at the same time store the instruction ID for the next instruction to depend on # at the same time store the instruction ID for the next instruction to depend on
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment