Skip to content
Snippets Groups Projects
Commit 184da59e authored by René Heß's avatar René Heß
Browse files

[skip ci] Rewrite loop reordering

This rewrite was necessary for the cases where the instruction contains a
reduction but the expression itself is not a reduction. Besides fixing this bug
it greatly reduces code duplication and increases code quality.
parent ce3fc8ac
No related branches found
No related tags found
No related merge requests found
......@@ -63,195 +63,53 @@ def _get_inames_of_reduction(instr, iname_permutation):
return outer_inames, reduction_iname, inner_inames, vec_inames
def _get_iname_bound(kernel, iname):
    """Return the (exclusive) upper bound of ``iname``, i.e. its trip count for a zero-based domain."""
    # TODO: Not sure if that works in all cases
    leaf_index, = kernel.get_leaf_domain_indices((iname,))
    # dim_max(0) yields the maximum value of the first domain dimension as a pw_aff
    max_pwaff = kernel.domains[leaf_index].dim_max(0)
    upper = lp.symbolic.pw_aff_to_expr(max_pwaff)
    return upper + 1
class FindReductionMapper(lp.symbolic.WalkMapper):
    """Expression walker that records every Reduction node it encounters."""

    def __init__(self):
        # Reductions found so far, in traversal order
        self.reductions = []

    def map_reduction(self, expr, **kwargs):
        """Remember *expr* and continue walking into its body."""
        self.reductions.append(expr)
        return super().map_reduction(expr, **kwargs)
def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
"""Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
dim = world_dimension()
# Nothing to do if permutation is identity
if iname_permutation == tuple(range(dim + 1)):
return kernel
def _reorder_reduction_loops(kernel, match, iname_order):
"""Reorder loops in instructions containing one reduction
# Use names used in sum factorization kernel (without the index that distinguishes the different directions)
default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
from dune.codegen.sumfact.permutation import permute_backward
new_iname_order = permute_backward(default_iname_order, iname_permutation)
In order to reorder the loops we need to manually do the accumulation in a
temporary that is large enough to fit all the data created in loops inside
the reduction loop.
for instr in kernel.instructions:
# Inames used in this reduction
outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
iname_permutation)
if reduction_iname:
current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
else:
current_inames = outer_inames + inner_inames + vec_inames
current_iname_order = _current_iname_order(current_inames,
new_iname_order)
if iname_permutation[-1] == dim:
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
Parameters
----------
kernel: loopy.kernel.LoopKernel
match:
iname_order: tuple of str
preferred loop order for this instruction
"""
instructions = lp.find_instructions(kernel, match)
for instr in instructions:
# Get reduction
reduction = FindReductionMapper()
reduction(instr.expression)
assert len(reduction.reductions) == 1
reduction = reduction.reductions[0]
# Reduction iname, inner inames and vector inames
assert len(reduction.inames) == 1
reduction_iname = reduction.inames[0]
assert set([reduction_iname]) == instr.reduction_inames()
if reduction_iname not in iname_order:
lp.prioritize_loops(kernel, iname_order)
continue
if isinstance(instr.expression, lp.symbolic.Reduction):
if isinstance(instr.assignee, prim.Subscript):
assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
match = lp.match.Id(instr.id)
kernel = remove_reduction(kernel, match)
lp.prioritize_loops(kernel, current_iname_order)
duplicate_inames = tuple(inner_inames)
match = lp.match.Id(instr.id + '_set_zero')
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
else:
# Dependencies
match = lp.match.Id(instr.id)
depends_on = lp.find_instructions(kernel, match)[0].depends_on
depending = []
for i in kernel.instructions:
if instr.id in i.depends_on:
depending.append(i.id)
# Remove reduction
kernel = lp.remove_instructions(kernel, set([instr.id]))
# Create dim_tags
dim_tags = ','.join(['f'] * len(inner_inames))
vectorized = len(vec_inames) > 0
if vectorized:
assert len(vec_inames) == 1
dim_tags = dim_tags + ',vec'
# Create shape
shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
# Update temporary_variables of this kernel
from dune.codegen.loopy.temporary import DuneTemporaryVariable
accum_variable = get_counted_variable('accum_variable')
from dune.codegen.loopy.target import dtype_floatingpoint
dtype = lp.types.NumpyType(dtype_floatingpoint())
var = {accum_variable: DuneTemporaryVariable(accum_variable,
dtype=dtype,
shape=shape,
dim_tags=dim_tags,
managed=True)}
tv = kernel.temporary_variables.copy()
tv.update(var)
del tv[instr.assignee.name]
kernel = kernel.copy(temporary_variables=tv)
# Set accumulation variable to zero
accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
if vectorized:
accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
accum_init_id = instr.id + '_accum_init'
accum_init_instr = lp.Assignment(assignee,
0,
within_inames=instr.within_inames,
id=accum_init_id,
depends_on=depends_on,
tags=('accum_init',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
# Accumulate in temporary variable
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
expression = prim.Sum((assignee, instr.expression.expr))
within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
accum_id = instr.id + '_accum'
accum_instr = lp.Assignment(assignee,
expression,
within_inames=within_inames,
id=accum_id,
depends_on=frozenset([accum_init_id]),
tags=('accum',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
# Duplicate inames and reorder
duplicate_inames = tuple(inner_inames)
for idx in [accum_init_id, accum_id]:
match = lp.match.Id(idx)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = lp.find_instructions(kernel, match)[0].within_inames
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Restore dependencies
for dep in depending:
match = lp.match.Id(dep)
kernel = lp.add_dependency(kernel, match, accum_id)
match = lp.match.Tagged('sumfact_stage3')
assign_instr, = lp.find_instructions(kernel, match)
from dune.codegen.loopy.symbolic import substitute
subst = {instr.assignee.name: assignee}
new_assign_instr = assign_instr.copy(expression=substitute(assign_instr.expression, subst),
id=assign_instr.id + '_mod')
kernel = kernel.copy(instructions=kernel.instructions + [new_assign_instr])
kernel = lp.remove_instructions(kernel, set([assign_instr.id]))
else:
current_iname_order = _current_iname_order(instr.within_inames, new_iname_order)
lp.prioritize_loops(kernel, current_iname_order)
return kernel
def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
"""Reorder the loop nests of a tensor contraction using an accumulation variable"""
dim = world_dimension()
# Nothing to do if permutation is identity
if iname_permutation == tuple(range(dim + 1)):
return kernel
# Use names used in sum factorization kernel (without the index that distinguishes the different directions)
default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
from dune.codegen.sumfact.permutation import permute_backward
new_iname_order = permute_backward(default_iname_order, iname_permutation)
for instr in kernel.instructions:
# Inames used in this reduction
outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
iname_permutation)
if reduction_iname:
current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
else:
current_inames = outer_inames + inner_inames + vec_inames
# We can directly use lp.prioritize_loops if:
# - The reduction is the innermost loop
# - There is no reduction (eg reduced direction on faces)
if iname_permutation[-1] == dim or reduction_iname is None:
current_iname_order = _current_iname_order(current_inames,
new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
inner_inames = iname_order[iname_order.index(reduction_iname)+1:]
if len(inner_inames) == 0:
lp.prioritize_loops(kernel, iname_order)
continue
assert isinstance(instr.expression, lp.symbolic.Reduction)
# Dependencies. Note: We change the instructions below so we cannot do
# depends_on=instr.depends_on. But we know that the id of instr is
# still valid so we can use that!
match = lp.match.Id(instr.id)
depends_on = lp.find_instructions(kernel, match)[0].depends_on
depending = []
for i in kernel.instructions:
if instr.id in i.depends_on:
depending.append(i.id)
# TODO: There should be a better way to do that
vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
assert len(vec_inames) < 2
# Remove reduction
kernel = lp.remove_instructions(kernel, set([instr.id]))
# {{{ Create new temporary variable
# Create dim_tags
dim_tags = ','.join(['f'] * len(inner_inames))
......@@ -261,7 +119,7 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
dim_tags = dim_tags + ',vec'
# Create shape
shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
# Update temporary_variables of this kernel
from dune.codegen.loopy.temporary import DuneTemporaryVariable
......@@ -277,6 +135,8 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
tv.update(var)
kernel = kernel.copy(temporary_variables=tv)
# }}}
# Set accumulation variable to zero
accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
if vectorized:
......@@ -287,15 +147,15 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
0,
within_inames=instr.within_inames,
id=accum_init_id,
depends_on=depends_on,
depends_on=instr.depends_on,
tags=('accum_init',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
# Accumulate in temporary variable
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
expression = prim.Sum((assignee, instr.expression.expr))
within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
expression = prim.Sum((assignee, reduction.expr))
within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
accum_id = instr.id + '_accum'
accum_instr = lp.Assignment(assignee,
expression,
......@@ -306,65 +166,125 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
if 'haddsubst' not in str(instr):
# Write back the result if this reduction does not come from a substitution
assign_id = instr.id + '_assign'
assign_instr = lp.Assignment(instr.assignee,
assignee,
within_inames=within_inames,
id=assign_id,
depends_on=frozenset([accum_id]),
tags=('accum_assign',),
)
kernel = kernel.copy(instructions=kernel.instructions + [assign_instr])
# Restore dependencies
for dep in depending:
match = lp.match.Id(dep)
# Replace reduction in insn with accumulation result
def map_reduction(expr, rec):
return assignee
mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
new_expression = mapper(instr.expression)
assign_id = instr.id + '_assign'
new_instr = instr.copy(expression=new_expression,
id=assign_id,
depends_on=frozenset(list(instr.depends_on) + [accum_id]))
kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
# Fix dependencies and remove old instruction
for i in kernel.instructions:
if instr.id in i.depends_on:
match = lp.match.Id(i.id)
kernel = lp.add_dependency(kernel, match, assign_id)
else:
# If this reduction comes from a substitution take the
# corresponding assignment and replace the substitution with the
# accumulation result
assert len(depending) == 1
assignment, = [i for i in kernel.instructions if depending[0] == i.id]
other = [i for i in kernel.instructions if i.id != assignment.id]
replace = {prim.Variable(instr.assignee.name): assignee}
from dune.codegen.loopy.symbolic import substitute
new_expression = substitute(assignment.expression, replace)
new_instr = assignment.copy(expression=new_expression,
depends_on=frozenset(list(assignment.depends_on) + [accum_id]))
kernel = kernel.copy(instructions=other + [new_instr])
# Remove the temporary variable that belongs to the
# substitution. It is no longer used.
tv = kernel.temporary_variables.copy()
del tv[instr.assignee.name]
kernel = kernel.copy(temporary_variables=tv)
# Reordering loops only works if we duplicate some inames
kernel = lp.remove_instructions(kernel, set([instr.id]))
# Duplicate and reorder loops for accumulation
ids = [accum_init_id, accum_id]
duplicate_inames = tuple(inner_inames)
for idx in ids:
match = lp.match.Id(idx)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
match = lp.match.Id(accum_init_id)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Reorder assignment loops
kernel = lp.prioritize_loops(kernel, iname_order)
return kernel
# Reorder loops of the assignment of the result
if 'haddsubst' not in str(instr):
match = lp.match.Id(assign_id)
else:
match = lp.match.Id(assignment.id)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Change loop order
def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure

    Parameters
    ----------
    kernel: loopy.kernel.LoopKernel
    iname_permutation: tuple of int
        Requested permutation of the loop order (last entry refers to the
        reduction loop).

    Returns
    -------
    The transformed loopy kernel.
    """
    dim = world_dimension()

    # Nothing to do if permutation is identity
    if iname_permutation == tuple(range(dim + 1)):
        return kernel

    # Use names used in sum factorization kernel (without the index that
    # distinguishes the different directions)
    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
    from dune.codegen.sumfact.permutation import permute_backward
    new_iname_order = permute_backward(default_iname_order, iname_permutation)

    for instr in kernel.instructions:
        # Inames used in this reduction
        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
                                                                                           iname_permutation)
        if reduction_iname:
            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
        else:
            current_inames = outer_inames + inner_inames + vec_inames
        current_iname_order = _current_iname_order(current_inames, new_iname_order)

        # We can directly use lp.prioritize_loops if
        # - The reduction is the innermost loop
        # - There is no reduction (eg reduced direction on faces)
        if iname_permutation[-1] == dim or reduction_iname is None:
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
            continue

        if isinstance(instr.expression, lp.symbolic.Reduction):
            # If the instruction is a reduction we can directly accumulate in the assignee
            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
            match = lp.match.Id(instr.id)
            kernel = remove_reduction(kernel, match)

            # FIX: loopy transformations return a new kernel; the previous code
            # discarded the result of these prioritize_loops calls, making them no-ops.
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))

            # The set-zero instruction created by remove_reduction needs its own
            # (duplicated) inames before it can get a loop priority of its own
            duplicate_inames = tuple(inner_inames)
            match = lp.match.Id(instr.id + '_set_zero')
            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
            kernel = lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
        else:
            # In stage 3 this is usually not a reduction. In this case direct
            # accumulation is not possible and we accumulate in a temporary
            # variable instead
            match = lp.match.Id(instr.id)
            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
    return kernel
def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
    dim = world_dimension()

    # Identity permutation: loops are already in the requested order
    if iname_permutation == tuple(range(dim + 1)):
        return kernel

    # Loop names used by the sum factorization kernel (the index that
    # distinguishes the different directions is stripped)
    from dune.codegen.sumfact.permutation import permute_backward
    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
    new_iname_order = permute_backward(default_iname_order, iname_permutation)

    for instr in kernel.instructions:
        # Classify the inames appearing in this instruction
        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
                                                                                           iname_permutation)
        if reduction_iname:
            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
        else:
            current_inames = outer_inames + inner_inames + vec_inames
        current_iname_order = _current_iname_order(current_inames, new_iname_order)

        # lp.prioritize_loops suffices when the reduction is the innermost
        # loop or when there is no reduction at all (eg reduced direction on
        # faces); otherwise the reduction loops must be rewritten.
        if iname_permutation[-1] == dim or reduction_iname is None:
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
        else:
            kernel = _reorder_reduction_loops(kernel, lp.match.Id(instr.id), current_iname_order)
    return kernel
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment