Skip to content
Snippets Groups Projects
Commit 5c1058d9 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/sf-stage3-direct-output' into 'master'

Directly accumulate results in last step of sumfactorization

See merge request !107
parents f60941e5 326c9945
No related branches found
No related tags found
No related merge requests found
...@@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
name=inp, name=inp,
) )
# Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron # Those input fields, that are padded need to be set to zero
# in order to do a horizontal_add lateron
for pad in padding: for pad in padding:
instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)), instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)),
expression=0, expression=0,
...@@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
depends_on=insn_dep, depends_on=insn_dep,
within_inames=frozenset(visitor.inames))}) within_inames=frozenset(visitor.inames))})
# Add a sum factorization kernel that implements the multiplication
# with the test function (stage 3)
pref_pos = i if accterm.argument.index else None
result, insn_dep = sum_factorization_kernel(a_matrices,
buf,
3,
insn_dep=insn_dep,
additional_inames=frozenset(visitor.inames),
preferred_position=pref_pos,
restriction=(accterm.argument.restriction, restriction),
)
inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices)) inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices))
...@@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure) ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure)
rank = 2 if visitor.inames else 1 rank = 2 if visitor.inames else 1
if rank == 2: if rank == 2:
# TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now) # TODO the next line should get its inames from
# elsewhere. This is *NOT* robust (but works right now)
ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())), ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())),
(basis_functions_per_direction(),) * dim, (basis_functions_per_direction(),) * dim,
order="f" order="f"
...@@ -251,6 +242,34 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -251,6 +242,34 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
# Construct the expression representing "{r,jac}.accumulate(..)" # Construct the expression representing "{r,jac}.accumulate(..)"
accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
# We can directly accumulate the solution in the last step of
# the sumfactorization. Currently this is only implemented for
# FastDGGridOperator.
direct_output = None
if get_option('fastdg'):
ft = get_global_context_value("form_type")
accum = accum + ".data()"
if ft == 'residual' or ft == 'jacobian_apply':
direct_output = accum
else:
assert ft == 'jacobian'
direct_output = accum
# Add a sum factorization kernel that implements the multiplication
# with the test function (stage 3)
pref_pos = i if accterm.argument.index else None
result, insn_dep = sum_factorization_kernel(a_matrices,
buf,
3,
insn_dep=insn_dep,
additional_inames=frozenset(visitor.inames),
preferred_position=pref_pos,
restriction=(accterm.argument.restriction, restriction),
direct_output=direct_output,
visitor=visitor
)
# Determine the expression to accumulate with. This depends on the vectorization strategy! # Determine the expression to accumulate with. This depends on the vectorization strategy!
result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
vecinames = () vecinames = ()
...@@ -263,38 +282,22 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -263,38 +282,22 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
(maybe_wrap_subscript(result, prim.Variable(iname)),), (maybe_wrap_subscript(result, prim.Variable(iname)),),
) )
# In the case of FastDGGridOperator we can write directly into the resiudal/jacobi # In the fastdg case we accumulate directly in the last step of the SF!
#
# TODO: At the moment this only works if we do not vectorize
# (over gradients) because loopy tries to acces a vectorclass
# variable.
if get_option('fastdg'): if get_option('fastdg'):
ft = get_global_context_value("form_type") # In the dry run we need an instruction with a
if ft == 'residual' or ft == 'jacobian_apply': # sumfactorization node. We use this to decide on a
accum = accum + ".data()" # vectorization strategy. This is just a dummy
size = basis_functions_per_direction() ** world_dimension() # instruction, in the real run the accumulation is done
globalarg(accum, dtype=np.float64, shape=(size,), managed=False) # directly in the sumfactorization.
assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,)) if get_global_context_value("dry_run", False):
expression = prim.Sum((assignee, result)) dummy = "dummy"
instruction(assignee=assignee, globalarg(dummy, dtype=np.float64)
expression=expression, instruction(assignee = prim.Variable(dummy),
expression = result,
forced_iname_deps=frozenset(inames), forced_iname_deps=frozenset(inames),
forced_iname_deps_is_final=True, forced_iname_deps_is_final=True,
depends_on=insn_dep, depends_on=insn_dep,
) )
else:
assert ft == 'jacobian'
accum = accum + ".data()"
size = basis_functions_per_direction() ** world_dimension()
globalarg(accum, dtype=np.float64, shape=(size, size), managed=True)
assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index, ansatz_lfs.index))
expression = prim.Sum((assignee, result))
instruction(assignee=assignee,
expression=expression,
forced_iname_deps=frozenset(inames + visitor.inames),
forced_iname_deps_is_final=True,
depends_on=insn_dep,
)
# Default: Generate accumulation instructions # Default: Generate accumulation instructions
else: else:
expr = Call(PDELabAccumulationFunction(accum, rank), expr = Call(PDELabAccumulationFunction(accum, rank),
...@@ -368,7 +371,7 @@ def _sf_flop_cost(a_matrices): ...@@ -368,7 +371,7 @@ def _sf_flop_cost(a_matrices):
def _sf_permutation_strategy(a_matrices, stage): def _sf_permutation_strategy(a_matrices, stage):
"""Choose permutation of a_matices list based on computational cost """Choose permutation of a_matrices list based on computational cost
Note: If there are multiple permutations with the same cost a Note: If there are multiple permutations with the same cost a
heuristic is used to pick one. heuristic is used to pick one.
...@@ -418,6 +421,8 @@ def sum_factorization_kernel(a_matrices, ...@@ -418,6 +421,8 @@ def sum_factorization_kernel(a_matrices,
outshape=None, outshape=None,
restriction=0, restriction=0,
direct_input=None, direct_input=None,
direct_output=None,
visitor=None,
): ):
"""Create a sum factorization kernel """Create a sum factorization kernel
...@@ -481,6 +486,9 @@ def sum_factorization_kernel(a_matrices, ...@@ -481,6 +486,9 @@ def sum_factorization_kernel(a_matrices,
direct_input: Global data structure containing input for direct_input: Global data structure containing input for
sumfactorization (e.g. when using FastDGGridOperator). sumfactorization (e.g. when using FastDGGridOperator).
""" """
# Return a pymbolic SumfactKernel node in the dry run. This will
# be used to decide on an appropriate vectorization strategy
# before we do the real thing.
if get_global_context_value("dry_run", False): if get_global_context_value("dry_run", False):
return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset() return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
...@@ -614,7 +622,36 @@ def sum_factorization_kernel(a_matrices, ...@@ -614,7 +622,36 @@ def sum_factorization_kernel(a_matrices,
output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),)
if l == len(a_matrices)-1: if l == len(a_matrices)-1:
output_inames = _permute_backward(output_inames, perm) output_inames = _permute_backward(output_inames, perm)
assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
# In case of direct output we directly accumulate the result
# of the Sumfactorization into some global data structure.
if l == len(a_matrices)-1 and direct_output is not None:
ft = get_global_context_value("form_type")
novec_ftags = ",".join(["f"]*len(a_matrices))
if ft == 'residual' or ft == 'jacobian_apply':
globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags)
assignee = prim.Subscript(prim.Variable(direct_output), output_inames)
else:
assert ft == 'jacobian'
globalarg(direct_output,
dtype=np.float64,
shape=output_shape + output_shape,
dim_tags = novec_ftags + "," + novec_ftags)
# TODO the next line should get its inames from
# elsewhere. This is *NOT* robust (but works right
# now)
_ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension()))
assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames)
# In case of vectorization we need to apply a horizontal add
if a_matrix.vectorized:
matprod = prim.Call(prim.Variable("horizontal_add"),
(matprod,))
# We need to accumulate
matprod = prim.Sum((assignee, matprod))
else:
assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname)
# Issue the reduction instruction that implements the multiplication # Issue the reduction instruction that implements the multiplication
# at the same time store the instruction ID for the next instruction to depend on # at the same time store the instruction ID for the next instruction to depend on
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment