From c23c9fcf32a3ffd484ef35ce63c5b86926601a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <rene.hess@iwr.uni-heidelberg.de> Date: Tue, 24 Jan 2017 13:06:58 +0100 Subject: [PATCH] Directly accumulate the output in stage 3 of sf Note: Right now this is only done for FastDGGridOperator. --- python/dune/perftool/sumfact/sumfact.py | 135 +++++++++++++++++------- 1 file changed, 97 insertions(+), 38 deletions(-) diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index d56e080b..65f26ece 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): name=inp, ) - # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron + # Those input fields, that are padded need to be set to zero + # in order to do a horizontal_add lateron for pad in padding: instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)), expression=0, @@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): depends_on=insn_dep, within_inames=frozenset(visitor.inames))}) - # Add a sum factorization kernel that implements the multiplication - # with the test function (stage 3) - pref_pos = i if accterm.argument.index else None - result, insn_dep = sum_factorization_kernel(a_matrices, - buf, - 3, - insn_dep=insn_dep, - additional_inames=frozenset(visitor.inames), - preferred_position=pref_pos, - restriction=(accterm.argument.restriction, restriction), - ) inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices)) @@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure) rank = 2 if visitor.inames else 1 if rank == 2: - # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now) + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right now) ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())), (basis_functions_per_direction(),) * dim, order="f" @@ -251,6 +242,32 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Construct the expression representing "{r,jac}.accumulate(..)" accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + + direct_output = None + if get_option('fastdg'): + ft = get_global_context_value("form_type") + accum = accum + ".data()" + if ft == 'residual' or ft == 'jacobian_apply': + direct_output = accum + else: + assert ft == 'jacobian' + direct_output = accum + + + # Add a sum factorization kernel that implements the multiplication + # with the test function (stage 3) + pref_pos = i if accterm.argument.index else None + result, insn_dep = sum_factorization_kernel(a_matrices, + buf, + 3, + insn_dep=insn_dep, + additional_inames=frozenset(visitor.inames), + preferred_position=pref_pos, + restriction=(accterm.argument.restriction, restriction), + direct_output=direct_output, + visitor=visitor + ) + # Determine the expression to accumulate with. This depends on the vectorization strategy! result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) vecinames = () @@ -271,30 +288,38 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): if get_option('fastdg'): ft = get_global_context_value("form_type") if ft == 'residual' or ft == 'jacobian_apply': - accum = accum + ".data()" - size = basis_functions_per_direction() ** world_dimension() - globalarg(accum, dtype=np.float64, shape=(size,), managed=False) - assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,)) - expression = prim.Sum((assignee, result)) - instruction(assignee=assignee, - expression=expression, - forced_iname_deps=frozenset(inames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - ) + if get_global_context_value("dry_run", False): + shape = (basis_functions_per_direction(),) * world_dimension() + ftags = ",".join(["f"]*len(shape)) + globalarg(accum, dtype=np.float64, shape=shape, dim_tags=ftags) + assignee = prim.Subscript(prim.Variable(accum), tuple(prim.Variable(i) for i in inames)) + + expression = prim.Sum((assignee, result)) + instruction(assignee=assignee, + expression=expression, + forced_iname_deps=frozenset(inames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + ) else: - assert ft == 'jacobian' - accum = accum + ".data()" - size = basis_functions_per_direction() ** world_dimension() - globalarg(accum, dtype=np.float64, shape=(size, size), managed=True) - assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index, ansatz_lfs.index)) - expression = prim.Sum((assignee, result)) - instruction(assignee=assignee, - expression=expression, - forced_iname_deps=frozenset(inames + visitor.inames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - ) + if get_global_context_value("dry_run", False): + assert ft == 'jacobian' + shape = (basis_functions_per_direction(),) * (world_dimension() * 2) + ftags = ",".join(["f"] * len(shape)) + globalarg(accum, dtype=np.float64, shape=shape, dim_tags=ftags) + _test_inames = tuple(prim.Variable(i) for i in inames) + # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now) + _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension())) + assignee = prim.Subscript(prim.Variable(accum), _ansatz_inames + _test_inames) + expression = prim.Sum((assignee, result)) + + + instruction(assignee=assignee, + expression=expression, + forced_iname_deps=frozenset(inames + visitor.inames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + ) # Default: Generate accumulation instructions else: expr = Call(PDELabAccumulationFunction(accum, rank), @@ -368,7 +393,7 @@ def _sf_flop_cost(a_matrices): def _sf_permutation_strategy(a_matrices, stage): - """Choose permutation of a_matices list based on computational cost + """Choose permutation of a_matrices list based on computational cost Note: If there are multiple permutations with the same cost a heuristic is used to pick one. @@ -418,6 +443,8 @@ def sum_factorization_kernel(a_matrices, outshape=None, restriction=0, direct_input=None, + direct_output=None, + visitor=None, ): """Create a sum factorization kernel @@ -481,6 +508,9 @@ def sum_factorization_kernel(a_matrices, direct_input: Global data structure containing input for sumfactorization (e.g. when using FastDGGridOperator). """ + # Return a pymbolic SumfactKernel node in the dry run. This will + # be used to decide on an appropriate vectorization strategy + # before we do the real thing. if get_global_context_value("dry_run", False): return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset() @@ -614,7 +644,36 @@ def sum_factorization_kernel(a_matrices, output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) if l == len(a_matrices)-1: output_inames = _permute_backward(output_inames, perm) - assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) + + # In case of direct output we directly accumulate the result + # of the Sumfactorization into some global data structure. + if l == len(a_matrices)-1 and direct_output is not None: + ft = get_global_context_value("form_type") + novec_ftags = ",".join(["f"]*len(a_matrices)) + if ft == 'residual' or ft == 'jacobian_apply': + globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags) + assignee = prim.Subscript(prim.Variable(direct_output), output_inames) + else: + assert ft == 'jacobian' + globalarg(direct_output, + dtype=np.float64, + shape=output_shape + output_shape, + dim_tags = novec_ftags + "," + novec_ftags) + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right + # now) + _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension())) + assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames) + + # In case of vectorization we need to apply a horizontal add + if a_matrix.vectorized: + matprod = prim.Call(prim.Variable("horizontal_add"), + (matprod,)) + + # We need to accumulate + matprod = prim.Sum((assignee, matprod)) + else: + assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on -- GitLab