diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index d56e080b5856367170f882b2df43fbea7e27f738..408302b22a0bf38fec226c1b80d81b8df78089ed 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -167,7 +167,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): name=inp, ) - # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron + # Those input fields, that are padded need to be set to zero + # in order to do a horizontal_add lateron for pad in padding: instruction(assignee=prim.Subscript(prim.Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + (pad,)), expression=0, @@ -217,17 +218,6 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): depends_on=insn_dep, within_inames=frozenset(visitor.inames))}) - # Add a sum factorization kernel that implements the multiplication - # with the test function (stage 3) - pref_pos = i if accterm.argument.index else None - result, insn_dep = sum_factorization_kernel(a_matrices, - buf, - 3, - insn_dep=insn_dep, - additional_inames=frozenset(visitor.inames), - preferred_position=pref_pos, - restriction=(accterm.argument.restriction, restriction), - ) inames = tuple(accum_iname((accterm.argument.restriction, restriction), mat.rows, i) for i, mat in enumerate(a_matrices)) @@ -242,7 +232,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ansatz_lfs = determine_accumulation_space(accterm.term, 1, measure) rank = 2 if visitor.inames else 1 if rank == 2: - # TODO the next line should get its inames from elsewhere. This is *NOT* robust (but works right now) + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right now) ansatz_lfs.index = flatten_index(tuple(Variable(visitor.inames[i]) for i in range(world_dimension())), (basis_functions_per_direction(),) * dim, order="f" @@ -251,6 +242,34 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Construct the expression representing "{r,jac}.accumulate(..)" accum = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + # We can directly accumulate the solution in the last step of + # the sumfactorization. Currently this is only implemented for + # FastDGGridOperator. + direct_output = None + if get_option('fastdg'): + ft = get_global_context_value("form_type") + accum = accum + ".data()" + if ft == 'residual' or ft == 'jacobian_apply': + direct_output = accum + else: + assert ft == 'jacobian' + direct_output = accum + + + # Add a sum factorization kernel that implements the multiplication + # with the test function (stage 3) + pref_pos = i if accterm.argument.index else None + result, insn_dep = sum_factorization_kernel(a_matrices, + buf, + 3, + insn_dep=insn_dep, + additional_inames=frozenset(visitor.inames), + preferred_position=pref_pos, + restriction=(accterm.argument.restriction, restriction), + direct_output=direct_output, + visitor=visitor + ) + # Determine the expression to accumulate with. This depends on the vectorization strategy! result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames)) vecinames = () @@ -263,38 +282,22 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): (maybe_wrap_subscript(result, prim.Variable(iname)),), ) - # In the case of FastDGGridOperator we can write directly into the resiudal/jacobi - # - # TODO: At the moment this only works if we do not vectorize - # (over gradients) because loopy tries to acces a vectorclass - # variable. + # In the fastdg case we accumulate directly in the last step of the SF! if get_option('fastdg'): - ft = get_global_context_value("form_type") - if ft == 'residual' or ft == 'jacobian_apply': - accum = accum + ".data()" - size = basis_functions_per_direction() ** world_dimension() - globalarg(accum, dtype=np.float64, shape=(size,), managed=False) - assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,)) - expression = prim.Sum((assignee, result)) - instruction(assignee=assignee, - expression=expression, + # In the dry run we need an instruction with a + # sumfactorization node. We use this to decide on a + # vectorization strategy. This is just a dummy + # instruction, in the real run the accumulation is done + # directly in the sumfactorization. + if get_global_context_value("dry_run", False): + dummy = "dummy" + globalarg(dummy, dtype=np.float64) + instruction(assignee = prim.Variable(dummy), + expression = result, forced_iname_deps=frozenset(inames), forced_iname_deps_is_final=True, depends_on=insn_dep, ) - else: - assert ft == 'jacobian' - accum = accum + ".data()" - size = basis_functions_per_direction() ** world_dimension() - globalarg(accum, dtype=np.float64, shape=(size, size), managed=True) - assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index, ansatz_lfs.index)) - expression = prim.Sum((assignee, result)) - instruction(assignee=assignee, - expression=expression, - forced_iname_deps=frozenset(inames + visitor.inames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - ) # Default: Generate accumulation instructions else: expr = Call(PDELabAccumulationFunction(accum, rank), @@ -368,7 +371,7 @@ def _sf_flop_cost(a_matrices): def _sf_permutation_strategy(a_matrices, stage): - """Choose permutation of a_matices list based on computational cost + """Choose permutation of a_matrices list based on computational cost Note: If there are multiple permutations with the same cost a heuristic is used to pick one. @@ -418,6 +421,8 @@ def sum_factorization_kernel(a_matrices, outshape=None, restriction=0, direct_input=None, + direct_output=None, + visitor=None, ): """Create a sum factorization kernel @@ -481,6 +486,9 @@ def sum_factorization_kernel(a_matrices, direct_input: Global data structure containing input for sumfactorization (e.g. when using FastDGGridOperator). """ + # Return a pymbolic SumfactKernel node in the dry run. This will + # be used to decide on an appropriate vectorization strategy + # before we do the real thing. if get_global_context_value("dry_run", False): return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset() @@ -614,7 +622,36 @@ def sum_factorization_kernel(a_matrices, output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) if l == len(a_matrices)-1: output_inames = _permute_backward(output_inames, perm) - assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) + + # In case of direct output we directly accumulate the result + # of the Sumfactorization into some global data structure. + if l == len(a_matrices)-1 and direct_output is not None: + ft = get_global_context_value("form_type") + novec_ftags = ",".join(["f"]*len(a_matrices)) + if ft == 'residual' or ft == 'jacobian_apply': + globalarg(direct_output, dtype=np.float64, shape=output_shape, dim_tags=novec_ftags) + assignee = prim.Subscript(prim.Variable(direct_output), output_inames) + else: + assert ft == 'jacobian' + globalarg(direct_output, + dtype=np.float64, + shape=output_shape + output_shape, + dim_tags = novec_ftags + "," + novec_ftags) + # TODO the next line should get its inames from + # elsewhere. This is *NOT* robust (but works right + # now) + _ansatz_inames = tuple(Variable(visitor.inames[i]) for i in range(world_dimension())) + assignee = prim.Subscript(prim.Variable(direct_output), _ansatz_inames + output_inames) + + # In case of vectorization we need to apply a horizontal add + if a_matrix.vectorized: + matprod = prim.Call(prim.Variable("horizontal_add"), + (matprod,)) + + # We need to accumulate + matprod = prim.Sum((assignee, matprod)) + else: + assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on