diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index 8342a3435d879d53e1c633a2bb022c2d50d02fdb..41c0d690da9d8aa525871d77dc8672947795b5f8 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -172,123 +172,317 @@ def _reorder_loops_in_tensor_contraction_direct(kernel, iname_order):
     return kernel
 
 
-def _reorder_loops_in_tensor_contraction_accum(kernel, iname_order):
+def _get_inames_of_reduction(instr, iname_permutation):
+    # Check which loops are inside the reduction loop
+    reduction_index = iname_permutation[-1]
+    within_reduction = tuple(i > reduction_index for i in iname_permutation)
+
+    # Basenames of the inner inames
     dim = world_dimension()
-    assert dim == 3
+    inner_inames_base = []
+    for index, within in enumerate(within_reduction):
+        if within:
+            inner_inames_base.append('sf_out_inames_{}'.format(dim-1-index))
+
+    # Names of the inner, outer and vectorization inames
+    inner_inames = []
+    outer_inames = []
+    vec_inames = []
+    for i in [x.name for x in instr.assignee.index_tuple]:
+        for j in inner_inames_base:
+            if j in i:
+                inner_inames.append(i)
+        if 'vec' in i:
+            vec_inames.append(i)
+        elif i not in inner_inames:
+            outer_inames.append(i)
+
+    # Reduction iname
+    regex = re.compile('sf_red_([0-9]*)')
+    reduction_index = set(regex.findall(str(instr)))
+    if isinstance(instr.expression, lp.symbolic.Reduction):
+        assert len(reduction_index) == 1
+        reduction_index = reduction_index.pop()
+        reduction_iname = 'sf_red_{}'.format(reduction_index)
+    else:
+        assert len(reduction_index) == 0
+        reduction_iname = None
 
-    if iname_order.endswith('j'):
-        return kernel
+    return outer_inames, reduction_iname, inner_inames, vec_inames
 
-    # kernel = remove_all_reductions(kernel)
-    kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
 
-    cond = lp.match.Tagged('set_zero')
-    for instr in lp.find_instructions(kernel, cond):
-        assert len(instr.depends_on) == 0
+def _duplicate_assignment_inames(kernel, match):
+    instructions = lp.find_instructions(kernel, match)
+    for instr in instructions:
+        assert isinstance(instr, lp.kernel.instruction.Assignment)
 
-        # Depending on this instruction
+        # Dependencies
+        match = lp.match.Id(instr.id)
+        depends_on = instr.depends_on
         depending = []
         for i in kernel.instructions:
             if instr.id in i.depends_on:
                 depending.append(i.id)
-        assert len(depending) == 1
 
-        active_inames = [i.name.endswith('move_up') for i in instr.assignee.index]
-        from loopy.kernel.array import VectorArrayDimTag
-        agg = kernel.temporary_variables[instr.assignee.aggregate.name]
-        if isinstance(agg.dim_tags[-1], VectorArrayDimTag):
-            active_inames[-1] = True
-
-        # Instead of setting the variable itself to zero we set an accumulation
-        # variable to zero
+        # Remove instruction
         kernel = lp.remove_instructions(kernel, set([instr.id]))
-        accum_variable = get_counted_variable('accum_variable')
-        accum_init_inames = tuple(i for (i,j) in zip(instr.assignee.index, active_inames) if j)
-        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
-        accum_init_id = instr.id + '_accum_init'
-        accum_init_instr = lp.Assignment(assignee,
-                                         0,
-                                         within_inames=instr.within_inames,
-                                         id=accum_init_id,
-                                         tags=('accum_variable',)
-                                         )
-        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
+
+        def _duplicate_name(iname):
+            iname_appendix = '_duplicate'
+            return iname.name + iname_appendix
+
+        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
+
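+        # For a vectorized temporary the last dim tag is the SIMD lane; that
+        # index is kept as-is below and only the remaining inames are duplicated.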
+        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        if vectorized:
+            inames = instr.assignee.index_tuple[:-1]
+        else:
+            inames = instr.assignee.index_tuple
+
+        # Create new domains
+        domains = kernel.domains
+        new_domains = []
+        for iname in inames:
+            # Find the loop bound in the corresponding domain
+            for dom in domains:
+                if iname.name in dom.get_var_names(isl.dim_type.set):
+                    # TODO: There must be a better way to get this information using isl
+                    str_begin = str(dom).find(iname.name + ' =') + len(iname.name) + 3
+                    if str_begin == len(iname.name) + 3 - 1:
+                        str_begin = str(dom).find(iname.name + ' <=') + len(iname.name) + 4
+                    str_end = str_begin + str(dom)[str_begin:].find(' ')
+                    loop_bound = int(str(dom)[str_begin:str_end]) + 1
+                    break
+
+            # Create new domain
+            domain = "{{ [{0}] : 0<={0}<{1} }}".format(_duplicate_name(iname), loop_bound)
+            domain = lp.kernel.creation.parse_domains(domain, {})
+            new_domains.append(domain)
+        for domain in new_domains:
+            domains = domains + domain
+
+        # Create new inames
+        new_inames = tuple(prim.Variable(_duplicate_name(i)) for i in inames)
+        if vectorized:
+            new_inames = new_inames + (instr.assignee.index_tuple[-1],)
+
+        # Create new instruction within the new inames
+        assignee = prim.Subscript(instr.assignee.aggregate, new_inames)
+        new_instruction = instr.copy(assignee=assignee,
+                                     depends_on=depends_on,
+                                     within_inames=frozenset([i.name for i in new_inames]))
+        kernel = kernel.copy(instructions=kernel.instructions + [new_instruction],
+                             domains=domains)
 
         # Restore dependencies
         for dep in depending:
             match = lp.match.Id(dep)
-            kernel = lp.add_dependency(kernel, match, accum_init_id)
+            kernel = lp.add_dependency(kernel, match, new_instruction.id)
+
+    return kernel
+
+
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
+    dim = world_dimension()
+
+    # Nothing to do if the permutation is the identity
+    if iname_permutation == tuple(range(dim+1)):
+        return kernel
 
-        # Make accumulation variable a temporary_variable of this kernel
-        #
-        # Create dim tags for accum variable
-        dim_tags = ','.join(['f'] * sum(active_inames))
-        if isinstance(agg.dim_tags[-1], VectorArrayDimTag):
-            dim_tags = ','.join(['f'] * (sum(active_inames)-1)) + ",vec"
+    # Use the iname names of the sum factorization kernel (without the index
+    # that distinguishes the different directions)
+    default_iname_order = ['sf_out_inames_{}'.format(dim-1-i) for i in range(dim)] + ['sf_red']
+    from dune.codegen.sumfact.permutation import permute_backward
+    new_iname_order = permute_backward(default_iname_order, iname_permutation)
+
+    # Get the real names with direction indices in the right order
+    def _current_new_iname_order(outer, reduction, inner, new_iname_order):
+        if reduction:
+            reduction = [reduction]
+        else:
+            reduction = []
+        all_inames = outer + reduction + inner
+        current_iname_order = []
+        for i in new_iname_order:
+            for j in all_inames:
+                if i in j:
+                    current_iname_order.append(j)
+        return current_iname_order
+
+    for instr in kernel.instructions:
+        # We can directly use lp.prioritize_loops if:
+        # - The reduction is the innermost loop
+        # - There is no reduction (e.g. a reduced direction on faces)
+        if iname_permutation[-1] == dim or not isinstance(instr.expression, lp.symbolic.Reduction):
+            # Inames used in this instruction
+            outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
+                                                                                               iname_permutation)
+            current_iname_order = _current_new_iname_order(outer_inames,
+                                                           reduction_iname,
+                                                           inner_inames,
+                                                           new_iname_order)
+            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+            continue
 
-        # Create shape for accum variable
-        shape = tuple(i for (i,j) in zip(agg.shape, active_inames) if j)
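+        # Here the reduction loop needs to move further out than a loopy
+        # reduction allows, so the reduction is replaced by an explicit
+        # accumulation loop nest below.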
+        assert isinstance(instr.expression.operation, lp.library.reduction.SumReductionOperation)
+
+        # Dependencies. Note: We change the instructions below so we cannot do
+        # depends_on=instr.depends_on. But we know that the id of instr is
+        # still valid so we can use that!
+        match = lp.match.Id(instr.id)
+        depends_on = lp.find_instructions(kernel, match)[0].depends_on
+        depending = []
+        for i in kernel.instructions:
+            if instr.id in i.depends_on:
+                depending.append(i.id)
+
+        # Remove reduction
+        kernel = lp.remove_instructions(kernel, set([instr.id]))
+
+        # Inames used in this reduction
+        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr, iname_permutation)
+
+        # Create dim_tags
+        dim_tags = ','.join(['f'] * len(inner_inames))
+        agg_variable = kernel.temporary_variables[instr.assignee.aggregate.name]
+        vectorized = isinstance(agg_variable.dim_tags[-1], lp.kernel.array.VectorArrayDimTag)
+        if vectorized:
+            assert len(vec_inames) == 1
+            dim_tags = dim_tags + ',vec'
+
+        # Create shape
+        agg_inames = tuple(i.name for i in instr.assignee.index_tuple)
+        agg_shape = agg_variable.shape
+        shape = tuple(agg_shape[agg_inames.index(i)] for i in inner_inames)
+        if vectorized:
+            shape = shape + (agg_shape[-1],)
+
+        # Update temporary_variables of this kernel
         from dune.codegen.loopy.temporary import DuneTemporaryVariable
+        accum_variable = get_counted_variable('accum_variable')
         var = {accum_variable: DuneTemporaryVariable(accum_variable,
-                                                     dtype=agg.dtype,
+                                                     dtype=agg_variable.dtype,
                                                      shape=shape,
                                                      dim_tags=dim_tags,
                                                      managed=True)}
         kernel.temporary_variables.update(var)
 
-        # Accumulate in accumulate variable
-        #
-        # Find accumulation instruction
-        accum_instr = lp.find_instructions(kernel, lp.match.Id(depending[0]))[0]
-        assert accum_instr.assignee == accum_instr.expression.children[0]
-
-        # Dependencies
-        depends_on = accum_instr.depends_on
-        depending = []
-        for i in kernel.instructions:
-            if accum_instr.id in i.depends_on:
-                depending.append(i.id)
+        # Set accumulation variable to zero
+        accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
+        if vectorized:
+            accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        accum_init_id = instr.id + '_accum_init'
+        accum_init_instr = lp.Assignment(assignee,
+                                         0,
+                                         within_inames=instr.within_inames,
+                                         id=accum_init_id,
+                                         depends_on=depends_on,
+                                         tags=('accum_init',),
+                                         )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr,])
 
-        # Replace with accumulation in accum_variable
-        kernel = lp.remove_instructions(kernel, set([accum_instr.id]))
-        accum_inames = tuple(i for (i,j) in zip(accum_instr.assignee.index, active_inames) if j)
-        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_inames)
-        expression = prim.Sum((assignee, accum_instr.expression.children[1]))
-        accum_id = accum_instr.id + '_accumvar'
-        new_accum_instr = lp.Assignment(assignee,
-                                        expression,
-                                        within_inames=accum_instr.within_inames,
-                                        id=accum_id,
-                                        depends_on=depends_on,
-                                        )
-        kernel = kernel.copy(instructions=kernel.instructions + [new_accum_instr,])
+        # Accumulate in temporary variable
+        assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
+        expression = prim.Sum((assignee, instr.expression.expr))
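+        # The accumulation instruction runs within the union of the assignment
+        # inames and the former reduction inames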
+        within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
+        accum_id = instr.id + '_accum'
+        accum_instr = lp.Assignment(assignee,
+                                    expression,
+                                    within_inames=within_inames,
+                                    id=accum_id,
+                                    depends_on=frozenset([accum_init_id,]),
+                                    tags=('accum',),
+                                    )
+        kernel = kernel.copy(instructions=kernel.instructions + [accum_instr,])
 
         # Assign accumulation result
-        #
-        # The reduction is already done
-        within_inames = frozenset(i for i in accum_instr.within_inames if 'red' not in i)
-        assign_id = accum_instr.id + '_assign'
-        assign_instr = lp.Assignment(accum_instr.assignee,
+        assign_id = instr.id + '_assign'
+        assign_instr = lp.Assignment(instr.assignee,
                                      assignee,
                                      within_inames=within_inames,
                                      id=assign_id,
                                      depends_on=frozenset([accum_id,]),
+                                     tags=('accum_assign',),
                                      )
         kernel = kernel.copy(instructions=kernel.instructions + [assign_instr,])
 
+        # Restore dependencies
         for dep in depending:
             match = lp.match.Id(dep)
             kernel = lp.add_dependency(kernel, match, assign_id)
 
+        # Reordering the loops only works if we duplicate some inames
+        duplicate_inames = tuple(i.name for i in accum_init_inames)
+        if vectorized:
+            duplicate_inames = duplicate_inames[:-1]
+        match = lp.match.Id(accum_init_id)
+        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+        match = lp.match.Id(assign_id)
+        kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
+
+        # Change loop order
+        current_iname_order = _current_new_iname_order(outer_inames,
+                                                       reduction_iname,
+                                                       inner_inames,
+                                                       new_iname_order)
+        kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
+
     return kernel
 
 
-def reorder_loops_in_tensor_contraction(kernel, iname_order, accum_variable=True):
+def reorder_loops_in_tensor_contraction(kernel, iname_permutation, accum_variable=True):
+    """Change the loop order in the tensor contractions of a sum factorization kernel
+
+    Since a reduction is involved, this implies more than just reordering
+    loops. There are two possibilities to handle the reduction:
+
+    1) Generate an accumulation variable of the correct size and assign it to
+       the result data structure afterwards
+
+    2) Make sure to set the correct entries of the result data structure to
+       zero and accumulate in place
+
+    Parameters
+    ----------
+    kernel: loopy.kernel.LoopKernel
+    iname_permutation: tuple of ints
+        How to permute the loop inames. Should be a permutation of the numbers
+        0,...,dim (one entry per output loop plus one for the reduction)
+    accum_variable: bool
+        If True, use method 1; otherwise use method 2 from above
+
+    Notes
+    -----
+    Using numpy's einsum notation, a contraction of a sum factorization
+    kernel can be written as:
+
+    - 3D: 'ij,jkl->kli'
+    - 2D: 'ij,jk->ki'
+
+    In the sum factorization kernel itself those inames are called:
+
+    sf_out_inames_2_* : l
+    sf_out_inames_1_* : k
+    sf_out_inames_0_* : i
+    sf_red_* : j
+
+    where * represents the current direction (0,1,2 for 3D problems). The
+    default order of the loops is
+
+    (sf_out_inames_2_*, sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*)
+
+    A permutation vector of (3,2,0,1) would mean that we do
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*, sf_out_inames_2_*)
+
+    instead.
+
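+    Analogously, in 2D the permutation (2,0,1) (used in the examples in
+    sumfact_performance_transformations below) turns the default order
+    (sf_out_inames_1_*, sf_out_inames_0_*, sf_red_*) into
+
+    (sf_out_inames_0_*, sf_red_*, sf_out_inames_1_*)
+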
+    """
     if accum_variable:
-        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_order)
+        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation)
     else:
-        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
+        # Need to adapt this!
+        assert False
+        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation)
 
 
 def tensor_contraction_loop_order_generator(kernel):
@@ -336,7 +530,13 @@ def autotune_tensor_contraction_loop_order(kernel, signature):
 
 def sumfact_performance_transformations(kernel, signature):
     if kernel.name.startswith('sfimpl'):
-        # kernel = reorder_loops_in_tensor_contraction_with_accumulation_variable(kernel, "ljik")
+        # 2D
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (2,0,1), True)
+
+        # 3D
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (3,2,0,1), True)
+        # kernel = reorder_loops_in_tensor_contraction(kernel, (1,2,0,3), True)
+
+        # kernel = autotune_tensor_contraction_loop_order(kernel, signature)
         pass