Skip to content
Snippets Groups Projects
Commit 184da59e authored by René Heß's avatar René Heß
Browse files

[skip ci] Rewrite loop reordering

This rewrite was necessary for the cases where the instruction contains a
reduction but the expression itself is not a reduction. Besides fixing this bug
it greatly reduces code duplication and increases code quality.
parent ce3fc8ac
No related branches found
No related tags found
No related merge requests found
......@@ -63,195 +63,53 @@ def _get_inames_of_reduction(instr, iname_permutation):
return outer_inames, reduction_iname, inner_inames, vec_inames
def _get_iname_bound(kernel, iname):
    """Return the (exclusive) upper bound of ``iname``, i.e. its trip count for a zero-based domain."""
    # TODO: Not sure if that works in all cases
    leaf_index, = kernel.get_leaf_domain_indices((iname,))
    # dim_max(0) yields the maximum value of the first domain dimension as a pw_aff
    max_pwaff = kernel.domains[leaf_index].dim_max(0)
    upper = lp.symbolic.pw_aff_to_expr(max_pwaff)
    return upper + 1
class FindReductionMapper(lp.symbolic.WalkMapper):
    """Expression walker that records every Reduction node it encounters."""

    def __init__(self):
        # Reductions found so far, in traversal order
        self.reductions = []

    def map_reduction(self, expr, **kwargs):
        """Remember *expr* and continue walking into its body."""
        self.reductions.append(expr)
        return super().map_reduction(expr, **kwargs)
def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
"""Reorder the loop nests of a tensor contraction accumulating directly in the data structure"""
dim = world_dimension()
# Nothing to do if permutation is identity
if iname_permutation == tuple(range(dim + 1)):
return kernel
def _reorder_reduction_loops(kernel, match, iname_order):
"""Reorder loops in instructions containing one reduction
# Use names used in sum factorization kernel (without the index that distinguishes the different directions)
default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
from dune.codegen.sumfact.permutation import permute_backward
new_iname_order = permute_backward(default_iname_order, iname_permutation)
In order to reorder the loops we need to manually do the accumulation in a
temporary that is large enough to fit all the data created in loops inside
the reduction loop.
for instr in kernel.instructions:
# Inames used in this reduction
outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
iname_permutation)
if reduction_iname:
current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
else:
current_inames = outer_inames + inner_inames + vec_inames
current_iname_order = _current_iname_order(current_inames,
new_iname_order)
if iname_permutation[-1] == dim:
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
Parameters
----------
kernel: loopy.kernel.LoopKernel
match:
iname_order: tuple of str
preferred loop order for this instruction
"""
instructions = lp.find_instructions(kernel, match)
for instr in instructions:
# Get reduction
reduction = FindReductionMapper()
reduction(instr.expression)
assert len(reduction.reductions) == 1
reduction = reduction.reductions[0]
# Reduction iname, inner inames and vector inames
assert len(reduction.inames) == 1
reduction_iname = reduction.inames[0]
assert set([reduction_iname]) == instr.reduction_inames()
if reduction_iname not in iname_order:
lp.prioritize_loops(kernel, iname_order)
continue
if isinstance(instr.expression, lp.symbolic.Reduction):
if isinstance(instr.assignee, prim.Subscript):
assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
match = lp.match.Id(instr.id)
kernel = remove_reduction(kernel, match)
lp.prioritize_loops(kernel, current_iname_order)
duplicate_inames = tuple(inner_inames)
match = lp.match.Id(instr.id + '_set_zero')
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
else:
# Dependencies
match = lp.match.Id(instr.id)
depends_on = lp.find_instructions(kernel, match)[0].depends_on
depending = []
for i in kernel.instructions:
if instr.id in i.depends_on:
depending.append(i.id)
# Remove reduction
kernel = lp.remove_instructions(kernel, set([instr.id]))
# Create dim_tags
dim_tags = ','.join(['f'] * len(inner_inames))
vectorized = len(vec_inames) > 0
if vectorized:
assert len(vec_inames) == 1
dim_tags = dim_tags + ',vec'
# Create shape
shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
# Update temporary_variables of this kernel
from dune.codegen.loopy.temporary import DuneTemporaryVariable
accum_variable = get_counted_variable('accum_variable')
from dune.codegen.loopy.target import dtype_floatingpoint
dtype = lp.types.NumpyType(dtype_floatingpoint())
var = {accum_variable: DuneTemporaryVariable(accum_variable,
dtype=dtype,
shape=shape,
dim_tags=dim_tags,
managed=True)}
tv = kernel.temporary_variables.copy()
tv.update(var)
del tv[instr.assignee.name]
kernel = kernel.copy(temporary_variables=tv)
# Set accumulation variable to zero
accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
if vectorized:
accum_init_inames = accum_init_inames + (prim.Variable(vec_inames[0]),)
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
accum_init_id = instr.id + '_accum_init'
accum_init_instr = lp.Assignment(assignee,
0,
within_inames=instr.within_inames,
id=accum_init_id,
depends_on=depends_on,
tags=('accum_init',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
# Accumulate in temporary variable
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
expression = prim.Sum((assignee, instr.expression.expr))
within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
accum_id = instr.id + '_accum'
accum_instr = lp.Assignment(assignee,
expression,
within_inames=within_inames,
id=accum_id,
depends_on=frozenset([accum_init_id]),
tags=('accum',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
# Duplicate inames and reorder
duplicate_inames = tuple(inner_inames)
for idx in [accum_init_id, accum_id]:
match = lp.match.Id(idx)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = lp.find_instructions(kernel, match)[0].within_inames
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Restore dependencies
for dep in depending:
match = lp.match.Id(dep)
kernel = lp.add_dependency(kernel, match, accum_id)
match = lp.match.Tagged('sumfact_stage3')
assign_instr, = lp.find_instructions(kernel, match)
from dune.codegen.loopy.symbolic import substitute
subst = {instr.assignee.name: assignee}
new_assign_instr = assign_instr.copy(expression=substitute(assign_instr.expression, subst),
id=assign_instr.id + '_mod')
kernel = kernel.copy(instructions=kernel.instructions + [new_assign_instr])
kernel = lp.remove_instructions(kernel, set([assign_instr.id]))
else:
current_iname_order = _current_iname_order(instr.within_inames, new_iname_order)
lp.prioritize_loops(kernel, current_iname_order)
return kernel
def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
"""Reorder the loop nests of a tensor contraction using an accumulation variable"""
dim = world_dimension()
# Nothing to do if permutation is identity
if iname_permutation == tuple(range(dim + 1)):
return kernel
# Use names used in sum factorization kernel (without the index that distinguishes the different directions)
default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
from dune.codegen.sumfact.permutation import permute_backward
new_iname_order = permute_backward(default_iname_order, iname_permutation)
for instr in kernel.instructions:
# Inames used in this reduction
outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
iname_permutation)
if reduction_iname:
current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
else:
current_inames = outer_inames + inner_inames + vec_inames
# We can directly use lp.prioritize_loops if:
# - The reduction is the innermost loop
# - There is no reduction (eg reduced direction on faces)
if iname_permutation[-1] == dim or reduction_iname is None:
current_iname_order = _current_iname_order(current_inames,
new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
inner_inames = iname_order[iname_order.index(reduction_iname)+1:]
if len(inner_inames) == 0:
lp.prioritize_loops(kernel, iname_order)
continue
assert isinstance(instr.expression, lp.symbolic.Reduction)
# Dependencies. Note: We change the instructions below so we cannot do
# depends_on=instr.depends_on. But we know that the id of instr is
# still valid so we can use that!
match = lp.match.Id(instr.id)
depends_on = lp.find_instructions(kernel, match)[0].depends_on
depending = []
for i in kernel.instructions:
if instr.id in i.depends_on:
depending.append(i.id)
# TODO: There should be a better way to do that
vec_inames = [i for i in instr.within_inames if 'sf_vec' in i]
assert len(vec_inames) < 2
# Remove reduction
kernel = lp.remove_instructions(kernel, set([instr.id]))
# {{{ Create new temporary variable
# Create dim_tags
dim_tags = ','.join(['f'] * len(inner_inames))
......@@ -261,7 +119,7 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
dim_tags = dim_tags + ',vec'
# Create shape
shape = tuple(_get_iname_bound(kernel, i) for i in inner_inames + vec_inames)
shape = tuple(kernel.get_constant_iname_length(i) for i in inner_inames + vec_inames)
# Update temporary_variables of this kernel
from dune.codegen.loopy.temporary import DuneTemporaryVariable
......@@ -277,6 +135,8 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
tv.update(var)
kernel = kernel.copy(temporary_variables=tv)
# }}}
# Set accumulation variable to zero
accum_init_inames = tuple(prim.Variable(i) for i in inner_inames)
if vectorized:
......@@ -287,15 +147,15 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
0,
within_inames=instr.within_inames,
id=accum_init_id,
depends_on=depends_on,
depends_on=instr.depends_on,
tags=('accum_init',),
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_init_instr])
# Accumulate in temporary variable
assignee = prim.Subscript(prim.Variable(accum_variable,), accum_init_inames)
expression = prim.Sum((assignee, instr.expression.expr))
within_inames = frozenset(tuple(instr.within_inames) + instr.expression.inames)
expression = prim.Sum((assignee, reduction.expr))
within_inames = frozenset(tuple(instr.within_inames) + reduction.inames)
accum_id = instr.id + '_accum'
accum_instr = lp.Assignment(assignee,
expression,
......@@ -306,65 +166,125 @@ def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
)
kernel = kernel.copy(instructions=kernel.instructions + [accum_instr])
if 'haddsubst' not in str(instr):
# Write back the result if this reduction does not come from a substitution
assign_id = instr.id + '_assign'
assign_instr = lp.Assignment(instr.assignee,
assignee,
within_inames=within_inames,
id=assign_id,
depends_on=frozenset([accum_id]),
tags=('accum_assign',),
)
kernel = kernel.copy(instructions=kernel.instructions + [assign_instr])
# Restore dependencies
for dep in depending:
match = lp.match.Id(dep)
# Replace reduction in insn with accumulation result
def map_reduction(expr, rec):
return assignee
mapper = lp.symbolic.ReductionCallbackMapper(map_reduction)
new_expression = mapper(instr.expression)
assign_id = instr.id + '_assign'
new_instr = instr.copy(expression=new_expression,
id=assign_id,
depends_on=frozenset(list(instr.depends_on) + [accum_id]))
kernel = kernel.copy(instructions=kernel.instructions + [new_instr])
# Fix dependencies and remove old instruction
for i in kernel.instructions:
if instr.id in i.depends_on:
match = lp.match.Id(i.id)
kernel = lp.add_dependency(kernel, match, assign_id)
else:
# If this reduction comes from a substitution take the
# corresponding assignment and replace the substitution with the
# accumulation result
assert len(depending) == 1
assignment, = [i for i in kernel.instructions if depending[0] == i.id]
other = [i for i in kernel.instructions if i.id != assignment.id]
replace = {prim.Variable(instr.assignee.name): assignee}
from dune.codegen.loopy.symbolic import substitute
new_expression = substitute(assignment.expression, replace)
new_instr = assignment.copy(expression=new_expression,
depends_on=frozenset(list(assignment.depends_on) + [accum_id]))
kernel = kernel.copy(instructions=other + [new_instr])
# Remove the temporary variable that belongs to the
# substitution. It is no longer used.
tv = kernel.temporary_variables.copy()
del tv[instr.assignee.name]
kernel = kernel.copy(temporary_variables=tv)
# Reordering loops only works if we duplicate some inames
kernel = lp.remove_instructions(kernel, set([instr.id]))
# Duplicate and reorder loops for accumulation
ids = [accum_init_id, accum_id]
duplicate_inames = tuple(inner_inames)
for idx in ids:
match = lp.match.Id(idx)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
match = lp.match.Id(accum_init_id)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Reorder assignment loops
kernel = lp.prioritize_loops(kernel, iname_order)
return kernel
# Reorder loops of the assignment of the result
if 'haddsubst' not in str(instr):
match = lp.match.Id(assign_id)
else:
match = lp.match.Id(assignment.id)
kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
current_iname_order = _current_iname_order(match_inames, new_iname_order)
kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
# Change loop order
def _reorder_loops_in_tensor_contraction_direct(kernel, iname_permutation):
    """Reorder the loop nests of a tensor contraction accumulating directly in the data structure

    Parameters
    ----------
    kernel: loopy.kernel.LoopKernel
    iname_permutation: tuple of int
        Requested permutation of the loop order (last entry refers to the
        reduction loop).

    Returns
    -------
    The transformed loopy kernel.
    """
    dim = world_dimension()

    # Nothing to do if permutation is identity
    if iname_permutation == tuple(range(dim + 1)):
        return kernel

    # Use names used in sum factorization kernel (without the index that
    # distinguishes the different directions)
    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
    from dune.codegen.sumfact.permutation import permute_backward
    new_iname_order = permute_backward(default_iname_order, iname_permutation)

    for instr in kernel.instructions:
        # Inames used in this reduction
        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
                                                                                           iname_permutation)
        if reduction_iname:
            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
        else:
            current_inames = outer_inames + inner_inames + vec_inames
        current_iname_order = _current_iname_order(current_inames, new_iname_order)

        # We can directly use lp.prioritize_loops if
        # - The reduction is the innermost loop
        # - There is no reduction (eg reduced direction on faces)
        if iname_permutation[-1] == dim or reduction_iname is None:
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
            continue

        if isinstance(instr.expression, lp.symbolic.Reduction):
            # If the instruction is a reduction we can directly accumulate in the assignee
            assert set(inner_inames).issubset(set(i.name for i in instr.assignee.index_tuple))
            match = lp.match.Id(instr.id)
            kernel = remove_reduction(kernel, match)

            # FIX: loopy transformations return a new kernel; the previous code
            # discarded the result of these prioritize_loops calls, making them no-ops.
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))

            # The set-zero instruction created by remove_reduction needs its own
            # (duplicated) inames before it can get a loop priority of its own
            duplicate_inames = tuple(inner_inames)
            match = lp.match.Id(instr.id + '_set_zero')
            kernel = lp.duplicate_inames(kernel, duplicate_inames, match)
            match_inames = tuple(lp.find_instructions(kernel, match)[0].within_inames)
            set_zero_iname_order = _current_iname_order(match_inames, new_iname_order)
            kernel = lp.prioritize_loops(kernel, tuple(set_zero_iname_order))
        else:
            # In stage 3 this is usually not a reduction. In this case direct
            # accumulation is not possible and we accumulate in a temporary
            # variable instead
            match = lp.match.Id(instr.id)
            kernel = _reorder_reduction_loops(kernel, match, current_iname_order)
    return kernel
def _reorder_loops_in_tensor_contraction_accum(kernel, iname_permutation):
    """Reorder the loop nests of a tensor contraction using an accumulation variable"""
    dim = world_dimension()

    # Identity permutation: loops are already in the requested order
    if iname_permutation == tuple(range(dim + 1)):
        return kernel

    # Loop names used by the sum factorization kernel (the index that
    # distinguishes the different directions is stripped)
    from dune.codegen.sumfact.permutation import permute_backward
    default_iname_order = ['sf_out_inames_{}'.format(dim - 1 - i) for i in range(dim)] + ['sf_red']
    new_iname_order = permute_backward(default_iname_order, iname_permutation)

    for instr in kernel.instructions:
        # Classify the inames appearing in this instruction
        outer_inames, reduction_iname, inner_inames, vec_inames = _get_inames_of_reduction(instr,
                                                                                           iname_permutation)
        if reduction_iname:
            current_inames = outer_inames + [reduction_iname] + inner_inames + vec_inames
        else:
            current_inames = outer_inames + inner_inames + vec_inames
        current_iname_order = _current_iname_order(current_inames, new_iname_order)

        # lp.prioritize_loops suffices when the reduction is the innermost
        # loop or when there is no reduction at all (eg reduced direction on
        # faces); otherwise the reduction loops must be rewritten.
        if iname_permutation[-1] == dim or reduction_iname is None:
            kernel = lp.prioritize_loops(kernel, tuple(current_iname_order))
        else:
            kernel = _reorder_reduction_loops(kernel, lp.match.Id(instr.id), current_iname_order)
    return kernel
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment