diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index ab5ce9e1500baf2070cc0b8ad69d30ede8ec713e..ea22fd5d03950c65f2655a99ede0bce491ca5678 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -1,4 +1,5 @@ import copy +import itertools from dune.perftool.loopy.symbolic import substitute from dune.perftool.pdelab.argument import (name_accumulation_variable, @@ -328,6 +329,85 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): insn_dep = emit_sumfact_kernel(None, restriction, insn_dep) +def _sf_permutation_heuristic(permutations, stage): + """Heuristic to choose a permutation + + - Stage 1: Pick the permutation where in permutations[1:] most + elements are ordered by size + - Stage 3: Pick the permutation where in permutations[:-1] most + elements are ordered by size + """ + def cost(perm, stage): + cost = 0 + for i in range(0,len(perm)-2): + if stage==1: + if perm[i+1]>perm[i+2]: + cost += 1 + if stage==3: + if perm[0]>perm[i+1]: + cost += 1 + return cost + + perm = min(permutations, key=lambda i:cost(i,stage)) + return perm + + +def _sf_flop_cost(a_matrices): + """Computational cost of sumfactorization with this list of a_matrices + """ + cost = 0; + for l in range(len(a_matrices)): + cost_m = 1 + cost_n = 1 + for i in range(l+1): + cost_m *= a_matrices[i].rows + for i in range(l,len(a_matrices)): + cost_n *= a_matrices[i].cols + cost += cost_m * cost_n + return cost + + +def _sf_permutation_strategy(a_matrices, stage): + """Choose permutation of a_matices list based on computational cost + + Note: If there are multiple permutations with the same cost a + heuristic is used to pick one. + """ + # Combine permutation and a_matrices list + perm = [i for i, _ in enumerate(a_matrices)] + perm_a_matrices = zip(perm, a_matrices) + + # Find cost for all possible permutations of a_matrices list + perm_cost = [] + for permutation in itertools.permutations(perm_a_matrices): + perm, series = zip(*permutation) + cost = _sf_flop_cost(series) + perm_cost.append((perm,cost)) + + # Find minimal cost and all permutations with that cost + _, costs = zip(*perm_cost) + minimal_cost = min(costs) + minimal_cost_permutations = [p[0] for p in perm_cost if p[1]==minimal_cost] + + # Use heuristic to pic one of the minimal cost permutations + perm = _sf_permutation_heuristic(minimal_cost_permutations, stage) + return perm + + +def _permute_forward(t, perm): + tmp = [] + for pos in perm: + tmp.append(t[pos]) + return tuple(tmp) + + +def _permute_backward(t, perm): + tmp = [None]*len(t) + for i, pos in enumerate(perm): + tmp[pos] = t[i] + return tuple(tmp) + + @generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0))) def sum_factorization_kernel(a_matrices, buf, @@ -364,13 +444,22 @@ def sum_factorization_kernel(a_matrices, A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}] - So the multiplication with A_l is reduction over one index and the - transformation brings the next reduction index in the fastest + So the multiplication with A_l is a reduction over one index and + the transformation brings the next reduction index in the fastest position. Note: In the code below the transformation step is directly done in the reduction instruction by adapting the assignee! + It can make sense to permute the order of directions. If you have + a small m_l (e.g. stage 1 on faces) it is better to do direction l + first. This can be done permuting: + + - The order of the A matrices. + - Permuting the input tensor. + - Permuting the output tensor (this assures that the directions of + the output tensor are again ordered from 0 to d-1). + Arguments: ---------- a_matrices: An iterable of AMatrix instances @@ -391,7 +480,6 @@ def sum_factorization_kernel(a_matrices, restriction: Restriction for faces values. direct_input: Global data structure containing input for sumfactorization (e.g. when using FastDGGridOperator). - """ if get_global_context_value("dry_run", False): return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset() @@ -418,6 +506,18 @@ def sum_factorization_kernel(a_matrices, within_inames=additional_inames, )}) + # Decide in which order we want to process directions in the + # sumfactorization. A clever ordering can lead to a reduced + # complexity. This will e.g. happen at faces where we only have + # one quadratue point m_l=1 if l is the normal direction of the + # face. + # + # Rule of thumb: small m's early and large n's late. + perm = _sf_permutation_strategy(a_matrices, stage) + + # Permute a_matrices + a_matrices = _permute_forward(a_matrices, perm) + # Product of all matrices for l, a_matrix in enumerate(a_matrices): # Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication @@ -446,16 +546,29 @@ def sum_factorization_kernel(a_matrices, # * an input temporary (default) # * a global data structure (if FastDGGridOperator is in use) # * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator) + input_inames = (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) if l == 0 and direct_input is not None: + # See comment bellow + input_inames = _permute_backward(input_inames, perm) + inp_shape = _permute_backward(inp_shape, perm) + globalarg(direct_input, dtype=np.float64, shape=inp_shape) if a_matrix.vectorized: input_summand = prim.Call(prim.Variable("Vec4d"), (prim.Subscript(prim.Variable(direct_input), - (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])),)) + input_inames),)) else: input_summand = prim.Subscript(prim.Variable(direct_input), - (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname) + input_inames + vec_iname) else: + # If we did permute the order of a matrices above we also + # permuted the order of out_inames. Unfortunately the + # order of our input is from 0 to d-1. This means we need + # to permute _back_ to get the right coefficients. + if l == 0: + inp_shape = _permute_backward(inp_shape, perm) + input_inames = _permute_backward(input_inames, perm) + # Get a temporary that interprets the base storage of the input # as a column-major matrix. In later iteration of the amatrix loop # this reinterprets the output of the previous iteration. @@ -467,7 +580,7 @@ def sum_factorization_kernel(a_matrices, silenced_warning('read_no_write({})'.format(inp)) input_summand = prim.Subscript(prim.Variable(inp), - (k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname) + input_inames + vec_iname) switch_base_storage(buf) @@ -478,14 +591,16 @@ def sum_factorization_kernel(a_matrices, # corresponding shape (out_shape[0]) goes to the end (slowest # direction) and everything stays column major (ftags->fortran # style). + # + # If we are in the last step we reverse the permutation. + output_shape = tuple(out_shape[1:]) + (out_shape[0],) + if l == len(a_matrices)-1: + output_shape = _permute_backward(output_shape, perm) out = get_buffer_temporary(buf, - shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape, + shape=output_shape + vec_shape, dim_tags=ftags) # Write the matrix-matrix multiplication expression - # matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), - # (prim.Variable(i), k_expr) + vec_iname), - # input_summand)) matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(out_inames[0]), k_expr) + vec_iname), input_summand)) @@ -494,8 +609,12 @@ def sum_factorization_kernel(a_matrices, if a_matrix.cols != 1: matprod = lp.Reduction("sum", k, matprod) - # Here we also move the new direction (out_inames[0]) to the end - assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname) + # Here we also move the new direction (out_inames[0]) to the + # end and reverse permutation + output_inames = tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + if l == len(a_matrices)-1: + output_inames = _permute_backward(output_inames, perm) + assignee = prim.Subscript(prim.Variable(out), output_inames + vec_iname) # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on