diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py
index e53422138b7b2ae549373816cfe244ed47e4d6bb..115aede7f16b081399ccc8016961f035162869d2 100644
--- a/python/dune/perftool/pdelab/localoperator.py
+++ b/python/dune/perftool/pdelab/localoperator.py
@@ -491,9 +491,15 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
     preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))]
     kernel = kernel.copy(preambles=preambles)
 
+    # Remove inames that have become obsolete
+    kernel = lp.remove_unused_inames(kernel)
+
     # Do the loopy preprocessing!
     kernel = preprocess_kernel(kernel)
 
+    # *REALLY* ignore boostability. This is - so far - necessary due to a mystery bug.
+    kernel = kernel.copy(instructions=[i.copy(boostable=False, boostable_into=frozenset()) for i in kernel.instructions])
+
     if wrap_in_cgen:
         # Wrap the kernel in something which can generate code
         from dune.perftool.pdelab.signatures import assembly_routine_signature
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index a7b3c4c98ac3c5b262ff1f7dee9b5acecba33aa9..40f042ef96f1cf167487008d3e07b1cfe61374e6 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -247,9 +247,9 @@ def evaluate_reference_gradient(element, name, restriction):
         i = 0
         for d in range(local_dimension()):
             if d == facedir:
-                i = i+1
+                i = i + 1
             quadinamemapping[i] = quad_inames[d]
-            i = i+1
+            i = i + 1
 
     for d in range(dim):
         prod = []
@@ -261,7 +261,7 @@ def evaluate_reference_gradient(element, name, restriction):
         if facedir is not None:
             facemod = get_facemod(restriction)
             from dune.perftool.sumfact.amatrix import PolynomialLookup, name_polynomials
-            prod.append(prim.Call(PolynomialLookup(name_polynomials(), facedir==d),
+            prod.append(prim.Call(PolynomialLookup(name_polynomials(), facedir == d),
                                   (prim.Variable(inames[facedir]), facemod)),)
 
         assignee = prim.Subscript(prim.Variable(name), (d,))
diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py
index 539d2f698b8c23c5f13b30f66229def5137705be..4205aedb8276c4f1a3bcf61fc57985a47e756935 100644
--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -308,7 +308,6 @@ def sum_factorization_kernel(a_matrices, buf, stage,
         # Get the inames needed for one matrix-matrix multiplication
         i = sumfact_iname(out_shape[0], "row")
         j = sumfact_iname(out_shape[1], "col")
-        k = sumfact_iname(a_matrix.cols, "red")
 
         # Maybe introduce a vectorization iname for this matrix-matrix multiplication
         vec_iname = ()
@@ -317,15 +316,27 @@ def sum_factorization_kernel(a_matrices, buf, stage,
             vec_iname = (prim.Variable(iname),)
             transform(lp.tag_inames, [(iname, "vec")])
 
-        # Construct the matrix-matrix-multiplication expression a_ik*in_kj
-        prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
-                        Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
-                        ))
+        if a_matrix.cols == 1:
+            # A trivial reduction is implemented as a product, otherwise we run into
+            # a code generation corner case producing way too complicated code. This
+            # could be fixed upstream, but the loopy code realizing reductions is not
+            # trivial and the priority is kind of low.
+            matprod = Product((Subscript(Variable(a_matrix.name), (Variable(i), 0) + vec_iname),
+                               Subscript(Variable(inp), (0, Variable(j)) + vec_iname)
+                               ))
+        else:
+            k = sumfact_iname(a_matrix.cols, "red")
+
+            # Construct the matrix-matrix-multiplication expression a_ik*in_kj
+            prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname),
+                            Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname)
+                            ))
+            matprod = Reduction("sum", k, prod)
 
         # Issue the reduction instruction that implements the multiplication
         # at the same time store the instruction ID for the next instruction to depend on
         insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname),
-                                          expression=Reduction("sum", k, prod),
+                                          expression=matprod,
                                           forced_iname_deps=frozenset({i, j}).union(additional_inames),
                                           forced_iname_deps_is_final=True,
                                           depends_on=insn_dep,
diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py
index a9e050fb68c3f583d283a62a39943f1deef53093..197e6e61de48661f733a6f28c0faaf3ddf95b8cf 100644
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -38,7 +38,7 @@ def no_vectorization(sumfacts):
                            sumf.restriction,
                            sumf.a_matrices,
                            get_counted_variable("buffer"),
-                           get_counted_variable(restricted_name("input", sumf.restriction)),
+                           get_counted_variable("input"),
                            None)
 
 
@@ -135,4 +135,4 @@ class HasSumfactMapper(lp.symbolic.CombineMapper):
 
 
 def find_sumfact(expr):
-    return HasSumfactMapper()(expr)
+    return HasSumfactMapper()(expr)
\ No newline at end of file
diff --git a/python/loopy b/python/loopy
index 36c9bb5c0a5905022fc850c3efc5ad7661e5f897..c16057b7c361584d04edb59132f0742ecaa38226 160000
--- a/python/loopy
+++ b/python/loopy
@@ -1 +1 @@
-Subproject commit 36c9bb5c0a5905022fc850c3efc5ad7661e5f897
+Subproject commit c16057b7c361584d04edb59132f0742ecaa38226