From de423230b1cd268309961cd62b6c7a4f0774bdf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20He=C3=9F?= <rene.hess@iwr.uni-heidelberg.de> Date: Fri, 16 Dec 2016 15:21:56 +0100 Subject: [PATCH] Towards fastdg accumulation --- python/dune/perftool/options.py | 2 +- python/dune/perftool/sumfact/amatrix.py | 1 - python/dune/perftool/sumfact/basis.py | 12 +-- python/dune/perftool/sumfact/sumfact.py | 89 ++++++++++++++----- python/dune/perftool/sumfact/vectorization.py | 10 +-- test/sumfact/poisson/CMakeLists.txt | 7 ++ test/sumfact/poisson/poisson_dg.mini | 4 +- test/sumfact/poisson/poisson_fastdg.mini | 21 +++++ 8 files changed, 106 insertions(+), 40 deletions(-) create mode 100644 test/sumfact/poisson/poisson_fastdg.mini diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 4cb6e8e8..5f27ede9 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -42,7 +42,7 @@ def get_form_compiler_arguments(): parser.add_argument("--timer", action="store_true", help="measure times") parser.add_argument("--opcounter", action="store_true", default=False, help="Count operations. Should only be used with yaspgrid. Timer should be set.") parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project") - parser.add_argument("--fasfdg", action="store_false", help="Use FastDGGridOperator from PDELab.") + parser.add_argument("--fastdg", action="store_true", help="Use FastDGGridOperator from PDELab.") # TODO at some point this help description should be updated parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization") parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization") diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py index c13ab899..3822ba43 100644 --- a/python/dune/perftool/sumfact/amatrix.py +++ b/python/dune/perftool/sumfact/amatrix.py @@ -8,7 +8,6 @@ from dune.perftool.generation import (class_member, domain, function_mangler, get_global_context_value, - globalarg, iname, include_file, initializer_list, diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index e77001d4..8401ad18 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -75,29 +75,29 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): # Get the vectorization info. If this happens during the dry run, we get dummies from dune.perftool.sumfact.vectorization import get_vectorization_info - a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, restriction) + a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction) # Initialize the buffer for the sum fact kernel shape = (product(mat.cols for mat in a_matrices),) if index is not None: shape = shape + (4,) - input = initialize_buffer(buffer, + inp = initialize_buffer(buf, base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), num=2 ).get_temporary(shape=shape, - name=input, + name=inp, ) if insn_dep is None: - insn_dep = frozenset({Writes(input)}) + insn_dep = frozenset({Writes(inp)}) # Setup the input! - setup_theta(input, element, restriction, component, index) + setup_theta(inp, element, restriction, component, index) # Add a sum factorization kernel that implements the # evaluation of the gradients of basis functions at quadrature # points (stage 1) var, insn_dep = sum_factorization_kernel(a_matrices, - buffer, + buf, 1, preferred_position=i, insn_dep=insn_dep, diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 797a700f..86731a14 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -58,7 +58,6 @@ from pymbolic.primitives import (Call, ) from dune.perftool.sumfact.quadrature import quadrature_inames from dune.perftool.sumfact.vectorization import find_sumfact -from loopy import Reduction, GlobalArg from loopy.symbolic import FunctionIdentifier, IdentityMapper import loopy as lp @@ -142,7 +141,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Get the vectorization info. If this happens during the dry run, we get dummies from dune.perftool.sumfact.vectorization import get_vectorization_info - a_matrices, buffer, input, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction)) + a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction)) # Initialize a base storage for this buffer and get a temporay pointing to it shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1) @@ -156,12 +155,12 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): index = () vectag = frozenset() - temp = initialize_buffer(buffer, + temp = initialize_buffer(buf, base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), num=2 ).get_temporary(shape=shape, dim_tags=dim_tags, - name=input, + name=inp, ) # Those input fields, that are padded need to be set to zero in order to do a horizontal_add lateron @@ -203,7 +202,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # with the test function (stage 3) pref_pos = i if accterm.argument.index else None result, insn_dep = sum_factorization_kernel(a_matrices, - buffer, + buf, 3, insn_dep=insn_dep, additional_inames=frozenset(visitor.inames), @@ -245,18 +244,57 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): (maybe_wrap_subscript(result, prim.Variable(iname)),), ) - expr = Call(PDELabAccumulationFunction(accum, rank), - (ansatz_lfs.get_args() + - test_lfs.get_args() + - (result,) - ) - ) - instruction(assignees=(), - expression=expr, - forced_iname_deps=frozenset(inames + visitor.inames + vecinames), - forced_iname_deps_is_final=True, - depends_on=insn_dep, - ) + if get_option('fastdg'): + ft = get_global_context_value("form_type") + if ft=='residual': + globalarg(accum, dtype=np.float64, shape=(ansatz_lfs.index,), managed=False) + assignee = prim.Subscript(prim.Variable(accum), (test_lfs.index,)) + expression = prim.Sum((assignee,result)) + instruction(assignee=assignee, + expression=expression, + forced_iname_deps=frozenset(inames + visitor.inames + vecinames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + ) + else: + assert ft=='jacobian' + # palpo TODO: think about it + globalarg(accum, dtype=np.float64, shape=(ansatz_lfs.index, test_lfs.index), managed=False) + assignee = prim.Subscript(prim.Variable(accum), (ansatz_lfs.index, test_lfs.index)) + expression = prim.Sum((assignee,result)) + instruction(assignee=assignee, + expression=expression, + forced_iname_deps=frozenset(inames + visitor.inames + vecinames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + ) + # expr = Call(PDELabAccumulationFunction(accum, rank), + # (ansatz_lfs.get_args() + + # test_lfs.get_args() + + # (result,) + # ) + # ) + # instruction(assignees=(), + # expression=expr, + # forced_iname_deps=frozenset(inames + visitor.inames + vecinames), + # forced_iname_deps_is_final=True, + # depends_on=insn_dep, + # ) + + else: + expr = Call(PDELabAccumulationFunction(accum, rank), + (ansatz_lfs.get_args() + + test_lfs.get_args() + + (result,) + ) + ) + instruction(assignees=(), + expression=expr, + forced_iname_deps=frozenset(inames + visitor.inames + vecinames), + forced_iname_deps_is_final=True, + depends_on=insn_dep, + ) + # Mark the transformation that moves the quadrature loop inside the trialfunction loops for application transform(nest_quadrature_loops, visitor.inames) @@ -357,21 +395,24 @@ def sum_factorization_kernel(a_matrices, buf, stage, # a code generation corner case producing way too complicated code. This # could be fixed upstream, but the loopy code realizing reductions is not # trivial and the priority is kind of low. - matprod = Product((Subscript(Variable(a_matrix.name), (Variable(i), 0) + vec_iname), - Subscript(Variable(inp), (0, Variable(j)) + vec_iname) + matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(i), 0) + vec_iname), + prim.Subscript(prim.Variable(inp), (0, prim.Variable(j)) + vec_iname) )) else: k = sumfact_iname(a_matrix.cols, "red") # Construct the matrix-matrix-multiplication expression a_ik*in_kj - prod = Product((Subscript(Variable(a_matrix.name), (Variable(i), Variable(k)) + vec_iname), - Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname) - )) - matprod = Reduction("sum", k, prod) + prod = prim.Product((prim.Subscript(prim.Variable(a_matrix.name), + (prim.Variable(i), prim.Variable(k)) + vec_iname), + prim.Subscript(prim.Variable(inp), + (prim.Variable(k), prim.Variable(j)) + vec_iname) + )) + matprod = lp.Reduction("sum", k, prod) # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on - insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname), + assignee = prim.Subscript(prim.Variable(out), (prim.Variable(i), prim.Variable(j)) + vec_iname) + insn_dep = frozenset({instruction(assignee=assignee, expression=matprod, forced_iname_deps=frozenset({i, j}).union(additional_inames), forced_iname_deps_is_final=True, diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 100f3b21..0adcc6ab 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -13,8 +13,8 @@ import loopy as lp @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, r, *args: (a, r)) -def vectorization_info(a_matrices, restriction, new_a_matrices, buffer, input, index, padding): - return (new_a_matrices, buffer, input, index, padding) +def vectorization_info(a_matrices, restriction, new_a_matrices, buf, inp, index, padding): + return (new_a_matrices, buf, inp, index, padding) def get_vectorization_info(a_matrices, restriction): @@ -63,8 +63,8 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): position_mapping[sumf] = available.pop() # Enable vectorization strategy: - input = get_counted_variable("joined_input") - buffer = get_counted_variable("joined_buffer") + inp = get_counted_variable("joined_input") + buf = get_counted_variable("joined_buffer") # Collect the large matrices! large_a_matrices = [] @@ -88,7 +88,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction): large_a_matrices.append(large) for sumf in stage_sumfacts: - vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buffer, input, position_mapping[sumf], frozenset(available)) + vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buf, inp, position_mapping[sumf], frozenset(available)) else: # Disable vectorization strategy no_vectorization(stage_sumfacts) diff --git a/test/sumfact/poisson/CMakeLists.txt b/test/sumfact/poisson/CMakeLists.txt index 08ca1ca7..1dc2483f 100644 --- a/test/sumfact/poisson/CMakeLists.txt +++ b/test/sumfact/poisson/CMakeLists.txt @@ -33,3 +33,10 @@ dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl BASENAME sumfact_poisson_dg INIFILE poisson_dg.mini ) + + +# Poisson DG using FastDGGridOperator +dune_add_formcompiler_system_test(UFLFILE poisson_dg.ufl + BASENAME sumfact_poisson_fastdg + INIFILE poisson_fastdg.mini + ) diff --git a/test/sumfact/poisson/poisson_dg.mini b/test/sumfact/poisson/poisson_dg.mini index 1e1127ec..747a5b7d 100644 --- a/test/sumfact/poisson/poisson_dg.mini +++ b/test/sumfact/poisson/poisson_dg.mini @@ -1,10 +1,9 @@ __name = sumfact_poisson_dg_{__exec_suffix} -__exec_suffix = {diff_suffix}_{quadvec_suffix}_{gradvec_suffix}_{fastdg_suffix} +__exec_suffix = {diff_suffix}_{quadvec_suffix}_{gradvec_suffix} diff_suffix = numdiff, symdiff | expand num quadvec_suffix = quadvec, nonquadvec | expand quad gradvec_suffix = gradvec, nongradvec | expand grad -fastdg_suffix = fastdg, standarddg | expand fastdg cells = 16 16 extension = 1. 1. @@ -20,4 +19,3 @@ exact_solution_expression = g compare_l2errorsquared = 1e-4 vectorize_quad = 1, 0 | expand quad vectorize_grads = 1, 0 | expand grad -fastdg = 1, 0 | expand fastdg diff --git a/test/sumfact/poisson/poisson_fastdg.mini b/test/sumfact/poisson/poisson_fastdg.mini new file mode 100644 index 00000000..ddd098e0 --- /dev/null +++ b/test/sumfact/poisson/poisson_fastdg.mini @@ -0,0 +1,21 @@ +__name = sumfact_poisson_fastdg_{__exec_suffix} +__exec_suffix = {gradvec_suffix}_{quadvec_suffix} + +gradvec_suffix = gradvec, nongradvec | expand gradvec +quadvec_suffix = quadvec, nonquadvec | expand quadvec + +cells = 16 16 +extension = 1. 1. + +[wrapper.vtkcompare] +name = {__name} +extension = vtu + +[formcompiler] +numerical_jacobian = 0 +sumfact = 1 +exact_solution_expression = g +compare_l2errorsquared = 1e-4 +vectorize_quad = 1, 0 | expand quadvec +vectorize_grads = 1, 0 | expand gradvec +fastdg = 1 -- GitLab