From 0e8b4ea559058fb921e435d9254c3f4fce16decd Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Fri, 22 Sep 2017 16:24:34 +0200 Subject: [PATCH] Implement a greedy vectorization strategy It always aims for maximally horizontal vectorization where possible. --- .../poisson_dg_tensor/poisson_dg_tensor.mini | 4 +- applications/stokes_dg/stokes_dg.mini | 6 +-- python/dune/perftool/options.py | 1 + python/dune/perftool/sumfact/vectorization.py | 41 +++++++++++++++++++ 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini index feffea5b..09c57bf1 100644 --- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini +++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini @@ -38,7 +38,7 @@ extension = vtu fastdg = 1 sumfact = 1 vectorize_quad = 1 -vectorize_grads = 1 +vectorize_greedy = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount @@ -48,4 +48,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval [formcompiler.ufl_variants] cell = hexahedron -degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand +degree = 1, 3, 5, 7, 9 | expand diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini index 246deb18..1b0fbdeb 100644 --- a/applications/stokes_dg/stokes_dg.mini +++ b/applications/stokes_dg/stokes_dg.mini @@ -39,7 +39,7 @@ extension = vtu fastdg = 1 sumfact = 1 vectorize_quad = 1 -vectorize_grads = 1 +vectorize_greedy = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount @@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval [formcompiler.ufl_variants] cell = hexahedron -v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree -p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9 | expand degree +v_degree = 1, 3, 5, 7, 9 | expand degree +p_degree = 0, 2, 4, 6, 8 | expand degree diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index ed378971..0e1e8f58 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord): vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") + vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index b2dca33d..ab5535d7 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -51,6 +51,13 @@ def no_vectorization(sumfacts): def vertical_vectorization_strategy(sumfact, depth): + # If depth is 1, there is nothing do + if depth == 1: + if isinstance(sumfact, SumfactKernel): + return {sumfact: sumfact} + else: + return {k: sumfact for k in sumfact.kernels} + # Assert that this is not already sliced assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) result = {} @@ -139,6 +146,33 @@ def diagonal_vectorization_strategy(sumfacts, width): return result +def greedy_vectorization_strategy(sumfacts, width): + sumfacts = set(sumfacts) + horizontal = width + vertical = 1 + + result = {} + + while horizontal > 0: + if horizontal > 1: + horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0) + else: + horizontal_kernels = {sf: sf for sf in sumfacts} + + for sf in horizontal_kernels: + if horizontal_kernels[sf].horizontal_width == horizontal: + vert = vertical_vectorization_strategy(horizontal_kernels[sf], + vertical) + for k in vert: + result[k] = vert[k] + sumfacts.discard(sf) + + horizontal = horizontal / 2 + vertical = vertical * 2 + + return result + + def decide_vectorization_strategy(): """ Decide how to vectorize! Note that the vectorization of the quadrature loop is independent of this, @@ -188,6 +222,13 @@ def decide_vectorization_strategy(): sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): sfdict[old] = new + elif get_option("vectorize_greedy"): + inputkeys = set(sf.input_key for sf in active_sumfacts) + for inputkey in inputkeys: + width = get_vcl_type_size(np.float64) + sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] + for old, new in greedy_vectorization_strategy(sumfact_filter, width).items(): + sfdict[old] = new else: for old, new in no_vectorization(active_sumfacts).items(): sfdict[old] = new -- GitLab