diff --git a/applications/poisson_dg/poisson_dg.mini b/applications/poisson_dg/poisson_dg.mini index 00a7269e197361b3a5d0fb900f37b834c48e3473..9ae68223465e492f17e5c07594f20b2098423b51 100644 --- a/applications/poisson_dg/poisson_dg.mini +++ b/applications/poisson_dg/poisson_dg.mini @@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount # Input parameters dim = 3 mbperrank = 100 -ranks = 16 +ranks = 32 floatingbytes = 8 # Metaini Calculations @@ -38,7 +38,7 @@ extension = vtu fastdg = 1 sumfact = 1 vectorize_quad = 1 -vectorize_grads = 1 +vectorize_greedy = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount @@ -46,4 +46,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval [formcompiler.ufl_variants] cell = hexahedron -degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand +degree = 3, 7 | expand diff --git a/applications/poisson_dg_tensor/poisson_dg_tensor.mini b/applications/poisson_dg_tensor/poisson_dg_tensor.mini index feffea5b5bacb51d74ed25ae2645095f58c1577d..6b38a7cbd6a89007ebfdb32555cbfe1fc2eac591 100644 --- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini +++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini @@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount # Input parameters dim = 3 mbperrank = 100 -ranks = 16 +ranks = 32 floatingbytes = 8 # Metaini Calculations @@ -38,12 +38,10 @@ extension = vtu fastdg = 1 sumfact = 1 vectorize_quad = 1 -vectorize_grads = 1 +vectorize_greedy = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount -exact_solution_expression = g -compare_l2errorsquared = 1e-6 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval [formcompiler.ufl_variants] diff --git a/applications/poisson_dg_tensor/sliced/sliced.mini b/applications/poisson_dg_tensor/sliced/sliced.mini index bf0f7879644cccad7fdcac85837beeff6b9a470c..dee4de55259c6f5d6765c5d214d54ec42e58b2c1 100644 --- a/applications/poisson_dg_tensor/sliced/sliced.mini +++ b/applications/poisson_dg_tensor/sliced/sliced.mini @@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount # Input parameters dim = 3 mbperrank = 100 -ranks = 16 +ranks = 32 floatingbytes = 8 # Metaini Calculations @@ -42,8 +42,6 @@ vectorize_slice = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount -exact_solution_expression = g -compare_l2errorsquared = 1e-6 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval [formcompiler.ufl_variants] diff --git a/applications/stokes_dg/stokes_dg.mini b/applications/stokes_dg/stokes_dg.mini index 246deb18aa172031c25e81f49ecc900aca4223fa..51f9eac94de9c1272abb993ac918cc81ee3966df 100644 --- a/applications/stokes_dg/stokes_dg.mini +++ b/applications/stokes_dg/stokes_dg.mini @@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount # Input parameters dim = 3 mbperrank = 100 -ranks = 16 +ranks = 32 floatingbytes = 8 # Metaini Calculations @@ -39,7 +39,7 @@ extension = vtu fastdg = 1 sumfact = 1 vectorize_quad = 1 -vectorize_grads = 1 +vectorize_greedy = 1 instrumentation_level = 2, 3, 4 | expand opcounter = 1, 0 | expand opcount time_opcounter = 0, 1 | expand opcount @@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval [formcompiler.ufl_variants] cell = hexahedron -v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree -p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9 | expand degree +v_degree = 3, 7 | expand degree +p_degree = 2, 6 | expand degree diff --git a/applications/stokes_dg/stokes_dg.ufl b/applications/stokes_dg/stokes_dg.ufl index 22678c9fac1db46a13fcc84941c9dc6e2e96fd22..1347c0d05037206f03f5ac89dfc1803a72d1033b 100644 --- a/applications/stokes_dg/stokes_dg.ufl +++ b/applications/stokes_dg/stokes_dg.ufl @@ -1,7 +1,7 @@ cell = hexahedron x = SpatialCoordinate(cell) -g_v = as_vector((16.*x[1]*(1.-x[1])*x[2]*(1.-x[2]), 0.0, 0.0)) +g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0)) bctype = conditional(x[0] < 1. - 1e-8, 1, 0) V = VectorElement("DG", cell, v_degree) diff --git a/bin/donkey.sbatch b/bin/donkey.sbatch index b8091947f51c4d160b07f66c357d4132f8d4f99a..c16dc28d01e802e615b56da8378b6679392e6302 100755 --- a/bin/donkey.sbatch +++ b/bin/donkey.sbatch @@ -6,6 +6,7 @@ # Load modules ml gcc/6.2 +ml tbb ml intelmpi ml openblas ml metis @@ -15,7 +16,7 @@ ml suitesparse #SBATCH -J poisson_dg # Number of processes -#SBATCH -n 16 +#SBATCH -n 32 # Choose the SLURM partition (sinfo for overview) #SBATCH -p haswell16c @@ -28,7 +29,7 @@ ml suitesparse SRUNOPT="--cpu_bind=verbose,core" # Search for runnable executables -FILES=$(ls *.ini) +FILES=$(ls *.ini | grep -v '^verify') for inifile in $FILES do line=$(grep ^"opcounter = " $inifile) diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index ed37897178ab6fa5b74b987e38ce81f341eefa9a..0e1e8f585bce0c99b60addbe3809030251682ee8 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord): vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization") + vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)") turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.") architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl") diff --git a/python/dune/perftool/sumfact/realization.py b/python/dune/perftool/sumfact/realization.py index f822e9b032624a80e307d7666e4f149082852688..5fe9ebb121a925d1d33ade412fa0c3cb9b4a794b 100644 --- a/python/dune/perftool/sumfact/realization.py +++ b/python/dune/perftool/sumfact/realization.py @@ -282,7 +282,7 @@ def _realize_sum_factorization_kernel(sf): # Measure times and count operations in c++ code if get_option("instrumentation_level") >= 4: stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name), - depends_on=frozenset({tag}), + depends_on=frozenset({lp.match.Tagged(tag)}), within_inames=frozenset(sf.within_inames))}) if sf.stage == 1: qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop' diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index b2dca33d0c4bfcf82414e6ec58e0b8c66d1dfa42..303092f120d0df2b9f80575de739c7eef64c547b 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -51,6 +51,13 @@ def no_vectorization(sumfacts): def vertical_vectorization_strategy(sumfact, depth): + # If depth is 1, there is nothing do + if depth == 1: + if isinstance(sumfact, SumfactKernel): + return {sumfact: sumfact} + else: + return {k: sumfact for k in sumfact.kernels} + # Assert that this is not already sliced assert all(mat.slice_size is None for mat in sumfact.matrix_sequence) result = {} @@ -139,6 +146,36 @@ def diagonal_vectorization_strategy(sumfacts, width): return result +def greedy_vectorization_strategy(sumfacts, width): + sumfacts = set(sumfacts) + horizontal = width + vertical = 1 + allowed_padding = 1 + + result = {} + + while horizontal > 0: + if horizontal > 1: + horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding) + else: + horizontal_kernels = {sf: sf for sf in sumfacts} + + for sf in horizontal_kernels: + if horizontal_kernels[sf].horizontal_width == horizontal: + vert = vertical_vectorization_strategy(horizontal_kernels[sf], + vertical) + for k in vert: + result[k] = vert[k] + sumfacts.discard(sf) + + horizontal = horizontal // 2 + vertical = vertical * 2 + + # We heuristically allow padding only on the full SIMD width + allowed_padding = 0 + return result + + def decide_vectorization_strategy(): """ Decide how to vectorize! Note that the vectorization of the quadrature loop is independent of this, @@ -188,6 +225,13 @@ def decide_vectorization_strategy(): sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items(): sfdict[old] = new + elif get_option("vectorize_greedy"): + inputkeys = set(sf.input_key for sf in active_sumfacts) + for inputkey in inputkeys: + width = get_vcl_type_size(np.float64) + sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey] + for old, new in greedy_vectorization_strategy(sumfact_filter, width).items(): + sfdict[old] = new else: for old, new in no_vectorization(active_sumfacts).items(): sfdict[old] = new