Merge branch 'feature/paper-vectorization' into 'master'

Heuristic vectorization strategy. See merge request !190.

Merge branch 'feature/paper-vectorization' into 'master'
62a75918 · Dominic Kempf · 475df2c0 · a75470f1 · 62a75918 · 62a75918
Commit 62a75918 authored 7 years ago by Dominic Kempf
--- a/applications/poisson_dg/poisson_dg.mini
+++ b/applications/poisson_dg/poisson_dg.mini
@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
 # Input parameters
 dim = 3
 mbperrank = 100
-ranks = 16
+ranks = 32
 floatingbytes = 8
 # Metaini Calculations
@@ -38,7 +38,7 @@ extension = vtu
 fastdg = 1
 sumfact = 1
 vectorize_quad = 1
-vectorize_grads = 1
+vectorize_greedy = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
@@ -46,4 +46,4 @@ quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
 [formcompiler.ufl_variants]
 cell = hexahedron
-degree = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand
+degree = 3, 7 | expand
--- a/applications/poisson_dg_tensor/poisson_dg_tensor.mini
+++ b/applications/poisson_dg_tensor/poisson_dg_tensor.mini
@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
 # Input parameters
 dim = 3
 mbperrank = 100
-ranks = 16
+ranks = 32
 floatingbytes = 8
 # Metaini Calculations
@@ -38,12 +38,10 @@ extension = vtu
 fastdg = 1
 sumfact = 1
 vectorize_quad = 1
-vectorize_grads = 1
+vectorize_greedy = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
-exact_solution_expression = g
-compare_l2errorsquared = 1e-6
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
 [formcompiler.ufl_variants]

--- a/applications/poisson_dg_tensor/sliced/sliced.mini
+++ b/applications/poisson_dg_tensor/sliced/sliced.mini
@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
 # Input parameters
 dim = 3
 mbperrank = 100
-ranks = 16
+ranks = 32
 floatingbytes = 8
 # Metaini Calculations
@@ -42,8 +42,6 @@ vectorize_slice = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
-exact_solution_expression = g
-compare_l2errorsquared = 1e-6
 quadrature_order = {formcompiler.ufl_variants.degree} * 2 | eval
 [formcompiler.ufl_variants]

--- a/applications/stokes_dg/stokes_dg.mini
+++ b/applications/stokes_dg/stokes_dg.mini
@@ -8,7 +8,7 @@ opcount_suffix = opcount, nonopcount | expand opcount
 # Input parameters
 dim = 3
 mbperrank = 100
-ranks = 16
+ranks = 32
 floatingbytes = 8
 # Metaini Calculations
@@ -39,7 +39,7 @@ extension = vtu
 fastdg = 1
 sumfact = 1
 vectorize_quad = 1
-vectorize_grads = 1
+vectorize_greedy = 1
 instrumentation_level = 2, 3, 4 | expand
 opcounter = 1, 0 | expand opcount
 time_opcounter = 0, 1 | expand opcount
@@ -47,5 +47,5 @@ quadrature_order = {formcompiler.ufl_variants.v_degree} * 2 | eval
 [formcompiler.ufl_variants]
 cell = hexahedron
-v_degree = 2, 3, 4, 5, 6, 7, 8, 9, 10 | expand degree
+v_degree = 3, 7 | expand degree
-p_degree = 1, 2, 3, 4, 5, 6, 7, 8, 9  | expand degree
+p_degree = 2, 6 | expand degree
--- a/applications/stokes_dg/stokes_dg.ufl
+++ b/applications/stokes_dg/stokes_dg.ufl
 cell = hexahedron
 x = SpatialCoordinate(cell)
-g_v = as_vector((16.*x[1]*(1.-x[1])*x[2]*(1.-x[2]), 0.0, 0.0))
+g_v = as_vector((4.*x[1]*(1.-x[1]), 0.0, 0.0))
 bctype = conditional(x[0] < 1. - 1e-8, 1, 0)
 V = VectorElement("DG", cell, v_degree)

--- a/bin/donkey.sbatch
+++ b/bin/donkey.sbatch
@@ -6,6 +6,7 @@
 # Load modules
 ml gcc/6.2
+ml tbb
 ml intelmpi
 ml openblas
 ml metis
@@ -15,7 +16,7 @@ ml suitesparse
 #SBATCH -J poisson_dg
 # Number of processes
-#SBATCH -n 16
+#SBATCH -n 32
 # Choose the SLURM partition (sinfo for overview)
 #SBATCH -p haswell16c
@@ -28,7 +29,7 @@ ml suitesparse
 SRUNOPT="--cpu_bind=verbose,core"
 # Search for runnable executables
-FILES=$(ls *.ini)
+FILES=$(ls *.ini | grep -v '^verify')
 for inifile in $FILES
 do
  line=$(grep ^"opcounter = " $inifile)

--- a/python/dune/perftool/options.py
+++ b/python/dune/perftool/options.py
@@ -58,6 +58,7 @@ class PerftoolOptionsArray(ImmutableRecord):
    vectorize_grads = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
    vectorize_slice = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
    vectorize_diagonal = PerftoolOption(default=False, helpstr="whether to generate code with explicit vectorization")
+    vectorize_greedy = PerftoolOption(default=False, helpstr="the heuristic currently in use (to produce paper numbers)")
    turn_off_diagonal_jacobian = PerftoolOption(default=False, helpstr="Do not use diagonal_jacobian transformation on the ufl tree and cast result of jacobianInverseTransposed into a FieldMatrix.")
    architecture = PerftoolOption(default="haswell", helpstr="The architecture to optimize for. Possible values: haswell|knl")

--- a/python/dune/perftool/sumfact/realization.py
+++ b/python/dune/perftool/sumfact/realization.py
@@ -282,7 +282,7 @@ def _realize_sum_factorization_kernel(sf):
    # Measure times and count operations in c++ code
    if get_option("instrumentation_level") >= 4:
        stop_insn = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
-                                           depends_on=frozenset({tag}),
+                                           depends_on=frozenset({lp.match.Tagged(tag)}),
                                           within_inames=frozenset(sf.within_inames))})
        if sf.stage == 1:
            qp_timer_name = assembler_routine_name() + '_kernel' + '_quadratureloop'

--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -51,6 +51,13 @@ def no_vectorization(sumfacts):
 def vertical_vectorization_strategy(sumfact, depth):
+    # If depth is 1, there is nothing to do
+    if depth == 1:
+        if isinstance(sumfact, SumfactKernel):
+            return {sumfact: sumfact}
+        else:
+            return {k: sumfact for k in sumfact.kernels}
    # Assert that this is not already sliced
    assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
    result = {}
@@ -139,6 +146,36 @@ def diagonal_vectorization_strategy(sumfacts, width):
    return result
+def greedy_vectorization_strategy(sumfacts, width):
+    sumfacts = set(sumfacts)
+    horizontal = width
+    vertical = 1
+    allowed_padding = 1
+    result = {}
+    while horizontal > 0:
+        if horizontal > 1:
+            horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=allowed_padding)
+        else:
+            horizontal_kernels = {sf: sf for sf in sumfacts}
+        for sf in horizontal_kernels:
+            if horizontal_kernels[sf].horizontal_width == horizontal:
+                vert = vertical_vectorization_strategy(horizontal_kernels[sf],
+                                                       vertical)
+                for k in vert:
+                    result[k] = vert[k]
+                sumfacts.discard(sf)
+        horizontal = horizontal // 2
+        vertical = vertical * 2
+        # We heuristically allow padding only on the full SIMD width
+        allowed_padding = 0
+    return result
 def decide_vectorization_strategy():
    """ Decide how to vectorize!
    Note that the vectorization of the quadrature loop is independent of this,
@@ -188,6 +225,13 @@ def decide_vectorization_strategy():
            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
            for old, new in diagonal_vectorization_strategy(sumfact_filter, width).items():
                sfdict[old] = new
+    elif get_option("vectorize_greedy"):
+        inputkeys = set(sf.input_key for sf in active_sumfacts)
+        for inputkey in inputkeys:
+            width = get_vcl_type_size(np.float64)
+            sumfact_filter = [sf for sf in active_sumfacts if sf.input_key == inputkey]
+            for old, new in greedy_vectorization_strategy(sumfact_filter, width).items():
+                sfdict[old] = new
    else:
        for old, new in no_vectorization(active_sumfacts).items():
            sfdict[old] = new