From 41a88da85e811c9d69fbd5f1620faa6860947f36 Mon Sep 17 00:00:00 2001
From: René Heß <rene.hess@iwr.uni-heidelberg.de>
Date: Thu, 4 Apr 2019 15:26:18 +0200
Subject: [PATCH] [skip ci] Pass transformation strings to autotuning

This can be used to distinguish between kernels with the same name but
different transformations.
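
A minimal sketch of the intended call pattern, using only the
interfaces from this patch (the loop order 'lkij' is purely
illustrative):

    # the generator yields (kernel, transformation strings) pairs
    kernel, trafos = next(tensor_contraction_loop_order_generator(kernel))
    # trafos == ['reorder_loops_in_tensor_contraction_lkij_True']
    cost = autotune_realization(kernel=kernel, signature=signature,
                                transformations=trafos)

The transformation strings are appended to the benchmark basename
before it is hashed, so each kernel/transformation combination gets
its own autotune target.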
---
 python/dune/codegen/sumfact/autotune.py       | 25 ++++++++++--
 .../dune/codegen/sumfact/transformations.py   | 38 +++++++++++++------
 2 files changed, 49 insertions(+), 14 deletions(-)
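
Notes:

With the transformation strings passed through, the generated
benchmark source now records the applied transformations in a leading
comment block, e.g. (transformation name purely illustrative):

    // Transformations:
    // reorder_loops_in_tensor_contraction_lkij_True

    #include "config.h"
    #include <iostream>
    ...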

diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py
index 64d9d43a..393b21ff 100644
--- a/python/dune/codegen/sumfact/autotune.py
+++ b/python/dune/codegen/sumfact/autotune.py
@@ -304,8 +304,14 @@ def generate_standalone_code(sf, filename):
     set_option("opcounter", opcounting)
 
 
-def generate_standalone_kernel_code(kernel, signature, filename):
+def generate_standalone_kernel_code(kernel, signature, filename, transformations=None):
     with open(filename, 'w') as f:
+        if transformations:
+            f.write('// Transformations:\n')
+            for trafo in transformations:
+                f.write('// {}\n'.format(trafo))
+            f.write('\n')
+
         # Write headers
         headers = ['#include "config.h"',
                    '#include <iostream>',
@@ -410,7 +416,17 @@ def generate_standalone_kernel_code(kernel, signature, filename):
         f.write('\n'.join(main))
 
 
-def autotune_realization(sf=None, kernel=None, signature=None):
+def autotune_realization(sf=None, kernel=None, signature=None, transformations=None):
+    """Generate an microbenchmark, compile run and return time
+
+    Parameters
+    ----------
+    sf: SumfactKernel or VectorizedSumfactKernel
+    kernel: loopy.kernel.LoopKernel
+    signature: str
+    transformations: list of str
+        Used to distinguish autotune targets with the same kernel name
+    """
     if sf is None:
         assert kernel is not None
         assert signature is not None
@@ -427,6 +443,9 @@ def autotune_realization(sf=None, kernel=None, signature=None):
         basename = "autotune_sumfact_{}".format(kernel.name)
     else:
         basename = "autotune_sumfact_{}".format(sf.function_name)
+    if transformations:
+        for trafo in transformations:
+            basename = '{}_{}'.format(basename, trafo)
     basename = hashlib.sha256(basename.encode()).hexdigest()
 
     filename = os.path.join(dir, "{}.cc".format(basename))
@@ -441,7 +460,7 @@ def autotune_realization(sf=None, kernel=None, signature=None):
         with filelock.FileLock(lock):
             if not os.path.isfile(logname):
                 if sf is None:
-                    generate_standalone_kernel_code(kernel, signature, filename)
+                    generate_standalone_kernel_code(kernel, signature, filename, transformations)
                 elif get_option("autotune_google_benchmark"):
                     generate_standalone_code_google_benchmark(sf, filename)
                 else:
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index b0a97923..8342a343 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -94,7 +94,7 @@ def move_zero_assignment_up(kernel, move_up_inames):
     return kernel
 
 
-def reorder_loops_in_tensor_contraction(kernel, iname_order):
+def _reorder_loops_in_tensor_contraction_direct(kernel, iname_order):
     """Reorder the loop nest of the tensor contractions
 
     iname_order is a string that specifies the loop order. We use the following convention:
@@ -172,12 +172,15 @@ def reorder_loops_in_tensor_contraction(kernel, iname_order):
     return kernel
 
 
-def reorder_loops_in_tensor_contraction_with_accumulation_variable(kernel, iname_order):
+def _reorder_loops_in_tensor_contraction_accum(kernel, iname_order):
     dim = world_dimension()
     assert dim == 3
 
+    if iname_order.endswith('j'):  # skipped: nothing is transformed when 'j' is innermost
+        return kernel
+
     # kernel = remove_all_reductions(kernel)
-    kernel = reorder_loops_in_tensor_contraction(kernel, iname_order)
+    kernel = _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
 
     cond = lp.match.Tagged('set_zero')
     for instr in lp.find_instructions(kernel, cond):
@@ -281,17 +284,30 @@ def reorder_loops_in_tensor_contraction_with_accumulation_variable(kernel, iname
     return kernel
 
 
+def reorder_loops_in_tensor_contraction(kernel, iname_order, accum_variable=True):
+    if accum_variable:
+        return _reorder_loops_in_tensor_contraction_accum(kernel, iname_order)
+    else:
+        return _reorder_loops_in_tensor_contraction_direct(kernel, iname_order)
+
+
 def tensor_contraction_loop_order_generator(kernel):
     dim = world_dimension()
     assert dim == 3
-    yield kernel
 
     indices = ['l', 'k', 'i', 'j']
     import itertools
     for loop_order in itertools.permutations(indices):
-        loop_order = ''.join(loop_order)
-        new_kernel = reorder_loops_in_tensor_contraction(kernel, loop_order)
-        yield new_kernel
+        # TODO: Heavy culling of the search space for quick tests during development
+        if loop_order[0] != 'l' or loop_order[1] != 'k':
+            continue
+
+        order = ''.join(loop_order)
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, order, True)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_True'.format(order)]
+
+        new_kernel = reorder_loops_in_tensor_contraction(kernel, order, False)
+        yield new_kernel, ['reorder_loops_in_tensor_contraction_{}_False'.format(order)]
 
 
 def simple_autotuner(kernel_generator, signature):
@@ -299,11 +315,11 @@ def simple_autotuner(kernel_generator, signature):
     from dune.codegen.options import set_option
     set_option("autotune_google_benchmark", True)
 
-    kernel = next(kernel_generator)
-    best_cost = autotune_realization(kernel=kernel, signature=signature)
+    kernel, transformations = next(kernel_generator)
+    best_cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
     best_kernel = kernel
-    for kernel in kernel_generator:
-        cost = autotune_realization(kernel=kernel, signature=signature)
+    for kernel, transformations in kernel_generator:
+        cost = autotune_realization(kernel=kernel, signature=signature, transformations=transformations)
         if cost < best_cost:
             best_cost = cost
             best_kernel = kernel
-- 
GitLab