From 2de8f89914a9627347e971b05fe28c274757fb39 Mon Sep 17 00:00:00 2001
From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de>
Date: Thu, 4 Oct 2018 11:23:06 +0200
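Subject: [PATCH] Fix buffer and array sizing in sumfact autotune code generation

Add a fastdg_interface_object_size property to the sum factorization
kernel interfaces (AccumulationOutput, LFSSumfactKernelInput,
VectorSumfactKernelInput, VectorSumfactKernelOutput) and use it in
generate_standalone_code to size the non-jacobian argument arrays,
replacing the product of the per-direction basis sizes.

Further changes to generate_standalone_code:
* Disable the "opcounter" option while calling type_floatingpoint()
  and restore its previous value afterwards.
* Scale the size of the char buffers by precision_bits / 8.
* Derive the kernel loop iteration count from sf.operations
  (int(1e9 / sf.operations)) instead of hard-coding 10000000.
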
---
 python/dune/perftool/sumfact/accumulation.py |  5 +++++
 python/dune/perftool/sumfact/autotune.py     | 14 ++++++++------
 python/dune/perftool/sumfact/basis.py        |  5 +++++
 python/dune/perftool/sumfact/symbolic.py     |  8 ++++++++
 4 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/python/dune/perftool/sumfact/accumulation.py b/python/dune/perftool/sumfact/accumulation.py
index abad6788..d0c6fba6 100644
--- a/python/dune/perftool/sumfact/accumulation.py
+++ b/python/dune/perftool/sumfact/accumulation.py
@@ -243,6 +243,11 @@ class AccumulationOutput(SumfactKernelInterfaceBase, ImmutableRecord):
         else:
             return ()
 
+    @property
+    def fastdg_interface_object_size(self):
+        size = sum(_local_sizes(self.trial_element)) if self.trial_element else 1
+        return size * sum(_local_sizes(self.test_element))
+
 
 def _local_sizes(element):
     from ufl import FiniteElement, MixedElement
diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py
index af324722..df8ab590 100644
--- a/python/dune/perftool/sumfact/autotune.py
+++ b/python/dune/perftool/sumfact/autotune.py
@@ -3,7 +3,7 @@
 from dune.perftool.generation import cache_restoring, delete_cache_items
 from dune.perftool.loopy.target import DuneTarget
 from dune.perftool.sumfact.realization import realize_sumfact_kernel_function
-from dune.perftool.options import get_option
+from dune.perftool.options import get_option, set_option
 
 import loopy as lp
 from pytools import product
@@ -82,8 +82,11 @@ def generate_standalone_code(sf, filename, logname):
                       ])
 
         # Setup a polynomial object (normally done in the LocalOperator members)
+        opcounting = get_option("opcounter")
+        set_option("opcounter", False)
         from dune.perftool.loopy.target import type_floatingpoint
         real = type_floatingpoint()
+        set_option("opcounter", opcounting)
         f.write("  using RF = {};\n".format(real))
         f.write("  using DF = {};\n".format(real))
 
@@ -102,6 +105,7 @@ def generate_standalone_code(sf, filename, logname):
         # Allocate buffers
         size = max(product(m.quadrature_size for m in sf.matrix_sequence) * sf.vector_width,
                    product(m.basis_size for m in sf.matrix_sequence) * sf.vector_width)
+        size = int(size * (get_option("precision_bits") / 8))
         f.writelines(["  char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size),
                       "  char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size),
                       ])
@@ -111,10 +115,8 @@ def generate_standalone_code(sf, filename, logname):
             if "jacobian" in arg:
                 f.write("{} = 0;\n".format(arg))
             else:
-                basis_size = product(m.basis_size for m in sf.matrix_sequence)
-                if sf.within_inames:
-                    basis_size = basis_size * basis_size
-                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], product(m.basis_size for m in sf.matrix_sequence)))
+                size = sf.interface.fastdg_interface_object_size
+                f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size))
 
         # Write stuff into the input buffer
         f.writelines(["  {0} *input = ({0} *)buffer0;\n".format(real),
@@ -162,7 +164,7 @@ def generate_standalone_code(sf, filename, logname):
                       ])
 
         # Add the implementation of the kernel.
-        f.write("  for(int i=0; i<10000000; ++i)\n")
+        f.write("  for(int i=0; i<{}; ++i)\n".format(int(1e9 / sf.operations)))
         f.write("  {\n")
         for line in knl.member.lines[1:]:
             f.write("    {}\n".format(line))
diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py
index e9584d1d..22f5ffad 100644
--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -138,6 +138,11 @@ class LFSSumfactKernelInput(SumfactKernelInterfaceBase, ImmutableRecord):
         else:
             return ()
 
+    @property
+    def fastdg_interface_object_size(self):
+        from dune.perftool.sumfact.accumulation import _local_sizes
+        return sum(_local_sizes(self.element))
+
 
 def _basis_functions_per_direction(element):
     """Number of basis functions per direction """
diff --git a/python/dune/perftool/sumfact/symbolic.py b/python/dune/perftool/sumfact/symbolic.py
index fb283a05..3365ddca 100644
--- a/python/dune/perftool/sumfact/symbolic.py
+++ b/python/dune/perftool/sumfact/symbolic.py
@@ -123,6 +123,10 @@ class VectorSumfactKernelInput(SumfactKernelInterfaceBase):
     def function_name_suffix(self):
         return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
+    @property
+    def fastdg_interface_object_size(self):
+        return self.interfaces[0].fastdg_interface_object_size
+
 
 class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def __init__(self, interfaces):
@@ -211,6 +215,10 @@ class VectorSumfactKernelOutput(SumfactKernelInterfaceBase):
     def function_name_suffix(self):
         return "".join(i.function_name_suffix for i in remove_duplicates(self.interfaces))
 
+    @property
+    def fastdg_interface_object_size(self):
+        return self.interfaces[0].fastdg_interface_object_size
+
 
 class SumfactKernelBase(object):
     pass
-- 
GitLab