From 9c40248664334ab39ea1d42f4e0160cb6b758294 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@uni-muenster.de>
Date: Mon, 19 Feb 2018 13:37:44 +0100
Subject: [PATCH] enable 512-bit vectorization

---
 .../perftool/blockstructured/vectorization.py | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py
index 0fa63c21..a5edfc73 100644
--- a/python/dune/perftool/blockstructured/vectorization.py
+++ b/python/dune/perftool/blockstructured/vectorization.py
@@ -3,6 +3,7 @@ import numpy as np
 import pymbolic.primitives as prim
 from dune.perftool.loopy.temporary import DuneTemporaryVariable
 from dune.perftool.loopy.symbolic import substitute
+from dune.perftool.loopy.vcl import get_vcl_type_size
 from dune.perftool.options import get_option
 from dune.perftool.pdelab.argument import PDELabAccumulationFunction
 from dune.perftool.pdelab.geometry import world_dimension
@@ -17,7 +18,8 @@ def add_vcl_temporaries(knl):
     new_vec_temporaries = dict()
     for alias in vector_alias:
         vector_name = alias.replace('alias','vec')
-        new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(4,), managed=True,
+        new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64,
+                                                                 shape=(get_vcl_type_size(np.float64),), managed=True,
                                                                  scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
 
     return knl.copy(temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries))
@@ -58,10 +60,12 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
             identifier_a = vng('a')
             identifier_b = vng('b')
             new_vec_temporaries[identifier_a] = DuneTemporaryVariable(identifier_a, dtype=np.float64,
-                                                                      shape=(2,)*(world_dimension()-1)+(4,), managed=True,
-                                                                      scope=lp.temp_var_scope.PRIVATE,
+                                                                      shape=(2,)*(world_dimension()-1)
+                                                                            +(get_vcl_type_size(np.float64),),
+                                                                      managed=True, scope=lp.temp_var_scope.PRIVATE,
                                                                       dim_tags=('f',)*(world_dimension()-1)+('vec',))
-            new_vec_temporaries[identifier_b] = DuneTemporaryVariable(identifier_b, dtype=np.float64, shape=(4,), managed=True,
+            new_vec_temporaries[identifier_b] = DuneTemporaryVariable(identifier_b, dtype=np.float64,
+                                                                      shape=(get_vcl_type_size(np.float64),), managed=True,
                                                                       scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',))
 
             var_a = prim.Subscript(prim.Variable(identifier_a),
@@ -103,7 +107,9 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
 
             # r+=a[iy]
             id_accum = idg('insn_mod_accum')
-            expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)),
+            expr_accum = prim.Sum((var_a,  # permute<N>d: the function name must carry the lane count, not a fixed 4
+                                   prim.Call(prim.Variable('permute{}d<-1,{}>'.format(get_vcl_type_size(np.float64), ','.join(map(str, range(get_vcl_type_size(np.float64) - 1))))),
+                                             (var_b,)),
                                    substitute(insn.assignee, {iname_ix:0})))
             new_insns.append(lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}),
                                            expression=expr_accum,
@@ -115,7 +121,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
                              )
             # a[iy] = permute
             id_permute = idg('insn_permute')
-            expr_permute = prim.Call(prim.Variable('permute4d<3,-1,-1,-1>'), (var_b,))
+            expr_permute = prim.Call(prim.Variable('permute{0}d<{1},{2}>'.format(get_vcl_type_size(np.float64), get_vcl_type_size(np.float64) - 1, ','.join(['-1'] * (get_vcl_type_size(np.float64) - 1)))),
+                                     (var_b,))  # first index selects the last lane (size-1) instead of the hard-coded 3
             new_insns.append(lp.Assignment(assignee=var_a,
                                            expression=expr_permute,
                                            id=id_permute,
@@ -127,7 +134,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer):
 
             # tail handling, uses tail alias
             id_accum_tail = idg('insn_accum_tail')
-            subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//4,
+            subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//get_vcl_type_size(np.float64),
                          iname_ix: 0, insn.assignee_name: prim.Variable(insn.assignee_name+'_tail'),
                          **replace_tail_inames}
             assignee_tail = substitute(insn.assignee, subst_map)
@@ -226,7 +233,6 @@ def add_vcl_access(knl, iname_inner):
     dim_names = ["x","y","z"] + [str(i) for i in range(4,dim+1)]
     for alias in vector_alias:
         parameters = 'ex_o,ex_i,'+','.join(['e'+d for d in dim_names[1:dim]])+',ix,'+','.join(['i'+d for d in dim_names[1:dim]])
-        from pudb import set_trace; set_trace()
         knl = lp.extract_subst(knl, alias+'_subst', '{}[{}]'.format(alias, parameters),
                                parameters=parameters)
         new_subst = knl.substitutions.copy()
@@ -268,10 +274,11 @@ def find_accumulation_inames(knl):
 
 
 def vectorize_micro_elements(knl):
+    vcl_size = get_vcl_type_size(np.float64)
     if "subel_x" in knl.all_inames():
-        knl = lp.split_iname(knl,"subel_x",4, inner_tag='vec')
+        knl = lp.split_iname(knl,"subel_x", vcl_size, inner_tag='vec')
         array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')]
-        knl = lp.split_array_axis(knl, array_alias, 0, 4)
+        knl = lp.split_array_axis(knl, array_alias, 0, vcl_size)
 
         knl = add_vcl_temporaries(knl)
         knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer')
-- 
GitLab