diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py index 0fa63c21043d175f52757359c52c6904471911a2..a5edfc73e82b2c468a6dece2d053014a33f45af5 100644 --- a/python/dune/perftool/blockstructured/vectorization.py +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -3,6 +3,7 @@ import numpy as np import pymbolic.primitives as prim from dune.perftool.loopy.temporary import DuneTemporaryVariable from dune.perftool.loopy.symbolic import substitute +from dune.perftool.loopy.vcl import get_vcl_type_size from dune.perftool.options import get_option from dune.perftool.pdelab.argument import PDELabAccumulationFunction from dune.perftool.pdelab.geometry import world_dimension @@ -17,7 +18,8 @@ def add_vcl_temporaries(knl): new_vec_temporaries = dict() for alias in vector_alias: vector_name = alias.replace('alias','vec') - new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, shape=(4,), managed=True, + new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, + shape=(get_vcl_type_size(np.float64),), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) return knl.copy(temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)) @@ -58,10 +60,12 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): identifier_a = vng('a') identifier_b = vng('b') new_vec_temporaries[identifier_a] = DuneTemporaryVariable(identifier_a, dtype=np.float64, - shape=(2,)*(world_dimension()-1)+(4,), managed=True, - scope=lp.temp_var_scope.PRIVATE, + shape=(2,)*(world_dimension()-1) + +(get_vcl_type_size(np.float64),), + managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('f',)*(world_dimension()-1)+('vec',)) - new_vec_temporaries[identifier_b] = DuneTemporaryVariable(identifier_b, dtype=np.float64, shape=(4,), managed=True, + new_vec_temporaries[identifier_b] = DuneTemporaryVariable(identifier_b, dtype=np.float64, + shape=(get_vcl_type_size(np.float64),), managed=True, scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) var_a = prim.Subscript(prim.Variable(identifier_a), @@ -103,7 +107,9 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): # r+=a[iy] id_accum = idg('insn_mod_accum') - expr_accum = prim.Sum((var_a, prim.Call(prim.Variable('permute4d<-1,0,1,2>'), (var_b,)), + expr_accum = prim.Sum((var_a, + prim.Call(prim.Variable('permute4d<-1,{}>'.format(','.join(map(str,range(get_vcl_type_size(np.float64)-1))))), + (var_b,)), substitute(insn.assignee, {iname_ix:0}))) new_insns.append(lp.Assignment(assignee=substitute(insn.assignee,{iname_ix:0}), expression=expr_accum, @@ -115,7 +121,8 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): ) # a[iy] = permute id_permute = idg('insn_permute') - expr_permute = prim.Call(prim.Variable('permute4d<3,-1,-1,-1>'), (var_b,)) + expr_permute = prim.Call(prim.Variable('permute4d<3,{}>'.format(','.join(['-1']*(get_vcl_type_size(np.float64)-1)))), + (var_b,)) new_insns.append(lp.Assignment(assignee=var_a, expression=expr_permute, id=id_permute, @@ -127,7 +134,7 @@ def add_vcl_accum_insns(knl, iname_inner, iname_outer): # tail handling, uses tail alias id_accum_tail = idg('insn_accum_tail') - subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//4, + subst_map = {iname_inner: 0, iname_outer: get_option("number_of_blocks")//get_vcl_type_size(np.float64), iname_ix: 0, insn.assignee_name: prim.Variable(insn.assignee_name+'_tail'), **replace_tail_inames} assignee_tail = substitute(insn.assignee, subst_map) @@ -226,7 +233,6 @@ def add_vcl_access(knl, iname_inner): dim_names = ["x","y","z"] + [str(i) for i in range(4,dim+1)] for alias in vector_alias: parameters = 'ex_o,ex_i,'+','.join(['e'+d for d in dim_names[1:dim]])+',ix,'+','.join(['i'+d for d in dim_names[1:dim]]) - from pudb import set_trace; set_trace() knl = lp.extract_subst(knl, alias+'_subst', '{}[{}]'.format(alias, parameters), parameters=parameters) new_subst = knl.substitutions.copy() @@ -268,10 +274,11 @@ def find_accumulation_inames(knl): def vectorize_micro_elements(knl): + vcl_size = get_vcl_type_size(np.float64) if "subel_x" in knl.all_inames(): - knl = lp.split_iname(knl,"subel_x",4, inner_tag='vec') + knl = lp.split_iname(knl,"subel_x", vcl_size, inner_tag='vec') array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] - knl = lp.split_array_axis(knl, array_alias, 0, 4) + knl = lp.split_array_axis(knl, array_alias, 0, vcl_size) knl = add_vcl_temporaries(knl) knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer')