From 31e8390329bd8c8a87ef7d563f22764e5f552f33 Mon Sep 17 00:00:00 2001 From: Marcel Koch <marcel.koch@web.de> Date: Thu, 18 Jan 2018 15:41:28 +0100 Subject: [PATCH] use plain array while creating the kernel, not afterwards in the vectorization --- .../dune/perftool/blockstructured/__init__.py | 11 +- .../perftool/blockstructured/accumulation.py | 51 +++++++ .../dune/perftool/blockstructured/argument.py | 31 ++++- .../perftool/blockstructured/vectorization.py | 126 ------------------ python/dune/perftool/pdelab/localoperator.py | 8 +- 5 files changed, 94 insertions(+), 133 deletions(-) create mode 100644 python/dune/perftool/blockstructured/accumulation.py diff --git a/python/dune/perftool/blockstructured/__init__.py b/python/dune/perftool/blockstructured/__init__.py index 9657e81c..830221bb 100644 --- a/python/dune/perftool/blockstructured/__init__.py +++ b/python/dune/perftool/blockstructured/__init__.py @@ -3,6 +3,7 @@ import dune.perftool.blockstructured.argument import dune.perftool.blockstructured.geometry import dune.perftool.blockstructured.spaces import dune.perftool.blockstructured.basis +from dune.perftool.options import get_option from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position # from dune.perftool.pdelab.geometry import to_global @@ -18,6 +19,7 @@ from dune.perftool.blockstructured.tools import sub_element_inames from dune.perftool.pdelab import PDELabInterface + class BlockStructuredInterface(PDELabInterface): def __init__(self): PDELabInterface.__init__(self) @@ -25,10 +27,17 @@ class BlockStructuredInterface(PDELabInterface): # # Local function space related generator functions # + def generate_accumulation_instruction(self, expr, visitor): + if get_option("vectorization_blockstructured"): + from dune.perftool.blockstructured.accumulation import generate_accumulation_instruction + return generate_accumulation_instruction(expr, visitor) + else: + from dune.perftool.pdelab.localoperator import generate_accumulation_instruction + return generate_accumulation_instruction(expr, visitor) # TODO current way to squeeze subelem iname in, not really ideal def lfs_inames(self, element, restriction, number=None, context=''): - return lfs_inames(element, restriction, number, context) + sub_element_inames() + return sub_element_inames() + lfs_inames(element, restriction, number, context) # # Test and trial function related generator functions diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py new file mode 100644 index 00000000..5a8a0232 --- /dev/null +++ b/python/dune/perftool/blockstructured/accumulation.py @@ -0,0 +1,51 @@ +from dune.perftool.generation import temporary_variable, instruction +from dune.perftool.options import get_option +from dune.perftool.pdelab.localoperator import determine_accumulation_space +from dune.perftool.pdelab.argument import name_accumulation_variable +from dune.perftool.pdelab.localoperator import boundary_predicates +import pymbolic.primitives as prim + + +def define_accumulation_alias(container, accumspace): + k = get_option("number_of_blocks") + p = accumspace.element.degree() + temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1), + base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True) + code = "{} = &{}.container()({},0);".format(container + "_alias", container, accumspace.lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({container+"_alias"})) + + +def name_accumulation_alias(accumvar, accumspace): + name = accumvar+"_alias" + define_accumulation_alias(accumvar, accumspace) + return name + + +def generate_accumulation_instruction(expr, visitor): + # Collect the lfs and lfs indices for the accumulate call + test_lfs = determine_accumulation_space(visitor.test_info, 0) + # In the jacobian case, also determine the space for the ansatz space + ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1) + + # Collect the lfs and lfs indices for the accumulate call + accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + accumvar_alias = name_accumulation_alias(accumvar, test_lfs) + + predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id) + + quad_inames = visitor.interface.quadrature_inames() + lfs_inames = visitor.test_info.inames + if visitor.trial_info: + lfs_inames = lfs_inames + visitor.trial_info.inames + + assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames)) + + instruction(assignee=assignee, + expression=prim.Sum((expr, assignee)), + forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)), + forced_iname_deps_is_final=True, + predicates=predicates + ) diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py index 7ca3a94c..b40dcbac 100644 --- a/python/dune/perftool/blockstructured/argument.py +++ b/python/dune/perftool/blockstructured/argument.py @@ -1,10 +1,30 @@ from dune.perftool.generation import (backend, kernel_cached, - valuearg) + valuearg, temporary_variable, instruction) +from dune.perftool.options import get_option from dune.perftool.pdelab.argument import CoefficientAccess -from dune.perftool.blockstructured.tools import micro_index_to_macro_index +from dune.perftool.blockstructured.tools import micro_index_to_macro_index, sub_element_inames from loopy.types import NumpyType import pymbolic.primitives as prim +import loopy as lp + + +def define_alias(container, lfs, element): + k = get_option("number_of_blocks") + p = element.degree() + temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1), + base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True) + code = "{} = &{}({},0);".format(container+"_alias",container, lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({container+"_alias"})) + + +def name_alias(container, lfs, element): + define_alias(container, lfs, element) + name = container+"_alias" + return name # TODO remove the need for element @@ -20,4 +40,9 @@ def pymbolic_coefficient(container, lfs, element, index): lfs = prim.Variable(lfs) # use higher order FEM index instead of Q1 index - return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) + if get_option("vectorization_blockstructured"): + subelem_inames = sub_element_inames() + coeff_alias = name_alias(container, lfs, element) + return prim.Subscript(lp.TaggedVariable(coeff_alias, 'coeff_alias'), tuple(prim.Variable(i) for i in subelem_inames+index)) + else: + return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py index b6a83142..19112ab9 100644 --- a/python/dune/perftool/blockstructured/vectorization.py +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -1,128 +1,2 @@ -import loopy as lp -import pymbolic.primitives as prim -from dune.perftool.generation import temporary_variable, instruction, kernel_cached, get_counter -from dune.perftool.options import get_option -from dune.perftool.pdelab.argument import CoefficientAccess, PDELabAccumulationFunction - -vector_temporaries = set() - - -def define_vector(name, code, inames, read_variables): - temporary_variable(name, base_storage='dummy'+name, managed=True, _base_storage_access_may_be_aliasing=True) - instruction(code=code, - inames=inames, - read_variables=read_variables, - assignees=frozenset({name})) - - -def name_vector(expr, insn, iname): - func = expr.function - parameters = ', '.join(tuple(str(p) for p in expr.parameters[0:2])) - if isinstance(func, CoefficientAccess): - name = func.name - elif isinstance(func, PDELabAccumulationFunction): - name = func.accumobj - else: - return None - vector_name = name+"_vec" - - vector_temporaries.add(vector_name) - - c_alias = "{} = &{}({});".format(vector_name,name,parameters) - define_vector(vector_name, c_alias, insn.within_inames - frozenset({iname}), insn.read_dependency_names()-frozenset({iname})) - return vector_name - - -def add_vectortype_alias(knl, iname): - assert(isinstance(knl, lp.LoopKernel)) - from loopy.symbolic import WalkMapper - class SubstLocalVector(WalkMapper): - def __init__(self): - self.localvectors = () - - def add_localvector(self, expr): - if isinstance(expr, prim.Call): - if isinstance(expr.function, CoefficientAccess): - localvector = expr.function.name - elif isinstance(expr.function, PDELabAccumulationFunction): - localvector = expr.function.accumobj - else: - return True - self.localvectors += ((localvector, expr.parameters[0:2])) - return True - - post_visit = add_localvector - - # find insn with localvectors in their expression - insn_with_localvectors = dict() - for insn in knl.instructions: - if isinstance(insn, lp.MultiAssignmentBase): - SM = SubstLocalVector() - SM(insn.expression) - if SM.localvectors: - insn_with_localvectors[insn.id] = SM.localvectors - - # add temporaries - temporaries_names = set((lv[0]+"_vec" for lv in insn_with_localvectors.values())) - new_temporaries = dict() - for temporary_name in temporaries_names: - from dune.perftool.loopy.temporary import DuneTemporaryVariable - # temporary name unique? - k = get_option("number_of_blocks") - new_temporaries[temporary_name] = \ - DuneTemporaryVariable(temporary_name, base_storage="dummy_"+temporary_name, managed=True, - _base_storage_access_may_be_aliasing=True, shape=(1,), - scope=lp.temp_var_scope.PRIVATE, read_only=True) - knl = knl.copy(temporary_variables={**knl.temporary_variables, **new_temporaries}) - - # set temporaries to localvector - temporaries_insns = [] - for insn_id, localvector in insn_with_localvectors.items(): - insn = knl.id_to_insn[insn_id] - - if isinstance(insn, lp.CallInstruction): - code = "{} = &{}.container()({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0])) - else: - code = "{} = &{}({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0])) - - temporaries_insns.append(lp.CInstruction(id='insn_{}'.format(str(get_counter('__insn_id')).zfill(4)), - iname_exprs=frozenset(), - within_inames=frozenset(), - code=code, - read_variables=frozenset({localvector[0]}), - assignees=frozenset({localvector[0]+'_vec'}))) - knl = knl.copy(instructions=knl.instructions+temporaries_insns) - - new_insns = [] - for insn in knl.instructions: - if insn.id in insn_with_localvectors: - localvector = insn_with_localvectors[insn.id] - if isinstance(insn, lp.CallInstruction): - expr = insn.expression.parameters[-1] - insn_dict = insn.__dict__ - insn_dict['assignee'] = prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1]) - insn_dict['expression'] = expr + prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1]) - insn_dict['temp_var_type'] = () - insn_dict.pop('assignees') - insn_dict.pop('temp_var_types') - new_insns.append(lp.Assignment(**insn_dict)) - else: - from loopy.symbolic import SubstitutionMapper - class ReplaceCall(SubstitutionMapper): - def map_call(self, expr): - if isinstance(expr.function, CoefficientAccess): - return prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1]) - else: - return super.map_call(expr) - new_insns.append(insn.with_transformed_expressions(ReplaceCall(lambda e: None))) - else: - new_insns.append(insn.copy()) #vielleicht auf dependencies achten?? - knl = knl.copy(instructions=new_insns) - - return knl - - def vectorize_micro_elements(knl): - if "subel_x" in knl.all_inames(): - knl = add_vectortype_alias(knl, "subel_x_inner") return knl diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index 64a96ed3..78dda3e6 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -260,6 +260,7 @@ def determine_accumulation_space(info, number): return AccumulationSpace(lfs=lfs, index=lfsi, restriction=info.restriction, + element=element ) @@ -525,7 +526,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): silenced_warnings=silenced, name=name, ) - from loopy import make_reduction_inames_unique kernel = make_reduction_inames_unique(kernel) @@ -551,8 +551,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): from dune.perftool.loopy import heuristic_duplication kernel = heuristic_duplication(kernel) - from dune.perftool.blockstructured.vectorization import vectorize_micro_elements - kernel = vectorize_micro_elements(kernel) # Maybe apply vectorization strategies if get_option("vectorization_quadloop"): if get_option("sumfact"): @@ -560,6 +558,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): kernel = vectorize_quadrature_loop(kernel) else: raise NotImplementedError("Only vectorizing sumfactorized code right now!") + if get_option("vectorization_blockstructured"): + from dune.perftool.blockstructured.vectorization import vectorize_micro_elements + kernel = vectorize_micro_elements(kernel) + # Now add the preambles to the kernel preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))] -- GitLab