Skip to content
Snippets Groups Projects
Commit 31e83903 authored by Marcel Koch's avatar Marcel Koch
Browse files

use plain array while creating the kernel, not afterwards in the vectorization

parent 90aff340
No related branches found
No related tags found
No related merge requests found
...@@ -3,6 +3,7 @@ import dune.perftool.blockstructured.argument ...@@ -3,6 +3,7 @@ import dune.perftool.blockstructured.argument
import dune.perftool.blockstructured.geometry import dune.perftool.blockstructured.geometry
import dune.perftool.blockstructured.spaces import dune.perftool.blockstructured.spaces
import dune.perftool.blockstructured.basis import dune.perftool.blockstructured.basis
from dune.perftool.options import get_option
from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position
# from dune.perftool.pdelab.geometry import to_global # from dune.perftool.pdelab.geometry import to_global
...@@ -18,6 +19,7 @@ from dune.perftool.blockstructured.tools import sub_element_inames ...@@ -18,6 +19,7 @@ from dune.perftool.blockstructured.tools import sub_element_inames
from dune.perftool.pdelab import PDELabInterface from dune.perftool.pdelab import PDELabInterface
class BlockStructuredInterface(PDELabInterface): class BlockStructuredInterface(PDELabInterface):
def __init__(self): def __init__(self):
PDELabInterface.__init__(self) PDELabInterface.__init__(self)
...@@ -25,10 +27,17 @@ class BlockStructuredInterface(PDELabInterface): ...@@ -25,10 +27,17 @@ class BlockStructuredInterface(PDELabInterface):
# #
# Local function space related generator functions # Local function space related generator functions
# #
def generate_accumulation_instruction(self, expr, visitor):
if get_option("vectorization_blockstructured"):
from dune.perftool.blockstructured.accumulation import generate_accumulation_instruction
return generate_accumulation_instruction(expr, visitor)
else:
from dune.perftool.pdelab.localoperator import generate_accumulation_instruction
return generate_accumulation_instruction(expr, visitor)
# TODO current way to squeeze subelem iname in, not really ideal # TODO current way to squeeze subelem iname in, not really ideal
def lfs_inames(self, element, restriction, number=None, context=''): def lfs_inames(self, element, restriction, number=None, context=''):
return lfs_inames(element, restriction, number, context) + sub_element_inames() return sub_element_inames() + lfs_inames(element, restriction, number, context)
# #
# Test and trial function related generator functions # Test and trial function related generator functions
......
from dune.perftool.generation import temporary_variable, instruction
from dune.perftool.options import get_option
from dune.perftool.pdelab.localoperator import determine_accumulation_space
from dune.perftool.pdelab.argument import name_accumulation_variable
from dune.perftool.pdelab.localoperator import boundary_predicates
import pymbolic.primitives as prim
def define_accumulation_alias(container, accumspace):
k = get_option("number_of_blocks")
p = accumspace.element.degree()
temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1),
base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
code = "{} = &{}.container()({},0);".format(container + "_alias", container, accumspace.lfs.name)
instruction(within_inames=frozenset(),
code=code,
read_variables=frozenset({container}),
assignees=frozenset({container+"_alias"}))
def name_accumulation_alias(accumvar, accumspace):
name = accumvar+"_alias"
define_accumulation_alias(accumvar, accumspace)
return name
def generate_accumulation_instruction(expr, visitor):
# Collect the lfs and lfs indices for the accumulate call
test_lfs = determine_accumulation_space(visitor.test_info, 0)
# In the jacobian case, also determine the space for the ansatz space
ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1)
# Collect the lfs and lfs indices for the accumulate call
accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction())
accumvar_alias = name_accumulation_alias(accumvar, test_lfs)
predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id)
quad_inames = visitor.interface.quadrature_inames()
lfs_inames = visitor.test_info.inames
if visitor.trial_info:
lfs_inames = lfs_inames + visitor.trial_info.inames
assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames))
instruction(assignee=assignee,
expression=prim.Sum((expr, assignee)),
forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)),
forced_iname_deps_is_final=True,
predicates=predicates
)
from dune.perftool.generation import (backend, from dune.perftool.generation import (backend,
kernel_cached, kernel_cached,
valuearg) valuearg, temporary_variable, instruction)
from dune.perftool.options import get_option
from dune.perftool.pdelab.argument import CoefficientAccess from dune.perftool.pdelab.argument import CoefficientAccess
from dune.perftool.blockstructured.tools import micro_index_to_macro_index from dune.perftool.blockstructured.tools import micro_index_to_macro_index, sub_element_inames
from loopy.types import NumpyType from loopy.types import NumpyType
import pymbolic.primitives as prim import pymbolic.primitives as prim
import loopy as lp
def define_alias(container, lfs, element):
k = get_option("number_of_blocks")
p = element.degree()
temporary_variable(container+"_alias", shape=(k + 1, k + 1, p + 1, p + 1), strides=(p * k + 1, p, k + 1, 1),
base_storage="dummy_"+container, managed=True, _base_storage_access_may_be_aliasing=True)
code = "{} = &{}({},0);".format(container+"_alias",container, lfs.name)
instruction(within_inames=frozenset(),
code=code,
read_variables=frozenset({container}),
assignees=frozenset({container+"_alias"}))
def name_alias(container, lfs, element):
define_alias(container, lfs, element)
name = container+"_alias"
return name
# TODO remove the need for element # TODO remove the need for element
...@@ -20,4 +40,9 @@ def pymbolic_coefficient(container, lfs, element, index): ...@@ -20,4 +40,9 @@ def pymbolic_coefficient(container, lfs, element, index):
lfs = prim.Variable(lfs) lfs = prim.Variable(lfs)
# use higher order FEM index instead of Q1 index # use higher order FEM index instead of Q1 index
return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) if get_option("vectorization_blockstructured"):
subelem_inames = sub_element_inames()
coeff_alias = name_alias(container, lfs, element)
return prim.Subscript(lp.TaggedVariable(coeff_alias, 'coeff_alias'), tuple(prim.Variable(i) for i in subelem_inames+index))
else:
return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),))
import loopy as lp
import pymbolic.primitives as prim
from dune.perftool.generation import temporary_variable, instruction, kernel_cached, get_counter
from dune.perftool.options import get_option
from dune.perftool.pdelab.argument import CoefficientAccess, PDELabAccumulationFunction
vector_temporaries = set()
def define_vector(name, code, inames, read_variables):
temporary_variable(name, base_storage='dummy'+name, managed=True, _base_storage_access_may_be_aliasing=True)
instruction(code=code,
inames=inames,
read_variables=read_variables,
assignees=frozenset({name}))
def name_vector(expr, insn, iname):
func = expr.function
parameters = ', '.join(tuple(str(p) for p in expr.parameters[0:2]))
if isinstance(func, CoefficientAccess):
name = func.name
elif isinstance(func, PDELabAccumulationFunction):
name = func.accumobj
else:
return None
vector_name = name+"_vec"
vector_temporaries.add(vector_name)
c_alias = "{} = &{}({});".format(vector_name,name,parameters)
define_vector(vector_name, c_alias, insn.within_inames - frozenset({iname}), insn.read_dependency_names()-frozenset({iname}))
return vector_name
def add_vectortype_alias(knl, iname):
assert(isinstance(knl, lp.LoopKernel))
from loopy.symbolic import WalkMapper
class SubstLocalVector(WalkMapper):
def __init__(self):
self.localvectors = ()
def add_localvector(self, expr):
if isinstance(expr, prim.Call):
if isinstance(expr.function, CoefficientAccess):
localvector = expr.function.name
elif isinstance(expr.function, PDELabAccumulationFunction):
localvector = expr.function.accumobj
else:
return True
self.localvectors += ((localvector, expr.parameters[0:2]))
return True
post_visit = add_localvector
# find insn with localvectors in their expression
insn_with_localvectors = dict()
for insn in knl.instructions:
if isinstance(insn, lp.MultiAssignmentBase):
SM = SubstLocalVector()
SM(insn.expression)
if SM.localvectors:
insn_with_localvectors[insn.id] = SM.localvectors
# add temporaries
temporaries_names = set((lv[0]+"_vec" for lv in insn_with_localvectors.values()))
new_temporaries = dict()
for temporary_name in temporaries_names:
from dune.perftool.loopy.temporary import DuneTemporaryVariable
# temporary name unique?
k = get_option("number_of_blocks")
new_temporaries[temporary_name] = \
DuneTemporaryVariable(temporary_name, base_storage="dummy_"+temporary_name, managed=True,
_base_storage_access_may_be_aliasing=True, shape=(1,),
scope=lp.temp_var_scope.PRIVATE, read_only=True)
knl = knl.copy(temporary_variables={**knl.temporary_variables, **new_temporaries})
# set temporaries to localvector
temporaries_insns = []
for insn_id, localvector in insn_with_localvectors.items():
insn = knl.id_to_insn[insn_id]
if isinstance(insn, lp.CallInstruction):
code = "{} = &{}.container()({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0]))
else:
code = "{} = &{}({},0);".format(localvector[0]+"_vec",localvector[0], str(localvector[1][0]))
temporaries_insns.append(lp.CInstruction(id='insn_{}'.format(str(get_counter('__insn_id')).zfill(4)),
iname_exprs=frozenset(),
within_inames=frozenset(),
code=code,
read_variables=frozenset({localvector[0]}),
assignees=frozenset({localvector[0]+'_vec'})))
knl = knl.copy(instructions=knl.instructions+temporaries_insns)
new_insns = []
for insn in knl.instructions:
if insn.id in insn_with_localvectors:
localvector = insn_with_localvectors[insn.id]
if isinstance(insn, lp.CallInstruction):
expr = insn.expression.parameters[-1]
insn_dict = insn.__dict__
insn_dict['assignee'] = prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
insn_dict['expression'] = expr + prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
insn_dict['temp_var_type'] = ()
insn_dict.pop('assignees')
insn_dict.pop('temp_var_types')
new_insns.append(lp.Assignment(**insn_dict))
else:
from loopy.symbolic import SubstitutionMapper
class ReplaceCall(SubstitutionMapper):
def map_call(self, expr):
if isinstance(expr.function, CoefficientAccess):
return prim.Subscript(prim.Variable(localvector[0]+"_vec"), localvector[1][1])
else:
return super.map_call(expr)
new_insns.append(insn.with_transformed_expressions(ReplaceCall(lambda e: None)))
else:
new_insns.append(insn.copy()) #vielleicht auf dependencies achten??
knl = knl.copy(instructions=new_insns)
return knl
def vectorize_micro_elements(knl): def vectorize_micro_elements(knl):
if "subel_x" in knl.all_inames():
knl = add_vectortype_alias(knl, "subel_x_inner")
return knl return knl
...@@ -260,6 +260,7 @@ def determine_accumulation_space(info, number): ...@@ -260,6 +260,7 @@ def determine_accumulation_space(info, number):
return AccumulationSpace(lfs=lfs, return AccumulationSpace(lfs=lfs,
index=lfsi, index=lfsi,
restriction=info.restriction, restriction=info.restriction,
element=element
) )
...@@ -525,7 +526,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): ...@@ -525,7 +526,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
silenced_warnings=silenced, silenced_warnings=silenced,
name=name, name=name,
) )
from loopy import make_reduction_inames_unique from loopy import make_reduction_inames_unique
kernel = make_reduction_inames_unique(kernel) kernel = make_reduction_inames_unique(kernel)
...@@ -551,8 +551,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): ...@@ -551,8 +551,6 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
from dune.perftool.loopy import heuristic_duplication from dune.perftool.loopy import heuristic_duplication
kernel = heuristic_duplication(kernel) kernel = heuristic_duplication(kernel)
from dune.perftool.blockstructured.vectorization import vectorize_micro_elements
kernel = vectorize_micro_elements(kernel)
# Maybe apply vectorization strategies # Maybe apply vectorization strategies
if get_option("vectorization_quadloop"): if get_option("vectorization_quadloop"):
if get_option("sumfact"): if get_option("sumfact"):
...@@ -560,6 +558,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True): ...@@ -560,6 +558,10 @@ def extract_kernel_from_cache(tag, wrap_in_cgen=True):
kernel = vectorize_quadrature_loop(kernel) kernel = vectorize_quadrature_loop(kernel)
else: else:
raise NotImplementedError("Only vectorizing sumfactorized code right now!") raise NotImplementedError("Only vectorizing sumfactorized code right now!")
if get_option("vectorization_blockstructured"):
from dune.perftool.blockstructured.vectorization import vectorize_micro_elements
kernel = vectorize_micro_elements(kernel)
# Now add the preambles to the kernel # Now add the preambles to the kernel
preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))] preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment