diff --git a/python/dune/perftool/blockstructured/__init__.py b/python/dune/perftool/blockstructured/__init__.py index 9657e81c3336681285d65f0bd22c18c36155ff84..8849491e8e6c31f33f1a1c5d9a1e247122158137 100644 --- a/python/dune/perftool/blockstructured/__init__.py +++ b/python/dune/perftool/blockstructured/__init__.py @@ -3,9 +3,9 @@ import dune.perftool.blockstructured.argument import dune.perftool.blockstructured.geometry import dune.perftool.blockstructured.spaces import dune.perftool.blockstructured.basis - +import dune.perftool.blockstructured.transformations +from dune.perftool.options import get_form_option from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position -# from dune.perftool.pdelab.geometry import to_global from dune.perftool.blockstructured.spaces import lfs_inames from dune.perftool.blockstructured.basis import (pymbolic_reference_gradient, pymbolic_basis) @@ -14,7 +14,6 @@ from dune.perftool.blockstructured.geometry import (pymbolic_jacobian_inverse_tr pymbolic_facet_jacobian_determinant, to_global) from dune.perftool.blockstructured.tools import sub_element_inames - from dune.perftool.pdelab import PDELabInterface @@ -22,13 +21,19 @@ class BlockStructuredInterface(PDELabInterface): def __init__(self): PDELabInterface.__init__(self) + def generate_accumulation_instruction(self, expr, visitor): + if get_form_option('vectorization_blockstructured'): + from dune.perftool.blockstructured.accumulation import generate_accumulation_instruction + return generate_accumulation_instruction(expr, visitor) + else: + return PDELabInterface.generate_accumulation_instruction(self, expr, visitor) # # Local function space related generator functions # - # TODO current way to squeeze subelem iname in, not really ideal + def lfs_inames(self, element, restriction, number=None, context=''): - return lfs_inames(element, restriction, number, context) + sub_element_inames() + return sub_element_inames() + lfs_inames(element, restriction, number, context) # # Test and trial function related generator functions diff --git a/python/dune/perftool/blockstructured/accumulation.py b/python/dune/perftool/blockstructured/accumulation.py new file mode 100644 index 0000000000000000000000000000000000000000..ee52acbb157721717157d1f17a45f0595b783e25 --- /dev/null +++ b/python/dune/perftool/blockstructured/accumulation.py @@ -0,0 +1,68 @@ +from dune.perftool.generation import instruction +from dune.perftool.loopy.target import dtype_floatingpoint +from dune.perftool.options import get_form_option +from dune.perftool.pdelab.geometry import world_dimension +from dune.perftool.pdelab.localoperator import determine_accumulation_space +from dune.perftool.pdelab.argument import name_accumulation_variable +from dune.perftool.pdelab.localoperator import boundary_predicates +from dune.perftool.generation.loopy import function_mangler, globalarg +import loopy as lp +import pymbolic.primitives as prim + + +def name_accumulation_alias(container, accumspace): + name = container + "_" + accumspace.lfs.name + "_alias" + name_tail = container + "_" + accumspace.lfs.name + "_alias_tail" + k = get_form_option("number_of_blocks") + p = accumspace.element.degree() + + def _add_alias_insn(name): + dim = world_dimension() + element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) + index_stride = tuple((p * k + 1)**i for i in range(0, dim)) + globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) + code = "auto {} = &{}.container()({},0);".format(name, container, accumspace.lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({name})) + + _add_alias_insn(name) + _add_alias_insn(name_tail) + return name + + +@function_mangler +def residual_weight_mangler(knl, func, arg_dtypes): + if isinstance(func, str) and func.endswith('.weight'): + return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype_floatingpoint()),), ()) + + +def generate_accumulation_instruction(expr, visitor): + # Collect the lfs and lfs indices for the accumulate call + test_lfs = determine_accumulation_space(visitor.test_info, 0) + # In the jacobian case, also determine the space for the ansatz space + ansatz_lfs = determine_accumulation_space(visitor.trial_info, 1) + + # Collect the lfs and lfs indices for the accumulate call + accumvar = name_accumulation_variable(test_lfs.get_restriction() + ansatz_lfs.get_restriction()) + accumvar_alias = name_accumulation_alias(accumvar, test_lfs) + + predicates = boundary_predicates(expr, visitor.measure, visitor.subdomain_id) + + quad_inames = visitor.interface.quadrature_inames() + lfs_inames = visitor.test_info.inames + if visitor.trial_info: + lfs_inames = lfs_inames + visitor.trial_info.inames + + assignee = prim.Subscript(prim.Variable(accumvar_alias), tuple(prim.Variable(i) for i in lfs_inames)) + + expr_with_weight = prim.Product((expr, prim.Call(prim.Variable(accumvar + '.weight'), ()))) + + instruction(assignee=assignee, + expression=prim.Sum((expr_with_weight, assignee)), + forced_iname_deps=frozenset(lfs_inames).union(frozenset(quad_inames)), + forced_iname_deps_is_final=True, + predicates=predicates, + tags=frozenset({'accum'}) + ) diff --git a/python/dune/perftool/blockstructured/argument.py b/python/dune/perftool/blockstructured/argument.py index 7ca3a94c60989f3b4ffe4d1247c3e7fd45a6f575..ff08627822fbad61bea36d6873377324deb34050 100644 --- a/python/dune/perftool/blockstructured/argument.py +++ b/python/dune/perftool/blockstructured/argument.py @@ -1,12 +1,30 @@ from dune.perftool.generation import (backend, kernel_cached, - valuearg) + valuearg, instruction, globalarg) +from dune.perftool.options import get_form_option from dune.perftool.pdelab.argument import CoefficientAccess -from dune.perftool.blockstructured.tools import micro_index_to_macro_index +from dune.perftool.blockstructured.tools import micro_index_to_macro_index, sub_element_inames +from dune.perftool.pdelab.geometry import world_dimension from loopy.types import NumpyType import pymbolic.primitives as prim +def name_alias(container, lfs, element): + name = container + "_" + lfs.name + "_alias" + k = get_form_option("number_of_blocks") + p = element.degree() + dim = world_dimension() + element_stride = tuple(p * (p * k + 1)**i for i in range(0, dim)) + index_stride = tuple((p * k + 1)**i for i in range(0, dim)) + globalarg(name, shape=(k,) * dim + (p + 1,) * dim, strides=element_stride + index_stride, managed=True) + code = "const auto {} = &{}({},0);".format(name, container, lfs.name) + instruction(within_inames=frozenset(), + code=code, + read_variables=frozenset({container}), + assignees=frozenset({name})) + return name + + # TODO remove the need for element @kernel_cached @backend(interface="pymbolic_coefficient", name="blockstructured") @@ -20,4 +38,9 @@ def pymbolic_coefficient(container, lfs, element, index): lfs = prim.Variable(lfs) # use higher order FEM index instead of Q1 index - return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) + if get_form_option("vectorization_blockstructured"): + subelem_inames = sub_element_inames() + coeff_alias = name_alias(container, lfs, element) + return prim.Subscript(prim.Variable(coeff_alias), tuple(prim.Variable(i) for i in subelem_inames + index)) + else: + return prim.Call(CoefficientAccess(container), (lfs, micro_index_to_macro_index(element, index),)) diff --git a/python/dune/perftool/blockstructured/basis.py b/python/dune/perftool/blockstructured/basis.py index 2748a7228903e85f71980db1488bc68cf597bcc6..98ab1bbd66f99af2451a7de1e74e5fe24719c3be 100644 --- a/python/dune/perftool/blockstructured/basis.py +++ b/python/dune/perftool/blockstructured/basis.py @@ -21,6 +21,7 @@ from dune.perftool.pdelab.quadrature import pymbolic_quadrature_position_in_cell from dune.perftool.pdelab.spaces import type_leaf_gfs from dune.perftool.pdelab.restriction import restricted_name from dune.perftool.blockstructured.spaces import lfs_inames +from dune.perftool.blockstructured.tools import tensor_index_to_sequential_index from ufl import MixedElement @@ -70,7 +71,8 @@ def name_localbasis(leaf_element): @kernel_cached def evaluate_basis(leaf_element, name, restriction): - temporary_variable(name, shape=(leaf_element.degree(), 1), decl_method=declare_cache_temporary(leaf_element, restriction, 'Function')) + temporary_variable(name, shape=((leaf_element.degree() + 1)**world_dimension(), 1), + decl_method=declare_cache_temporary(leaf_element, restriction, 'Function')) cache = name_localbasis_cache(leaf_element) qp = pymbolic_quadrature_position_in_cell(restriction) localbasis = name_localbasis(leaf_element) @@ -86,14 +88,15 @@ def pymbolic_basis(leaf_element, restriction, number, context=''): name = "phi_{}".format(FEM_name_mangling(leaf_element)) name = restricted_name(name, restriction) evaluate_basis(leaf_element, name, restriction) - iname = lfs_inames(leaf_element, restriction, number, context=context)[0] + inames = lfs_inames(leaf_element, restriction, number, context=context) - return prim.Subscript(prim.Variable(name), (prim.Variable(iname), 0)) + return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, leaf_element.degree() + 1), 0)) @kernel_cached def evaluate_reference_gradient(leaf_element, name, restriction): - temporary_variable(name, shape=(leaf_element.degree(), 1, world_dimension()), decl_method=declare_cache_temporary(leaf_element, restriction, 'Jacobian')) + temporary_variable(name, shape=((leaf_element.degree() + 1)**world_dimension(), 1, world_dimension()), + decl_method=declare_cache_temporary(leaf_element, restriction, 'Jacobian')) cache = name_localbasis_cache(leaf_element) qp = pymbolic_quadrature_position_in_cell(restriction) localbasis = name_localbasis(leaf_element) @@ -109,6 +112,6 @@ def pymbolic_reference_gradient(leaf_element, restriction, number, context=''): name = "js_{}".format(FEM_name_mangling(leaf_element)) name = restricted_name(name, restriction) evaluate_reference_gradient(leaf_element, name, restriction) - iname = lfs_inames(leaf_element, restriction, number, context=context)[0] + inames = lfs_inames(leaf_element, restriction, number, context=context) - return prim.Subscript(prim.Variable(name), (prim.Variable(iname), 0)) + return prim.Subscript(prim.Variable(name), (tensor_index_to_sequential_index(inames, leaf_element.degree() + 1), 0)) diff --git a/python/dune/perftool/blockstructured/spaces.py b/python/dune/perftool/blockstructured/spaces.py index 3991050bad79d787d21c52a35c83e6f550faf939..1f39a9d2ee51c228f9dc4acb96c7530ad9dea3ce 100644 --- a/python/dune/perftool/blockstructured/spaces.py +++ b/python/dune/perftool/blockstructured/spaces.py @@ -14,7 +14,16 @@ def lfs_inames(element, restriction, count=None, context=''): lfs = name_leaf_lfs(element, restriction) - name = "micro_{}_{}_index".format(lfs, context) - domain(name, pow(element.degree() + 1, world_dimension())) + # register transformation + # warning: this will register the transformation a couple of times + from dune.perftool.generation import transform + from dune.perftool. blockstructured.transformations import blockstructured_iname_duplication + transform(blockstructured_iname_duplication) - return (name, ) + dim_names = ["x", "y", "z"] + [str(i) for i in range(4, world_dimension() + 1)] + name = "micro_{}_{}_index_".format(lfs, context) + inames = tuple() + for i in range(world_dimension()): + inames = inames + (name + dim_names[i],) + domain(name + dim_names[i], element.degree() + 1) + return inames diff --git a/python/dune/perftool/blockstructured/tools.py b/python/dune/perftool/blockstructured/tools.py index 029b0f752b6c5f37854550588070f6072a7de977..a9bf01f26f5a52daa91f6cbbcf01bd31dffabb36 100644 --- a/python/dune/perftool/blockstructured/tools.py +++ b/python/dune/perftool/blockstructured/tools.py @@ -21,11 +21,11 @@ import pymbolic.primitives as prim def sub_element_inames(): name = "subel" dim = local_dimension() + dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)] inames = tuple() for i in range(dim): - name_counted = get_counted_variable(name) - inames = inames + (name_counted,) - domain(name_counted, get_form_option("number_of_blocks")) + inames = inames + ("subel_" + dim_names[i],) + domain("subel_" + dim_names[i], get_form_option("number_of_blocks")) return inames @@ -97,16 +97,16 @@ def sub_facet_inames(): # compute sequential index for given tensor index, the same as index in base-k to base-10 def tensor_index_to_sequential_index(indices, k): - return prim.Sum(tuple(index * k ** i for i, index in enumerate(indices))) + return prim.Sum(tuple(prim.Variable(index) * k ** i for i, index in enumerate(indices))) # compute tensor index for given sequential index, the same as index in base-10 to base-k -def to_tensor_index(iname, k): +def sequential_index_to_tensor_index(iname, k): return tuple(prim.Remainder(prim.Variable(iname) / (k**i), k) for i in range(world_dimension())) # compute index for higher order FEM for a given Q1 index -def micro_index_to_macro_index(element, iname): +def micro_index_to_macro_index(element, inames): it = get_global_context_value("integral_type") if it == "cell": subelem_inames = sub_element_inames() @@ -115,7 +115,5 @@ def micro_index_to_macro_index(element, iname): k = get_form_option("number_of_blocks") p = element.degree() - modified_index = prim.Sum((tensor_index_to_sequential_index(to_tensor_index(iname, p + 1), p * k + 1), - tensor_index_to_sequential_index(tuple(prim.Variable(iname) * p for iname in subelem_inames), p * k + 1))) - - return modified_index + return prim.Sum(tuple((p * prim.Variable(si) + prim.Variable(bi)) * (p * k + 1) ** i + for i, (si, bi) in enumerate(zip(subelem_inames, inames)))) diff --git a/python/dune/perftool/blockstructured/transformations.py b/python/dune/perftool/blockstructured/transformations.py new file mode 100644 index 0000000000000000000000000000000000000000..ad010e4579e310e03e4a8d8f9a592d380735256c --- /dev/null +++ b/python/dune/perftool/blockstructured/transformations.py @@ -0,0 +1,26 @@ +from loopy import (has_schedulable_iname_nesting, + get_iname_duplication_options, + duplicate_inames, + ) + + +def blockstructured_iname_duplication(kernel): + # If the given kernel is schedulable, nothing needs to be done. + if has_schedulable_iname_nesting(kernel): + return kernel + + # group inames with the same base name for duplication + duplication_options = dict(get_iname_duplication_options(kernel)) + suffixes = ['x', 'y', 'z'] + for iname in duplication_options: + if iname[-1] in suffixes: + dup_kernel = kernel + base = iname[:-1] + for s in suffixes: + if base + s in duplication_options: + dup_kernel = duplicate_inames(dup_kernel, base + s, duplication_options[base + s]) + del duplication_options[base + s] + if has_schedulable_iname_nesting(dup_kernel): + return dup_kernel + + raise NotImplementedError("Your kernel needs multiple iname duplications! No generic algorithm implemented for that yet! (#39)") diff --git a/python/dune/perftool/blockstructured/vectorization.py b/python/dune/perftool/blockstructured/vectorization.py new file mode 100644 index 0000000000000000000000000000000000000000..d711719b5f516b1b35379ee05ad31bc659afa55e --- /dev/null +++ b/python/dune/perftool/blockstructured/vectorization.py @@ -0,0 +1,320 @@ +import loopy as lp +import numpy as np +import pymbolic.primitives as prim +from dune.perftool.loopy.target import dtype_floatingpoint +from dune.perftool.loopy.temporary import DuneTemporaryVariable +from dune.perftool.loopy.symbolic import substitute +from dune.perftool.loopy.vcl import get_vcl_type_size, VCLPermute, VCLLoad, VCLStore +from dune.perftool.options import get_form_option +from dune.perftool.pdelab.argument import PDELabAccumulationFunction +from dune.perftool.pdelab.geometry import world_dimension +from dune.perftool.tools import get_pymbolic_indices + + +def add_vcl_temporaries(knl): + vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] + + # add new temporaries for vectors + # hope one read insn doesn't have two different reads from the same temporary + new_vec_temporaries = dict() + new_insns = [] + vec_iname = 'write_vec_once' + from islpy import BasicSet + domain = BasicSet("{{ [{0}] : 0<={0}<{1} }}".format(vec_iname, get_vcl_type_size(dtype_floatingpoint()))) + for alias in vector_alias: + vector_name = alias.replace('alias', 'vec') + new_vec_temporaries[vector_name] = DuneTemporaryVariable(vector_name, dtype=np.float64, + shape=(get_vcl_type_size(np.float64),), managed=True, + scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) + # write once to the vector such that loopy won't complain + new_insns.append(lp.Assignment(assignee=prim.Subscript(prim.Variable(vector_name), prim.Variable(vec_iname)), + expression=0, within_inames=frozenset({vec_iname}), + id='write_' + vector_name)) + + from loopy.kernel.data import VectorizeTag + return knl.copy(instructions=knl.instructions + new_insns, domains=knl.domains + [domain], + temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries), + iname_to_tag=dict(**knl.iname_to_tag, **{vec_iname: VectorizeTag()})) + + +def add_vcl_accum_insns(knl, iname_inner, iname_outer): + nptype = dtype_floatingpoint() + vcl_size = get_vcl_type_size(np.float64) + + from loopy.match import Tagged + accum_insns = set(lp.find_instructions(knl, Tagged('accum'))) + + new_insns = [] + vng = knl.get_var_name_generator() + idg = knl.get_instruction_id_generator() + new_vec_temporaries = dict() + for insn in knl.instructions: + # somehow CInstructions are not hashable.... + if isinstance(insn, lp.MultiAssignmentBase) and insn in accum_insns: + # write accum expr as "r = expr + r" + expr_without_r = prim.Sum(tuple(e for e in insn.expression.children if not e == insn.assignee)) + + inames_micro = set((i for i in insn.within_inames if i.startswith('micro'))) + iname_ix = next((i for i in inames_micro if i.endswith("_x"))) + + # need inames for head and tail handling a priori + from loopy.match import Not, All + replace_head_inames = dict() + replace_tail_inames = dict() + for iname in inames_micro - frozenset({iname_ix}): + head = iname + '_head' + tail = iname + '_tail' + replace_head_inames[iname] = prim.Variable(head) + replace_tail_inames[iname] = prim.Variable(tail) + knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_tail') + knl = lp.duplicate_inames(knl, iname, Not(All()), suffix='_head') + inames_head = frozenset((var.name for var in replace_head_inames.values())) + inames_tail = frozenset((var.name for var in replace_tail_inames.values())) + + # erstelle a[iy] und b + identifier_left = vng('left_node') + identifier_right = vng('right_node') + new_vec_temporaries[identifier_left] = DuneTemporaryVariable(identifier_left, dtype=np.float64, + shape=(2,) * (world_dimension() - 1) + + (vcl_size,), + managed=True, scope=lp.temp_var_scope.PRIVATE, + dim_tags=('f',) * (world_dimension() - 1) + + ('vec',)) + new_vec_temporaries[identifier_right] = DuneTemporaryVariable(identifier_right, dtype=np.float64, + shape=(vcl_size,), managed=True, + scope=lp.temp_var_scope.PRIVATE, dim_tags=('vec',)) + + var_left = prim.Subscript(prim.Variable(identifier_left), + tuple(prim.Variable(i) for i in sorted(inames_micro - frozenset({iname_ix}))) + + (prim.Variable(iname_inner),)) + var_right = prim.Subscript(prim.Variable(identifier_right), (prim.Variable(iname_inner),)) + + # init a + id_init_a = idg('insn_init_' + identifier_left) + new_insns.append(lp.Assignment(assignee=substitute(var_left, replace_head_inames), + expression=0, + id=id_init_a, + within_inames=(insn.within_inames - frozenset({iname_outer}) - + inames_micro) | inames_head, + tags=frozenset({'head'}))) + + # setze werte für a und b + expr_right = substitute(expr_without_r, {iname_ix: 1}) + expr_left = prim.Sum((substitute(expr_without_r, {iname_ix: 0}), var_left)) + + id_set_left = idg('insn_' + identifier_left) + id_set_right = idg('insn_' + identifier_right) + new_insns.append(lp.Assignment(assignee=var_right, + expression=expr_right, + id=id_set_right, + depends_on=insn.depends_on, + within_inames=insn.within_inames - frozenset({iname_ix}))) + new_insns.append(lp.Assignment(assignee=var_left, + expression=expr_left, + id=id_set_left, + depends_on=insn.depends_on | frozenset({id_init_a}), + within_inames=insn.within_inames - frozenset({iname_ix}))) + + # r+=a[iy] + id_accum = idg('insn_mod_accum') + expr_accum = prim.Sum((var_left, + prim.Call(VCLPermute(nptype, vcl_size, (-1,) + tuple(range(vcl_size - 1))), + (var_right,)), + substitute(insn.assignee, {iname_ix: 0}))) + new_insns.append(lp.Assignment(assignee=substitute(insn.assignee, {iname_ix: 0}), + expression=expr_accum, + id=id_accum, + depends_on=insn.depends_on | frozenset({id_set_left, + id_init_a, id_set_right}), + within_inames=insn.within_inames - frozenset({iname_ix}), + tags=frozenset({'accum'}))) + # a[iy] = permute + id_permute = idg('insn_permute') + expr_permute = prim.Call(VCLPermute(nptype, vcl_size, (vcl_size - 1,) + (-1,) * (vcl_size - 1)), + (var_right,)) + new_insns.append(lp.Assignment(assignee=var_left, + expression=expr_permute, + id=id_permute, + depends_on=insn.depends_on | frozenset({id_set_left, id_init_a, id_set_right, + id_accum}), + within_inames=insn.within_inames - frozenset({iname_ix}) + )) + + # tail handling, uses tail alias + id_accum_tail = idg('insn_accum_tail') + subst_map = {iname_inner: vcl_size - 1, iname_outer: get_form_option("number_of_blocks") // vcl_size - 1, + iname_ix: 1, insn.assignee_name: prim.Variable(insn.assignee_name + '_tail'), + **replace_tail_inames} + assignee_tail = substitute(insn.assignee, subst_map) + expr_tail = prim.Sum((substitute(var_left, {iname_inner: 0, **replace_tail_inames}), assignee_tail)) + + new_insns.append(lp.Assignment(assignee=assignee_tail, + expression=expr_tail, + id=id_accum_tail, + depends_on=frozenset({id_accum, id_permute, id_set_left, id_init_a}), + within_inames=(insn.within_inames - frozenset({iname_inner, iname_outer}) - + inames_micro) | inames_tail, + tags=frozenset({'tail'}))) + else: + new_insns.append(insn) + + return knl.copy(instructions=new_insns, + temporary_variables=dict(**knl.temporary_variables, **new_vec_temporaries)) + + +def add_vcl_access(knl, iname_inner): + from loopy.match import Reads, Tagged + accum_insns = set((insn.id for insn in lp.find_instructions(knl, Tagged('accum')))) + read_insns = set((insn.id for insn in lp.find_instructions(knl, Reads('*alias')))) + + from loopy.symbolic import CombineMapper + from loopy.symbolic import IdentityMapper + + class AliasIndexCollector(CombineMapper, IdentityMapper): + def combine(self, values): + return sum(values, tuple()) + + def map_constant(self, expr): + return tuple() + + map_variable = map_constant + map_function_symbol = map_constant + map_loopy_function_identifier = map_constant + + def map_subscript(self, expr): + if expr.aggregate.name.endswith('alias'): + return expr.aggregate, expr.index_tuple + else: + return tuple() + + # add load instructions if the vector is read, based on the read instruction + # TODO brauche mehrere vectoren, falls in einer insn von einem alias mit unterschiedlichen idx gelesen wird + # muss dafür eigentlich nur den namen des vectors anpassen + idg = knl.get_instruction_id_generator() + aic = AliasIndexCollector() + load_insns = [] + read_dependencies = dict() + for id in read_insns: + insn = knl.id_to_insn[id] + + alias, index = aic(insn.expression) + name_alias = alias.name + name_vec = name_alias.replace('alias', 'vec') + + # compute index without vec iname + strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) + index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) + if i != 0 and i.name != iname_inner)) + + # add load instruction + load_id = idg('insn_' + name_vec + '_load') + call_load = prim.Call(VCLLoad(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) + load_insns.append(lp.CallInstruction(assignees=(), expression=call_load, + id=load_id, within_inames=insn.within_inames | insn.reduction_inames(),)) + read_dependencies.setdefault(id, set()) + read_dependencies[id].add(load_id) + + # add store instructions if the vector is written, based on the write instruction + store_insns = [] + for id in accum_insns: + insn = knl.id_to_insn[id] + + alias, index = aic(insn.expression) + name_alias = alias.name + name_vec = name_alias.replace('alias', 'vec') + + # flat index without vec iname + strides = tuple(tag.stride for tag in knl.arg_dict[name_alias].dim_tags) + index = prim.Sum(tuple(prim.Product((i, s)) for i, s in zip(index, strides) + if i != 0 and i.name != iname_inner)) + + # add store instruction + store_id = idg('insn_' + name_vec + '_store') + call_store = prim.Call(VCLStore(name_vec), (prim.Sum((prim.Variable(name_alias), index)),)) + store_insns.append(lp.CallInstruction(assignees=(), expression=call_store, + id=store_id, within_inames=insn.within_inames, + depends_on=insn.depends_on | frozenset({id}) | read_dependencies[id])) + + # replace alias with vcl vector, except for accumulation assignee + vector_alias = [a for a in knl.arg_dict if a.endswith('alias')] + dim = world_dimension() + dim_names = ["x", "y", "z"] + [str(i) for i in range(4, dim + 1)] + # remove CInstructions since loopy extract expects to get only assignments + knl_without_cinsn = knl.copy(instructions=[insn for insn in knl.instructions + if not isinstance(insn, lp.CInstruction)]) + for alias in vector_alias: + # Rename lhs which would match the substitution rule since loopy doesn't want substitutions as lhs + new_insns = [] + for insn in knl_without_cinsn.instructions: + if isinstance(insn, lp.Assignment) and isinstance(insn.assignee, prim.Subscript): + if insn.assignee.aggregate.name == alias: + new_insns.append(insn.copy(assignee=prim.Subscript(prim.Variable('dummy_' + alias), + insn.assignee.index_tuple))) + pass + else: + new_insns.append(insn) + else: + new_insns.append(insn) + knl_without_cinsn = knl_without_cinsn.copy(instructions=new_insns) + + # substitution rule for alias[ex_outer,ex_inner, ey, ix, iy] -> vec[ex_inner] + parameters = 'ex_o,ex_i,' + ','.join(['e' + d for d in dim_names[1:dim]]) + \ + ',ix,' + ','.join(['i' + d for d in dim_names[1:dim]]) + knl_without_cinsn = lp.extract_subst(knl_without_cinsn, alias + '_subst', '{}[{}]'.format(alias, parameters), + parameters=parameters) + new_subst = knl_without_cinsn.substitutions.copy() + rule = new_subst[alias + '_subst'] + rule.expression = prim.Subscript(prim.Variable(alias.replace('alias', 'vec')), (prim.Variable('ex_i'),)) + knl_without_cinsn = knl_without_cinsn.copy(substitutions=new_subst) + + from loopy.match import All + knl_without_cinsn = lp.expand_subst(knl_without_cinsn, All()) + knl = knl_without_cinsn.copy(instructions=knl_without_cinsn.instructions + [insn for insn in knl.instructions + if isinstance(insn, lp.CInstruction)]) + + # add store and load dependencies and set right accumulation assignee + new_insns = [] + for insn in knl.instructions: + if insn.id not in read_insns | accum_insns: + new_insns.append(insn) + else: + if insn.id in accum_insns: + assignee_alias = insn.assignee + try: + assignee_vec = next((expr for expr in insn.expression.children + if isinstance(expr, prim.Subscript) and + expr.aggregate.name.replace('vec', 'alias') == + assignee_alias.aggregate.name.replace('dummy_', ''))) + except StopIteration: + from dune.perftool.error import PerftoolVectorizationError + raise PerftoolVectorizationError + new_insns.append(insn.copy(assignee=assignee_vec, + depends_on=insn.depends_on | read_dependencies[insn.id])) + else: + new_insns.append(insn.copy(depends_on=insn.depends_on | read_dependencies[insn.id])) + + return knl.copy(instructions=new_insns + load_insns + store_insns) + + +def find_accumulation_inames(knl): + inames = set() + for insn in knl.instructions: + if any((n.startswith('r_') and n.endswith('alias') for n in insn.write_dependency_names())): + inames |= insn.within_inames + + inames = set((i for i in inames if i.startswith('micro') and not i.endswith('_x'))) + + return inames + + +def vectorize_micro_elements(knl): + if "subel_x" in knl.all_inames(): + vcl_size = get_vcl_type_size(np.float64) + knl = lp.split_iname(knl, "subel_x", vcl_size, inner_tag='vec') + array_alias = [a for a in knl.arg_dict.keys() if a.endswith('alias') or a.endswith('tail')] + knl = lp.split_array_axis(knl, array_alias, 0, vcl_size) + + knl = add_vcl_temporaries(knl) + knl = add_vcl_accum_insns(knl, 'subel_x_inner', 'subel_x_outer') + knl = add_vcl_access(knl, 'subel_x_inner') + return knl diff --git a/python/dune/perftool/loopy/target.py b/python/dune/perftool/loopy/target.py index 22a0c317295ae93976c364bd394022447d7e6d43..bb9bf793b3adf64f3c29fa5883a97762d0438ee2 100644 --- a/python/dune/perftool/loopy/target.py +++ b/python/dune/perftool/loopy/target.py @@ -8,7 +8,7 @@ from dune.perftool.loopy.vcl import VCLTypeRegistry from dune.perftool.generation import (include_file, retrieve_cache_functions, ) -from dune.perftool.options import get_option +from dune.perftool.options import get_form_option, get_option from dune.perftool.tools import round_to_multiple from loopy.symbolic import Literal @@ -209,6 +209,25 @@ class DuneASTBuilder(CASTBuilder): return ret +class BlockstructuredDuneExpressionToCExpressionMapper(DuneExpressionToCExpressionMapper): + def map_variable(self, expr, type_context): + arg = self.kernel.arg_dict.get(expr.name) + # allow the use of pointers + if isinstance(arg, DuneGlobalArg) and expr.name.endswith('alias'): + return prim.Variable(expr.name) + else: + return ExpressionToCExpressionMapper.map_variable(self, expr, type_context) + + +class BlockstructuredDuneASTBuilder(DuneASTBuilder): + def get_expression_to_c_expression_mapper(self, codegen_state): + return BlockstructuredDuneExpressionToCExpressionMapper(codegen_state) + + def add_vector_access(self, access_expr, index): + # In the vectorized blockstructured case I need read access to an element of an vector + return prim.Subscript(access_expr, index) + + class DuneTarget(TargetBase): def __init__(self, declare_temporaries=True): # Set fortran_abi to allow reusing CASTBuilder for the moment @@ -222,7 +241,10 @@ class DuneTarget(TargetBase): return DummyHostASTBuilder(self) def get_device_ast_builder(self): - return DuneASTBuilder(self) + if get_form_option("blockstructured"): + return BlockstructuredDuneASTBuilder(self) + else: + return DuneASTBuilder(self) def dtype_to_typename(self, dtype): if dtype.dtype.kind == "V": diff --git a/python/dune/perftool/loopy/transformations/duplicate.py b/python/dune/perftool/loopy/transformations/duplicate.py index eecdd9a8a16523a8b9e0422784734c7a58ab2f24..7d5f3d96e8fc6a46ab0a55e3c54ce85e72795bf5 100644 --- a/python/dune/perftool/loopy/transformations/duplicate.py +++ b/python/dune/perftool/loopy/transformations/duplicate.py @@ -1,5 +1,4 @@ """ Iname duplication strategies to make kernels schedulable """ - from loopy import (has_schedulable_iname_nesting, get_iname_duplication_options, duplicate_inames, diff --git a/python/dune/perftool/loopy/vcl.py b/python/dune/perftool/loopy/vcl.py index f76f7881cf76e4360d2b6b8c0e691cfcad1488c1..345dec931596c07c8641a03f5c0c5035f86d06b6 100644 --- a/python/dune/perftool/loopy/vcl.py +++ b/python/dune/perftool/loopy/vcl.py @@ -91,6 +91,21 @@ def vcl_cast_mangler(knl, func, arg_dtypes): return lp.CallMangleInfo(func.name, (lp.types.NumpyType(func.nptype),), (arg_dtypes[0],)) +class VCLPermute(lp.symbolic.FunctionIdentifier): + def __init__(self, nptype, vector_width, permutation): + self.nptype = nptype + self.vector_width = vector_width + self.permutation = permutation + + def __getinitargs__(self): + return (self.nptype, self.vector_width, self.permutation) + + @property + def name(self): + return "permute{}<{}>".format(get_vcl_typename(self.nptype, vector_width=self.vector_width)[-2:], + ','.join(map(str, self.permutation))) + + @function_mangler def vcl_function_mangler(knl, func, arg_dtypes): if func == "mul_add": @@ -108,3 +123,41 @@ def vcl_function_mangler(knl, func, arg_dtypes): vcl = lp.types.NumpyType(get_vcl_type(dtype)) include_file("dune/perftool/sumfact/horizontaladd.hh", filetag="operatorfile") return lp.CallMangleInfo(func, (lp.types.NumpyType(dtype.dtype),), (vcl,)) + + if isinstance(func, VCLPermute): + dtype = arg_dtypes[0] + vcl = lp.types.NumpyType(get_vcl_type(dtype)) + return lp.CallMangleInfo(func.name, (vcl,), (vcl,)) + + +class VCLLoad(lp.symbolic.FunctionIdentifier): + def __init__(self, vec): + self.vec = vec + + def __getinitargs__(self): + return (self.vec,) + + @property + def name(self): + return "{}.load".format(self.vec) + + +class VCLStore(lp.symbolic.FunctionIdentifier): + def __init__(self, vec): + self.vec = vec + + def __getinitargs__(self): + return (self.vec,) + + @property + def name(self): + return "{}.store".format(self.vec) + + +@function_mangler +def vcl_store_and_load_mangler(knl, func, arg_dtypes): + if isinstance(func, VCLLoad): + return lp.CallMangleInfo(func.name, (), (lp.types.NumpyType(np.int32),)) + + if isinstance(func, VCLStore): + return lp.CallMangleInfo(func.name, (), (lp.types.NumpyType(np.int32),)) diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 229b989002dd417ddb67d451f6c0de11f2bfc124..1ecc85c5903b69f83953502e8afa485f5a962702 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -92,6 +92,7 @@ class PerftoolFormOptionsArray(ImmutableRecord): precompute_quadrature_info = PerftoolOption(default=True, helpstr="compute quadrature points and weights in the constructor of the local operator") blockstructured = PerftoolOption(default=False, helpstr="Use block structure") number_of_blocks = PerftoolOption(default=1, helpstr="Number of sub blocks in one direction") + vectorization_blockstructured = PerftoolOption(default=False, helpstr="Vectorize block structuring") adjoint = PerftoolOption(default=False, helpstr="Generate adjoint operator") control = PerftoolOption(default=False, helpstr="Generate operator of derivative w.r.t. the control variable") objective_function = PerftoolOption(default=None, helpstr="Name of form representing the objective function in UFL file") diff --git a/python/dune/perftool/pdelab/basis.py b/python/dune/perftool/pdelab/basis.py index 2c724f8e4a716c46475434d4f1949de6ba512632..55c9a7ef7676c7647d485486d73440aed9d22aa2 100644 --- a/python/dune/perftool/pdelab/basis.py +++ b/python/dune/perftool/pdelab/basis.py @@ -168,18 +168,17 @@ def evaluate_coefficient(visitor, element, name, container, restriction, index): from ufl import FiniteElement assert isinstance(sub_element, FiniteElement) - temporary_variable(name, shape=(),) + temporary_variable(name, shape=(), managed=True) lfs = name_lfs(element, restriction, index) basis = visitor.interface.pymbolic_basis(sub_element, restriction, 0, context='trial') - basisindex, _ = get_pymbolic_indices(basis) - + basisindex = get_pymbolic_indices(basis)[:-1] if get_form_option("blockstructured"): from dune.perftool.blockstructured.argument import pymbolic_coefficient coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex) else: from dune.perftool.pdelab.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, basisindex) + coeff = pymbolic_coefficient(container, lfs, basisindex[0]) assignee = Variable(name) @@ -200,16 +199,13 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction from ufl import FiniteElement assert isinstance(sub_element, FiniteElement) - temporary_variable(name, - shape=(element.cell().geometric_dimension(),), - shape_impl=("arr",), - ) + temporary_variable(name, shape=(element.cell().geometric_dimension(),), managed=True) dimindex = component_iname(count=0) lfs = name_lfs(element, restriction, index) basis = visitor.interface.pymbolic_reference_gradient(sub_element, restriction, 0, context='trialgrad') - basisindex, _ = get_pymbolic_indices(basis) + basisindex = get_pymbolic_indices(basis)[:-1] from dune.perftool.tools import maybe_wrap_subscript basis = maybe_wrap_subscript(basis, Variable(dimindex)) @@ -218,7 +214,7 @@ def evaluate_coefficient_gradient(visitor, element, name, container, restriction coeff = pymbolic_coefficient(container, lfs, sub_element, basisindex) else: from dune.perftool.pdelab.argument import pymbolic_coefficient - coeff = pymbolic_coefficient(container, lfs, basisindex) + coeff = pymbolic_coefficient(container, lfs, basisindex[0]) assignee = Subscript(Variable(name), (Variable(dimindex),)) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index c89a23cf1f8970213825806804015fc861797ea1..a4bd0201c10f681499b6dbd450b585306ca4ec94 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -262,7 +262,7 @@ def determine_accumulation_space(info, number): if get_form_option("blockstructured"): from dune.perftool.blockstructured.tools import micro_index_to_macro_index from dune.perftool.blockstructured.spaces import lfs_inames - lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, info.restriction, count=number)[0]) + lfsi = micro_index_to_macro_index(subel, lfs_inames(subel, info.restriction, count=number)) else: from dune.perftool.pdelab.spaces import lfs_inames lfsi = Variable(lfs_iname(subel, info.restriction, count=number)) @@ -275,6 +275,7 @@ def determine_accumulation_space(info, number): return AccumulationSpace(lfs=lfs, index=lfsi, restriction=info.restriction, + element=element ) @@ -529,7 +530,6 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin name=name, lang_version=(2017, 2, 1), ) - from loopy import make_reduction_inames_unique kernel = make_reduction_inames_unique(kernel) @@ -547,6 +547,9 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin kernel = vectorize_quadrature_loop(kernel) else: raise NotImplementedError("Only vectorizing sumfactorized code right now!") + if get_form_option("vectorization_blockstructured"): + from dune.perftool.blockstructured.vectorization import vectorize_micro_elements + kernel = vectorize_micro_elements(kernel) # Now add the preambles to the kernel preambles = [(i, p) for i, p in enumerate(retrieve_cache_items("{} and preamble".format(tag)))] diff --git a/python/dune/perftool/tools.py b/python/dune/perftool/tools.py index 4a95dbced5e24d7efd0ddadc422511a1fc1362cc..d5c0a18ebe8f7e32316aa459c959ab1eb9ba75f8 100644 --- a/python/dune/perftool/tools.py +++ b/python/dune/perftool/tools.py @@ -24,13 +24,15 @@ def get_pymbolic_indices(expr): def ind(i): if isinstance(i, int): - return i - return get_pymbolic_basename(i) + return (i,) + elif isinstance(i, prim.Sum) or isinstance(i, prim.Product): + return sum((ind(c) for c in i.children if isinstance(c, prim.Expression)), ()) + return (get_pymbolic_basename(i),) if not isinstance(expr.index, tuple): return (ind(expr.index),) - return tuple(ind(i) for i in expr.index) + return sum((ind(i) for i in expr.index), ()) def maybe_wrap_subscript(expr, indices): diff --git a/test/blockstructured/poisson/CMakeLists.txt b/test/blockstructured/poisson/CMakeLists.txt index 9355f4364cd703d6ed808a25075657aeb254f3cf..721cadc2ef9672d4ae6f8af6ef5479478c76df50 100644 --- a/test/blockstructured/poisson/CMakeLists.txt +++ b/test/blockstructured/poisson/CMakeLists.txt @@ -18,4 +18,9 @@ dune_add_formcompiler_system_test(UFLFILE poisson_neumann.ufl dune_add_formcompiler_system_test(UFLFILE poisson.ufl BASENAME blockstructured_poisson_matrix_free INIFILE poisson_matrix_free.mini + ) + +dune_add_formcompiler_system_test(UFLFILE poisson_vec.ufl + BASENAME blockstructured_poisson_vec + INIFILE poisson_vec.mini ) \ No newline at end of file diff --git a/test/blockstructured/poisson/poisson_vec.mini b/test/blockstructured/poisson/poisson_vec.mini new file mode 100644 index 0000000000000000000000000000000000000000..8d7e2d9774274243f8c7b976355a34f8487ad581 --- /dev/null +++ b/test/blockstructured/poisson/poisson_vec.mini @@ -0,0 +1,18 @@ +__name = blockstructured_poisson_vec + +cells = 5 5 +extension = 1. 1. + +[wrapper.vtkcompare] +name = {__name} +reference = poisson_ref +extension = vtu + +[formcompiler] +compare_l2errorsquared = 1e-7 + +[formcompiler.r] +numerical_jacobian = 1 +blockstructured = 1 +number_of_blocks = 16 +vectorization_blockstructured = 1 \ No newline at end of file diff --git a/test/blockstructured/poisson/poisson_vec.ufl b/test/blockstructured/poisson/poisson_vec.ufl new file mode 100644 index 0000000000000000000000000000000000000000..a0ed7f33138b8689a78cd193659439a0691a3d8c --- /dev/null +++ b/test/blockstructured/poisson/poisson_vec.ufl @@ -0,0 +1,14 @@ +cell = quadrilateral +x = SpatialCoordinate(cell) + +g = x[0]**2 + x[1]**2 +f = -4 + +V = FiniteElement("CG", cell, 1) +u = TrialFunction(V) +v = TestFunction(V) + +r =(inner(grad(u), grad(v)) - f*v)*dx +interpolate_expression = g +exact_solution = g +is_dirichlet = 1