diff --git a/python/dune/perftool/generation/loopy.py b/python/dune/perftool/generation/loopy.py index a5e073cd08644ec7ca709c26577491657f2a7b8f..680b1ec48ac9f2bf77f96c92826dc71d9ef60a06 100644 --- a/python/dune/perftool/generation/loopy.py +++ b/python/dune/perftool/generation/loopy.py @@ -89,7 +89,7 @@ def c_instruction_impl(**kw): @generator_factory(item_tags=("instruction", "exprinstruction"), context_tags="kernel", - cache_key_generator=lambda *a, **kw: kw['expression'], + cache_key_generator=lambda *a, **kw: (kw['assignee'], kw['expression']), ) def expr_instruction_impl(**kw): if 'assignees' in kw: @@ -110,8 +110,11 @@ def _insn_cache_key(code=None, expression=None, **kwargs): if code is not None: return code if expression is not None: - return expression - raise ValueError("Please specify either code or expression for instruction!") + if 'assignees' in kwargs: + return (kwargs['assignees'], expression) + if 'assignee' in kwargs: + return (kwargs['assignee'], expression) + raise ValueError("Error caching instruction, missing information about assignee") @generator_factory(item_tags=("insn_id",), @@ -132,6 +135,9 @@ def instruction(code=None, expression=None, **kwargs): if 'assignees' in kwargs and len(kwargs['assignees']) == 0: call_instruction_impl(id=id, expression=expression, **kwargs) else: + if 'assignees' in kwargs: + assert 'assignee' not in kwargs + kwargs['assignee'] = kwargs['assignees'] expr_instruction_impl(id=id, expression=expression, **kwargs) # return the ID, as it is the only useful information to the user @@ -167,15 +173,29 @@ def barrier(**kwargs): return name -def loopy_class_member(name, classtag=None, **kwargs): +def loopy_class_member(name, classtag=None, potentially_vectorized=False, **kwargs): """ A class member is based on loopy! It is an * temporary variable of the constructor kernel * A globalarg of the requesting kernel (to make things pass) """ assert classtag + assert "base_storage" not in kwargs + + if potentially_vectorized: + base = "{}_base".format(name) + kwargs["base_storage"] = base + + # Check if this is vectorized, if so, change name to vec + dim_tags = kwargs.get("dim_tags", "") + if dim_tags and dim_tags.split(',')[-1] == "vec": + name = "{}_vec".format(name) + temporary_variable(name, kernel=classtag, **kwargs) silenced_warning("read_no_write({})".format(name), kernel=classtag) kwargs.pop("decl_method", None) + kwargs.pop("base_storage", None) # TODO I guess some filtering has to be applied here. globalarg(name, **kwargs) + + return name diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py index 4f5832669bd876c3601a1ffb90294f32b4efff6a..af0fc6973a96b9df8dd4e9db8cb2e5392e770249 100644 --- a/python/dune/perftool/loopy/buffer.py +++ b/python/dune/perftool/loopy/buffer.py @@ -38,13 +38,9 @@ class FlipFlopBuffer(object): formdata = get_global_context_value('formdata') dim = formdata.geometric_dimension - # TODO: doc! - storage_shape = (self.base_storage_size,) + (1,) * (dim - 1) - # Construct the temporary and return it temporary_variable(name, base_storage=base, - storage_shape=storage_shape, managed=True, **kwargs ) diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py index 0c9d1f967521e770e21e9d77c67df663c1231165..23415aac89afd84a66a94d110f3d899682a9bf13 100644 --- a/python/dune/perftool/loopy/symbolic.py +++ b/python/dune/perftool/loopy/symbolic.py @@ -15,21 +15,37 @@ import pymbolic.primitives as prim class SumfactKernel(prim.Variable): - def __init__(self, a_matrices, buffer, insn_dep=frozenset({}), additional_inames=frozenset({})): + def __init__(self, + a_matrices, + buffer, + stage=1, + insn_dep=frozenset({}), + additional_inames=frozenset({}), + preferred_interleaving_position=0, + setup_method=None, + input_temporary=None, + ): self.a_matrices = a_matrices self.buffer = buffer + self.stage = stage self.insn_dep = insn_dep self.additional_inames = additional_inames + self.preferred_interleaving_position = preferred_interleaving_position + self.setup_method = setup_method + self.input_temporary = input_temporary + + if setup_method is not None: + assert isinstance(setup_method, tuple) and len(setup_method) == 2 prim.Variable.__init__(self, "SUMFACT") def __getinitargs__(self): - return (self.a_matrices, self.buffer, self.insn_dep, self.additional_inames) + return (self.a_matrices, self.buffer, self.stage, self.insn_dep, self.additional_inames, self.preferred_interleaving_position, self.setup_method, self.input_temporary) def stringifier(self): return lp.symbolic.StringifyMapper - init_arg_names = ("a_matrices", "buffer", "insn_dep", "additional_inames") + init_arg_names = ("a_matrices", "buffer", "stage", "insn_dep", "additional_inames", "preferred_interleaving_position", "setup_method", "input_temporary") mapper_method = "map_sumfact_kernel" diff --git a/python/dune/perftool/options.py b/python/dune/perftool/options.py index 16d45c59cb02ffb9aaa4a9f83bc02e364e2e78ab..120e5202732fb9884f65626da9e3a38481aaf1f7 100644 --- a/python/dune/perftool/options.py +++ b/python/dune/perftool/options.py @@ -43,7 +43,8 @@ def get_form_compiler_arguments(): parser.add_argument("--project-basedir", type=str, help="The base (build) directory of the dune-perftool project") # TODO at some point this help description should be updated parser.add_argument("--sumfact", action="store_true", help="Use sumfactorization") - parser.add_argument("--vectorize", action="store_true", help="whether to generate code with explicit vectorization") + parser.add_argument("--vectorize-quad", action="store_true", help="whether to generate code with explicit vectorization") + parser.add_argument("--vectorize-grads", action="store_true", help="whether to generate code with explicit vectorization") # Modify the positional argument to not be a list args = vars(parser.parse_args()) diff --git a/python/dune/perftool/pdelab/localoperator.py b/python/dune/perftool/pdelab/localoperator.py index e5c3b0b4dc5011c087b3ea97ad6e67be556949da..c1455f35f71a8cb82103df1662870cc39b4992a3 100644 --- a/python/dune/perftool/pdelab/localoperator.py +++ b/python/dune/perftool/pdelab/localoperator.py @@ -29,6 +29,7 @@ from pymbolic.primitives import Variable from pytools import Record import loopy as lp +import cgen def name_form(formdata, data): @@ -531,7 +532,7 @@ def extract_kernel_from_cache(tag): kernel = heuristic_duplication(kernel) # Maybe apply vectorization strategies - if get_option("vectorize"): + if get_option("vectorize_quad"): if get_option("sumfact"): # Vectorization of the quadrature loop insns = [i.id for i in lp.find_instructions(kernel, lp.match.Tagged("quadvec"))] @@ -668,7 +669,7 @@ def cgen_class_from_cache(tag, members=[]): allow_complex=False, is_generating_device_code=True, ) - decls = target.get_device_ast_builder().get_temporary_decls(codegen_state, 0) + decls = [cgen.Line("\n " + next(iter(d.generate()))) for d in target.get_device_ast_builder().get_temporary_decls(codegen_state, 0)] from dune.perftool.cgen import Class return Class(basename, base_classes=base_classes, members=[constructor] + members + pm + decls, tparam_decls=tparams) diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py index b039c4545cb6dd61e8a05843b9e45f8a8cefbc15..54303ecedeb42608d2054b9c91264ce3e973bbd7 100644 --- a/python/dune/perftool/sumfact/amatrix.py +++ b/python/dune/perftool/sumfact/amatrix.py @@ -28,23 +28,33 @@ from loopy import CallMangleInfo from loopy.symbolic import FunctionIdentifier from loopy.types import NumpyType -from pytools import Record, product +from pytools import ImmutableRecord, product import pymbolic.primitives as prim import numpy -class AMatrix(Record): +class AMatrix(ImmutableRecord): def __init__(self, rows, cols, transpose=False, derivative=False): - Record.__init__(self, - rows=rows, - cols=cols, - transpose=transpose, - derivative=derivative, - ) - - def __hash__(self): - return hash((self.transpose, self.derivative, self.rows, self.cols)) + ImmutableRecord.__init__(self, + rows=rows, + cols=cols, + transpose=transpose, + derivative=derivative, + ) + + +class LargeAMatrix(ImmutableRecord): + def __init__(self, rows, cols, transpose, derivative): + assert isinstance(transpose, tuple) + assert isinstance(derivative, tuple) + assert len(transpose) == len(derivative) + ImmutableRecord.__init__(self, + rows=rows, + cols=cols, + transpose=transpose, + derivative=derivative, + ) def quadrature_points_per_direction(): @@ -171,17 +181,23 @@ def polynomial_lookup_mangler(target, func, dtypes): return CallMangleInfo(func.name, (NumpyType(numpy.float64),), (NumpyType(numpy.int32), NumpyType(numpy.float64))) -def define_theta(name, shape, transpose, derivative): +def define_theta(name, shape, transpose, derivative, additional_indices=()): sort_quadrature_points_weights() polynomials = name_polynomials() qp = name_oned_quadrature_points() + dim_tags = "f,f" + if additional_indices: + dim_tags = dim_tags + ",c" + shape = shape + (4,) + loopy_class_member(name, dtype=numpy.float64, shape=shape, classtag="operator", - dim_tags="f,f", + dim_tags=dim_tags, managed=True, + potentially_vectorized=True, ) # TODO Enforce the alignment here! @@ -194,7 +210,7 @@ def define_theta(name, shape, transpose, derivative): else: args = (prim.Variable(j), prim.Subscript(prim.Variable(qp), (prim.Variable(i),))) - instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j))), + instruction(assignee=prim.Subscript(prim.Variable(name), (prim.Variable(i), prim.Variable(j)) + additional_indices), expression=prim.Call(PolynomialLookup(polynomials, derivative), args), kernel="operator", ) @@ -206,6 +222,25 @@ def name_theta(transpose=False, derivative=False): shape = (basis_functions_per_direction(), quadrature_points_per_direction()) else: shape = (quadrature_points_per_direction(), basis_functions_per_direction()) - globalarg(name, shape=shape, dtype=numpy.float64, dim_tags="f,f") define_theta(name, shape, transpose, derivative) return name + + +def name_large_theta(transpose=False, derivative=False): + ident = tuple("{}{}".format("d" if d else "", "T" if t else "") for t, d in zip(transpose, derivative)) + name = "ThetaLarge_{}".format("_".join(ident)) + if True in transpose: + shape = (basis_functions_per_direction(), quadrature_points_per_direction()) + else: + shape = (quadrature_points_per_direction(), basis_functions_per_direction()) + + for i, (t, d) in enumerate(zip(transpose, derivative)): + define_theta(name, shape, t, d, additional_indices=(i,)) + + return loopy_class_member(name, + classtag="operator", + dtype=numpy.float64, + dim_tags="f,f,vec", + shape=shape + (4,), + potentially_vectorized=True, + ) diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index 96f24219dc7244474d6c370728723f7280830eed..390acb59e5a7f4ba7ec2b724b59eb94f93848b5e 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -75,10 +75,10 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) # Add a sum factorization kernel that implements the # evaluation of the gradients of basis functions at quadrature # points (stage 1) - insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name) var = SumfactKernel(a_matrices, buffer_name, - insn_dep=frozenset({insn_dep}), + preferred_interleaving_position=i, + setup_method=(setup_theta, (element, restriction, component)) ) buffers.append(var) @@ -100,11 +100,8 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) def pymbolic_trialfunction_gradient(element, restriction, component): rawname = "gradu" + "_".join(str(c) for c in component) name = restricted_name(rawname, restriction) - from dune.perftool.pdelab.argument import name_coefficientcontainer - container = name_coefficientcontainer(restriction) sumfact_evaluate_coefficient_gradient(element, name, restriction, component) - from pymbolic.primitives import Variable - return Variable(name) + return prim.Variable(name) @kernel_cached @@ -128,10 +125,10 @@ def pymbolic_trialfunction(element, restriction, component): # Add a sum factorization kernel that implements the evaluation of # the basis functions at quadrature points (stage 1) - insn_dep = setup_theta(element, restriction, component, a_matrices, buffer_name) var = SumfactKernel(a_matrices, buffer_name, - insn_dep=frozenset({insn_dep}), + preferred_interleaving_position=dim, + setup_method=(setup_theta, (element, restriction, component)) ) return prim.Subscript(var, diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 35cb783e2e325efbe89796ccfae89d550300f1b6..e3042690f55007928716e0baeb806fa4e0ba9030 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -29,13 +29,18 @@ from dune.perftool.loopy.buffer import (get_buffer_temporary, from dune.perftool.sumfact.quadrature import nest_quadrature_loops from dune.perftool.pdelab.localoperator import determine_accumulation_space from dune.perftool.pdelab.restriction import restricted_name -from dune.perftool.pdelab.spaces import name_lfs +from dune.perftool.pdelab.spaces import (name_lfs, + name_lfs_bound, + ) from dune.perftool.sumfact.amatrix import (AMatrix, + LargeAMatrix, quadrature_points_per_direction, basis_functions_per_direction, + name_large_theta, name_theta, ) from dune.perftool.loopy.symbolic import SumfactKernel +from dune.perftool.tools import get_pymbolic_basename from dune.perftool.error import PerftoolError from pymbolic.primitives import (Call, Product, @@ -44,9 +49,10 @@ from pymbolic.primitives import (Call, ) from dune.perftool.sumfact.quadrature import quadrature_inames from loopy import Reduction, GlobalArg -from loopy.symbolic import FunctionIdentifier +from loopy.symbolic import FunctionIdentifier, IdentityMapper import loopy as lp +import numpy as np import pymbolic.primitives as prim from pytools import product @@ -72,13 +78,59 @@ def find_sumfact(expr): return HasSumfactMapper()(expr) -def expand_sumfact_kernels(insns): +class IndexFiddleMapper(IdentityMapper): + def __init__(self, var, index, pos): + assert isinstance(var, str) + self.var = var + self.index = index + if isinstance(index, str): + self.index = prim.Variable(index) + self.pos = pos + + def map_variable(self, expr): + if expr.name == self.var: + return prim.Subscript(expr, (self.var,)) + else: + return IdentityMapper.map_variable(self, expr) + + def map_subscript(self, expr): + if expr.aggregate.name == self.var: + ind = expr.index + if not isinstance(ind, tuple): + ind = (ind,) + if self.pos is None: + ind = ind + (self.index,) + else: + raise NotImplementedError + return prim.Subscript(expr.aggregate, ind) + else: + return IdentityMapper.map_subscript(self, expr) + + +def fiddle_in_index(expr, var, index, pos=None): + return IndexFiddleMapper(var, index, pos)(expr) + + +def default_resolution(insns): for insn in insns: if isinstance(insn, (lp.Assignment, lp.CallInstruction)): replace = {} deps = [] for sumf in find_sumfact(insn.expression): - var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, sumf.insn_dep, sumf.additional_inames) + # Maybe set up the input + if sumf.setup_method: + shape = product(mat.cols for mat in sumf.a_matrices) + shape = (shape,) + inp = get_buffer_temporary(sumf.buffer, shape=shape) + silenced_warning('read_no_write({})'.format(inp)) + + func, args = sumf.setup_method + insn_dep = frozenset({func(inp, *args)}) + else: + insn_dep = sumf.insn_dep + + # Call the sum factorization algorithm + var, dep = sum_factorization_kernel(sumf.a_matrices, sumf.buffer, insn_dep, sumf.additional_inames) replace[sumf] = prim.Variable(var) deps.append(dep) @@ -89,6 +141,98 @@ def expand_sumfact_kernels(insns): ) +def apply_sumfact_grad_vectorization(insns, stage): + sumfact_kernels = [] + for insn in insns: + if isinstance(insn, (lp.Assignment, lp.CallInstruction)): + sumf = find_sumfact(insn.expression) + if sumf: + sumf, = sumf + if sumf.stage == stage: + sumfact_kernels.append(sumf) + + # Now apply some heuristics when to vectorize... + if len(set(sumfact_kernels)) < 3: + default_resolution(insns) + else: + # Vectorize!!! + sumfact_kernels = sorted(sumfact_kernels, key=lambda s: s.preferred_interleaving_position) + # Pad to 4 + if len(sumfact_kernels) < 4: + sumfact_kernels.append(sumfact_kernels[0]) + + # Determine a name for the buffer in use + buffer = "_".join(sf.buffer for sf in sumfact_kernels) + insn_dep = frozenset() + + # Maybe initialize the input buffer + if sumfact_kernels[0].setup_method: + shape = product(mat.cols for mat in sumfact_kernels[0].a_matrices) + shape = (shape, 4) + initialize_buffer(buffer, base_storage_size=4 * shape[0]) + inp = get_buffer_temporary(buffer, shape=shape) + silenced_warning('read_no_write({})'.format(inp)) + + # Call the individual setup_methods + for i, sumf in enumerate(sumfact_kernels): + func, args = sumf.setup_method + insn_dep = insn_dep.union({func(inp, *args, additional_indices=(i,))}) + else: + # No setup method defined. We need to make sure the input is correctly setup + shape = tuple(mat.cols for mat in sumfact_kernels[0].a_matrices) + (4,) + initialize_buffer(buffer, base_storage_size=4 * shape[0]) + dim_tags = ",".join("f" * (len(shape) - 1)) + ",c" + inp = get_buffer_temporary(buffer, shape=shape, dim_tags=dim_tags) + + for i, sumf in enumerate(sumfact_kernels): + assert sumf.input_temporary + for insn in insns: + if isinstance(insn, lp.Assignment): + if get_pymbolic_basename(insn.assignee) == sumf.input_temporary: + built_instruction(insn.copy(assignee=prim.Subscript(prim.Variable(inp), insn.assignee.index + (i,)))) + from dune.perftool.generation.loopy import expr_instruction_impl + expr_instruction_impl._memoize_cache = {k: v for k, v in expr_instruction_impl._memoize_cache.items() if v.id != insn.id} + + # Determine the joined AMatrix + large_a_matrices = [] + for i in range(len(sumfact_kernels[0].a_matrices)): + assert len(set(tuple(sf.a_matrices[i].rows for sf in sumfact_kernels))) == 1 + assert len(set(tuple(sf.a_matrices[i].cols for sf in sumfact_kernels))) == 1 + large = LargeAMatrix(rows=sumfact_kernels[0].a_matrices[i].rows, + cols=sumfact_kernels[0].a_matrices[i].cols, + transpose=tuple(sf.a_matrices[i].transpose for sf in sumfact_kernels), + derivative=tuple(sf.a_matrices[i].derivative for sf in sumfact_kernels), + ) + large_a_matrices.append(large) + + # Join the instruction dependencies + insn_dep = insn_dep.union(*tuple(sf.insn_dep for sf in sumfact_kernels)) + + var, dep = sum_factorization_kernel(large_a_matrices, buffer, insn_dep, sumfact_kernels[0].additional_inames, add_vec_tag=True) + + for insn in insns: + if isinstance(insn, (lp.Assignment, lp.CallInstruction)): + sumfacts = find_sumfact(insn.expression) + if sumfacts: + sumf, = sumfacts + if sumf.stage == stage: + replace = {} + replace[sumf] = prim.Variable(var) + newexpr = substitute(insn.expression, replace) + newexpr = fiddle_in_index(newexpr, var, sumf.preferred_interleaving_position) + assert newexpr + built_instruction(insn.copy(expression=newexpr, + depends_on=dep, + )) + +def expand_sumfact_kernels(insns): + if get_option("vectorize_grads"): + apply_sumfact_grad_vectorization(insns, 1) + apply_sumfact_grad_vectorization(insns, 3) + else: + default_resolution(insns) + + def filter_sumfact_instructions(): """ Remove all instructions that contain a SumfactKernel node """ from dune.perftool.generation.loopy import expr_instruction_impl, call_instruction_impl @@ -109,19 +253,13 @@ def sumfact_iname(bound, _type): return name -def setup_theta(element, restriction, component, a_matrices, buffer_name): - number_basis = product(mat.cols for mat in a_matrices) - shape = (number_basis,) - inp = get_buffer_temporary(buffer_name, - shape=shape) - silenced_warning('read_no_write({})'.format(inp)) - +def setup_theta(inp, element, restriction, component, additional_indices=()): # Write initial coefficients into buffer - basisiname = sumfact_iname(number_basis, "basis") lfs = name_lfs(element, restriction, component) + basisiname = sumfact_iname(name_lfs_bound(lfs), "basis") container = name_coefficientcontainer(restriction) coeff = pymbolic_coefficient(container, lfs, basisiname) - assignee = Subscript(Variable(inp), (Variable(basisiname),)) + assignee = Subscript(Variable(inp), (Variable(basisiname),) + additional_indices) return instruction(assignee=assignee, expression=coeff, ) @@ -178,9 +316,10 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): a_matrices = [theta_matrix] * dim a_matrices[i] = dtheta_matrix a_matrices = tuple(a_matrices) - + pref_pos = i else: a_matrices = (theta_matrix,) * dim + pref_pos = dim # Initialize a base storage for this buffer and get a temporay pointing to it temp = initialize_buffer(buf, @@ -219,6 +358,9 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): buf, insn_dep=frozenset({contrib_dep}), additional_inames=frozenset(visitor.inames), + stage=3, + preferred_interleaving_position=pref_pos, + input_temporary=temp, ) inames = tuple(sumfact_iname(mat.rows, 'accum') for mat in a_matrices) @@ -258,7 +400,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): transform(nest_quadrature_loops, visitor.inames) -def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({})): +def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False): """ Calculate a sum factorization matrix product. @@ -278,6 +420,10 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional should depend upon. All following ones will depend on each other. """ + ftags = "f,f" + (",vec" if add_vec_tag else "") + ctags = "c,c" + (",vec" if add_vec_tag else "") + vec_shape = (4,) if add_vec_tag else () + insn_dep = frozenset({barrier(depends_on=insn_dep, within_inames=additional_inames, )}) @@ -288,8 +434,8 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional # this reinterprets the output of the previous iteration. inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:])) inp = get_buffer_temporary(buf, - shape=inp_shape, - dim_tags="f,f") + shape=inp_shape + vec_shape, + dim_tags=ftags) # The input temporary will only be read from, so we need to silence the loopy warning silenced_warning('read_no_write({})'.format(inp)) @@ -301,22 +447,34 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:])) out = get_buffer_temporary(buf, - shape=out_shape, - dim_tags="c,c") + shape=out_shape + vec_shape, + dim_tags=ctags) # Get the inames needed for one matrix-matrix multiplication i = sumfact_iname(out_shape[0], "row") j = sumfact_iname(out_shape[1], "col") k = sumfact_iname(a_matrix.cols, "red") + # Select the correct Theta Matrix for multiplication + if isinstance(a_matrix, AMatrix): + theta = name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative) + vec_iname = () + elif isinstance(a_matrix, LargeAMatrix): + theta = name_large_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative) + iname = sumfact_iname(4, "vec") + vec_iname = (prim.Variable(iname),) + transform(lp.tag_inames, [(iname, "vec")]) + else: + raise TypeError("What kind of an AMatrix is that?") + # Construct the matrix-matrix-multiplication expression a_ik*in_kj - prod = Product((Subscript(Variable(name_theta(transpose=a_matrix.transpose, derivative=a_matrix.derivative)), (Variable(i), Variable(k))), - Subscript(Variable(inp), (Variable(k), Variable(j))) + prod = Product((Subscript(Variable(theta), (Variable(i), Variable(k)) + vec_iname), + Subscript(Variable(inp), (Variable(k), Variable(j)) + vec_iname) )) # Issue the reduction instruction that implements the multiplication # at the same time store the instruction ID for the next instruction to depend on - insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j))), + insn_dep = frozenset({instruction(assignee=Subscript(Variable(out), (Variable(i), Variable(j)) + vec_iname), expression=Reduction("sum", k, prod), forced_iname_deps=frozenset({i, j}).union(additional_inames), forced_iname_deps_is_final=True, @@ -330,6 +488,11 @@ def sum_factorization_kernel(a_matrices, buf, insn_dep=frozenset({}), additional out_shape = tuple(mat.rows for mat in a_matrices) dim_tags = ",".join(['f'] * dim) + + if add_vec_tag: + out_shape = out_shape + vec_shape + dim_tags = dim_tags + ",c" + out = get_buffer_temporary(buf, shape=out_shape, dim_tags=dim_tags) diff --git a/python/pytools b/python/pytools index 6c5830f6e007f4ece99d75c83b0f72550cad80c2..7ed09d95be2bd2fb22392010cf28830fd43a30ca 160000 --- a/python/pytools +++ b/python/pytools @@ -1 +1 @@ -Subproject commit 6c5830f6e007f4ece99d75c83b0f72550cad80c2 +Subproject commit 7ed09d95be2bd2fb22392010cf28830fd43a30ca diff --git a/test/sumfact/mass/mass.mini b/test/sumfact/mass/mass.mini index c8729beade5e5f75d2f45619b3e33e5f8776292d..80594037a20b03ef2465080d6d7cdc62062b7c0e 100644 --- a/test/sumfact/mass/mass.mini +++ b/test/sumfact/mass/mass.mini @@ -13,5 +13,5 @@ extension = vtu [formcompiler] numerical_jacobian = 1, 0 | expand num -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec sumfact = 1 diff --git a/test/sumfact/mass/mass_3d.mini b/test/sumfact/mass/mass_3d.mini index 17da3e4f4747889c3538b2de8d3afaf96469345f..2948674e0acc6b34dd4e92fec728f449e38978c6 100644 --- a/test/sumfact/mass/mass_3d.mini +++ b/test/sumfact/mass/mass_3d.mini @@ -15,5 +15,5 @@ extension = vtu [formcompiler] numerical_jacobian = 1, 0 | expand num -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec sumfact = 1 diff --git a/test/sumfact/poisson/poisson_2d_order1.mini b/test/sumfact/poisson/poisson_2d_order1.mini index 339ae2c9b837bc78aace34be79fe03d54d82ab80..85e91b2135c564ac9ab743fec5502040b2c34a65 100644 --- a/test/sumfact/poisson/poisson_2d_order1.mini +++ b/test/sumfact/poisson/poisson_2d_order1.mini @@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-4 sumfact = 1 -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec diff --git a/test/sumfact/poisson/poisson_2d_order2.mini b/test/sumfact/poisson/poisson_2d_order2.mini index 8bdf424c46a66ae0a0e2b1a1da257eede4975789..5cdd15ff4bf39a8839ec1b1aa74f623ff4ad5585 100644 --- a/test/sumfact/poisson/poisson_2d_order2.mini +++ b/test/sumfact/poisson/poisson_2d_order2.mini @@ -17,4 +17,4 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-8 sumfact = 1 -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec diff --git a/test/sumfact/poisson/poisson_3d_order1.mini b/test/sumfact/poisson/poisson_3d_order1.mini index 6250546eafa4527491931539547a6e961003dc8a..a636bda7fe69f91ebfc99154ecedf2b90ab89d07 100644 --- a/test/sumfact/poisson/poisson_3d_order1.mini +++ b/test/sumfact/poisson/poisson_3d_order1.mini @@ -1,8 +1,10 @@ __name = sumfact_poisson_3d_order1_{__exec_suffix} -__exec_suffix = {diff_suffix}_{vec_suffix} +__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix} diff_suffix = numdiff, symdiff | expand num -vec_suffix = vec, nonvec | expand vec +vecq_suffix = quadvec, nonquadvec | expand vec_q +vecg_suffix = gradvec, nongradvec | expand vec_g +{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude cells = 8 8 8 extension = 1. 1. 1. @@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-4 sumfact = 1 -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec_q +vectorize_grads = 1, 0 | expand vec_g diff --git a/test/sumfact/poisson/poisson_3d_order2.mini b/test/sumfact/poisson/poisson_3d_order2.mini index 54c0e3ccd6723c995dc3c396cdf08e5c06f5b561..24ec010e7beef46dea74bffd318411118a2c727d 100644 --- a/test/sumfact/poisson/poisson_3d_order2.mini +++ b/test/sumfact/poisson/poisson_3d_order2.mini @@ -1,8 +1,10 @@ __name = sumfact_poisson_3d_order2_{__exec_suffix} -__exec_suffix = {diff_suffix}_{vec_suffix} +__exec_suffix = {diff_suffix}_{vecq_suffix}_{vecg_suffix} diff_suffix = numdiff, symdiff | expand num -vec_suffix = vec, nonvec | expand vec +vecq_suffix = quadvec, nonquadvec | expand vec_q +vecg_suffix = gradvec, nongradvec | expand vec_g +{vecq_suffix} == quadvec and {vecg_suffix} == gradvec | exclude cells = 8 8 8 extension = 1. 1. 1. @@ -17,4 +19,5 @@ numerical_jacobian = 1, 0 | expand num exact_solution_expression = g compare_l2errorsquared = 1e-8 sumfact = 1 -vectorize = 1, 0 | expand vec +vectorize_quad = 1, 0 | expand vec_q +vectorize_grads = 1, 0 | expand vec_g