diff --git a/python/dune/perftool/loopy/symbolic.py b/python/dune/perftool/loopy/symbolic.py index 42f0420d52e5704713734aac165919c92afbdbee..ad3c74076c814c454b7081d018b1d5ff16efad62 100644 --- a/python/dune/perftool/loopy/symbolic.py +++ b/python/dune/perftool/loopy/symbolic.py @@ -22,6 +22,9 @@ class SumfactKernel(prim.Variable): preferred_position, restriction, ): + if not isinstance(restriction, tuple): + restriction = (restriction, 0) + self.a_matrices = a_matrices self.buffer = buffer self.stage = stage diff --git a/python/dune/perftool/loopy/transformations/collect_rotate.py b/python/dune/perftool/loopy/transformations/collect_rotate.py index 7d227f87e7c513c30c289882dcc453a042a4ff46..424056de75c539eeef83ccba3c1a660932ec403d 100644 --- a/python/dune/perftool/loopy/transformations/collect_rotate.py +++ b/python/dune/perftool/loopy/transformations/collect_rotate.py @@ -159,12 +159,12 @@ def collect_vector_data_rotate(knl): for expr in quantities[quantity]: assert isinstance(expr, prim.Subscript) last_index = expr.index[-1] - assert last_index in tuple(range(4)) replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (prim.Variable("vec_index") + last_index, prim.Variable(new_iname)), ) else: # Add a vector view to this quantity + expr, = quantities[quantity] knl = add_vector_view(knl, quantity) replacemap_vec[expr] = prim.Subscript(prim.Variable(get_vector_view_name(quantity)), (prim.Variable("vec_index"), prim.Variable(new_iname)), @@ -289,16 +289,16 @@ def collect_vector_data_rotate(knl): ) ) - # Rotate back! - if rotating: - new_insns.append(lp.CallInstruction((), # assignees - prim.Call(prim.Variable("transpose_reg"), - tuple(prim.Subscript(prim.Variable(lhsname), (prim.Variable("vec_index") + i, prim.Variable(new_iname))) for i in range(4))), - depends_on=frozenset({Tagged("vec_write")}), - within_inames=common_inames.union(inames).union(frozenset({new_iname})), - within_inames_is_final=True, - id="{}_rotateback".format(lhsname), - )) + # Rotate back! + if rotating and "{}_rotateback".format(lhsname) not in [i.id for i in new_insns]: + new_insns.append(lp.CallInstruction((), # assignees + prim.Call(prim.Variable("transpose_reg"), + tuple(prim.Subscript(prim.Variable(lhsname), (prim.Variable("vec_index") + i, prim.Variable(new_iname))) for i in range(4))), + depends_on=frozenset({Tagged("vec_write")}), + within_inames=common_inames.union(inames).union(frozenset({new_iname})), + within_inames_is_final=True, + id="{}_rotateback".format(lhsname), + )) from loopy.kernel.creation import resolve_dependencies return resolve_dependencies(knl.copy(instructions=new_insns + other_insns)) diff --git a/python/dune/perftool/pdelab/geometry.py b/python/dune/perftool/pdelab/geometry.py index 973104dfe931865ec72d3bf8c34a182467f7ca5e..1930ed923104649e6410b573fb878c8b9680c9d1 100644 --- a/python/dune/perftool/pdelab/geometry.py +++ b/python/dune/perftool/pdelab/geometry.py @@ -12,7 +12,9 @@ from dune.perftool.generation import (backend, temporary_variable, valuearg, ) -from dune.perftool.options import option_switch +from dune.perftool.options import (get_option, + option_switch, + ) from dune.perftool.pdelab.quadrature import quadrature_preamble from dune.perftool.tools import get_pymbolic_basename from ufl.algorithms import MultiFunction @@ -260,8 +262,12 @@ def declare_normal(name, shape, shape_impl): def name_unit_outer_normal(): name = "outer_normal" - temporary_variable(name, shape=(world_dimension(),), decl_method=declare_normal) - evaluate_unit_outer_normal(name) + if not get_option("diagonal_transformation_matrix"): + temporary_variable(name, shape=(world_dimension(),), decl_method=declare_normal) + evaluate_unit_outer_normal(name) + else: + declare_normal(name, None, None) + globalarg(name, shape=(world_dimension(),), dtype=np.float64) return "outer_normal" diff --git a/python/dune/perftool/sumfact/__init__.py b/python/dune/perftool/sumfact/__init__.py index d82ea08af1a7ad439562be9732114b098848d6c1..cba13702efbf8f8a7333dc919e48c8de87048232 100644 --- a/python/dune/perftool/sumfact/__init__.py +++ b/python/dune/perftool/sumfact/__init__.py @@ -24,10 +24,14 @@ class SumFactInterface(PDELabInterface): return pymbolic_reference_gradient(element, restriction, number) def pymbolic_trialfunction_gradient(self, element, restriction, component, visitor=None): - return pymbolic_trialfunction_gradient(element, restriction, component, visitor) + ret, indices = pymbolic_trialfunction_gradient(element, restriction, component, visitor) + visitor.indices = indices + return ret def pymbolic_trialfunction(self, element, restriction, component, visitor=None): - return pymbolic_trialfunction(element, restriction, component, visitor) + ret, indices = pymbolic_trialfunction(element, restriction, component, visitor) + visitor.indices = indices + return ret def quadrature_inames(self): return quadrature_inames() diff --git a/python/dune/perftool/sumfact/amatrix.py b/python/dune/perftool/sumfact/amatrix.py index 135fcc3d4cf50bf8e5e53822c29c8997516c9d2b..c13ab899aab3c75fdbc6997f39409e506a9019c4 100644 --- a/python/dune/perftool/sumfact/amatrix.py +++ b/python/dune/perftool/sumfact/amatrix.py @@ -59,22 +59,24 @@ class AMatrix(ImmutableRecord): class LargeAMatrix(ImmutableRecord): - def __init__(self, rows, cols, transpose, derivative): + def __init__(self, rows, cols, transpose, derivative, face): assert isinstance(derivative, tuple) ImmutableRecord.__init__(self, rows=rows, cols=cols, transpose=transpose, derivative=derivative, + face=face, ) @property def name(self): - name = "ThetaLarge{}_{}".format("T" if self.transpose else "", - "_".join(tuple("d" if d else "" for d in self.derivative)) - ) + name = "ThetaLarge{}{}_{}".format("face{}_".format(self.face) if self.face is not None else "", + "T" if self.transpose else "", + "_".join(tuple("d" if d else "" for d in self.derivative)) + ) for i, d in enumerate(self.derivative): - define_theta(name, (self.rows, self.cols), self.transpose, d, additional_indices=(i,)) + define_theta(name, (self.rows, self.cols), self.transpose, d, face=self.face, additional_indices=(i,)) return loopy_class_member(name, classtag="operator", diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index fe9ef21b484d6567a5deca882c6da9b5a68b307c..a7b3c4c98ac3c5b262ff1f7dee9b5acecba33aa9 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -75,7 +75,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): # Get the vectorization info. If this happens during the dry run, we get dummies from dune.perftool.sumfact.vectorization import get_vectorization_info - a_matrices, buffer, input, index = get_vectorization_info(a_matrices, 0) + a_matrices, buffer, input, index = get_vectorization_info(a_matrices, restriction) # Initialize the buffer for the sum fact kernel shape = (product(mat.cols for mat in a_matrices),) @@ -101,6 +101,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): 1, preferred_position=i, insn_dep=insn_dep, + restriction=restriction, ) buffers.append(var) @@ -109,9 +110,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): # with the position in the vector register. if index: assert len(visitor.indices) == 1 - indices = visitor.indices - visitor.indices = None - return maybe_wrap_subscript(var, tuple(prim.Variable(i) for i in quadrature_inames()) + indices) + return maybe_wrap_subscript(var, tuple(prim.Variable(i) for i in quadrature_inames()) + visitor.indices), None # TODO this should be quite conditional!!! for i, buf in enumerate(buffers): @@ -126,7 +125,7 @@ def pymbolic_trialfunction_gradient(element, restriction, component, visitor): forced_iname_deps_is_final=True, ) - return prim.Variable(name) + return prim.Variable(name), visitor.indices @kernel_cached @@ -140,7 +139,7 @@ def pymbolic_trialfunction(element, restriction, component, visitor): # Get the vectorization info. If this happens during the dry run, we get dummies from dune.perftool.sumfact.vectorization import get_vectorization_info - a_matrices, buffer, input, index = get_vectorization_info(a_matrices, 0) + a_matrices, buffer, input, index = get_vectorization_info(a_matrices, restriction) # Flip flop buffers for sumfactorization shape = (product(mat.cols for mat in a_matrices),) @@ -164,6 +163,7 @@ def pymbolic_trialfunction(element, restriction, component, visitor): preferred_position=None, insn_dep=frozenset({Writes(input)}), outshape=tuple(mat.rows for mat in a_matrices if mat.rows != 1), + restriction=restriction, ) if index: @@ -172,8 +172,8 @@ def pymbolic_trialfunction(element, restriction, component, visitor): index = () return prim.Subscript(var, - tuple(prim.Variable(i) for i in quadrature_inames() + index) - ) + tuple(prim.Variable(i) for i in quadrature_inames()) + index + ), visitor.indices @iname diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index b038e763fd7a7168d6aec101ecc397bc50639203..539d2f698b8c23c5f13b30f66229def5137705be 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -128,7 +128,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Get the vectorization info. If this happens during the dry run, we get dummies from dune.perftool.sumfact.vectorization import get_vectorization_info - a_matrices, buffer, input, index = get_vectorization_info(a_matrices, restriction) + a_matrices, buffer, input, index = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction)) # Initialize a base storage for this buffer and get a temporay pointing to it shape = tuple(mat.cols for mat in a_matrices if mat.cols != 1) @@ -185,6 +185,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): insn_dep=insn_dep, additional_inames=frozenset(visitor.inames), preferred_position=pref_pos, + restriction=(accterm.argument.restriction, restriction), ) inames = tuple(sumfact_iname(mat.rows, 'accum') for mat in a_matrices) @@ -240,7 +241,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): insn_dep = emit_sumfact_kernel(None, restriction, insn_dep) -@generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s)) +@generator_factory(item_tags=("sumfactkernel",), context_tags=("kernel",), cache_key_generator=lambda a, b, s, **kw: (a, b, s, kw.get("restriction", 0))) def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index a7e8eae021ed6b5d606b567874def31a39b73d91..a9e050fb68c3f583d283a62a39943f1deef53093 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -18,29 +18,32 @@ def vectorization_info(a_matrices, restriction, new_a_matrices, buffer, input, i def get_vectorization_info(a_matrices, restriction): + if not isinstance(restriction, tuple): + restriction = (restriction, 0) from dune.perftool.generation import get_global_context_value if get_global_context_value("dry_run"): # Return dummy data return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) - try: - return vectorization_info(a_matrices, restriction, None, None, None, None) - except TypeError: + + # Try getting the vectorization info collected after dry run + vec = vectorization_info(a_matrices, restriction, None, None, None, None) + if vec[0] is None: raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt") + return vec def no_vectorization(sumfacts): for sumf in sumfacts: - for res in (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE): - vectorization_info(sumf.a_matrices, - res, - sumf.a_matrices, - get_counted_variable("buffer"), - get_counted_variable(restricted_name("input", sumf.restriction)), - None) + vectorization_info(sumf.a_matrices, + sumf.restriction, + sumf.a_matrices, + get_counted_variable("buffer"), + get_counted_variable(restricted_name("input", sumf.restriction)), + None) -def decide_stage_vectorization_strategy(sumfacts, stage): - stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage]) +def decide_stage_vectorization_strategy(sumfacts, stage, restriction): + stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage and sf.restriction == restriction]) if len(stage_sumfacts) in (3, 4): # Map the sum factorization to their position in the joint kernel position_mapping = {} @@ -79,6 +82,7 @@ def decide_stage_vectorization_strategy(sumfacts, stage): cols=next(iter(stage_sumfacts)).a_matrices[i].cols, transpose=next(iter(stage_sumfacts)).a_matrices[i].transpose, derivative=tuple(derivative), + face=next(iter(stage_sumfacts)).a_matrices[i].face, ) large_a_matrices.append(large) @@ -106,8 +110,11 @@ def decide_vectorization_strategy(): if not get_option("vectorize_grads"): no_vectorization(sumfacts) else: - decide_stage_vectorization_strategy(sumfacts, 1) - decide_stage_vectorization_strategy(sumfacts, 3) + for stage in (1, 3): + res = (Restriction.NONE, Restriction.POSITIVE, Restriction.NEGATIVE) + import itertools as it + for restriction in it.product(res, res): + decide_stage_vectorization_strategy(sumfacts, stage, restriction) class HasSumfactMapper(lp.symbolic.CombineMapper): diff --git a/test/sumfact/poisson/poisson_dg.mini b/test/sumfact/poisson/poisson_dg.mini index aa5e41bcffb47a164644fa1a39e71a4adcab5f00..747a5b7dac42a836c8a4eb0acdfc59b418f58581 100644 --- a/test/sumfact/poisson/poisson_dg.mini +++ b/test/sumfact/poisson/poisson_dg.mini @@ -1,5 +1,9 @@ __name = sumfact_poisson_dg_{__exec_suffix} -__exec_suffix = numdiff, symdiff | expand num +__exec_suffix = {diff_suffix}_{quadvec_suffix}_{gradvec_suffix} + +diff_suffix = numdiff, symdiff | expand num +quadvec_suffix = quadvec, nonquadvec | expand quad +gradvec_suffix = gradvec, nongradvec | expand grad cells = 16 16 extension = 1. 1. @@ -13,3 +17,5 @@ numerical_jacobian = 1, 0 | expand num sumfact = 1 exact_solution_expression = g compare_l2errorsquared = 1e-4 +vectorize_quad = 1, 0 | expand quad +vectorize_grads = 1, 0 | expand grad