From 488dabdadab743aa16f0299c5ba00ab42873a431 Mon Sep 17 00:00:00 2001 From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> Date: Fri, 2 Dec 2016 19:52:38 +0100 Subject: [PATCH] Implement large matrix construction --- python/dune/perftool/loopy/buffer.py | 4 +- python/dune/perftool/sumfact/basis.py | 30 ++++++++----- python/dune/perftool/sumfact/sumfact.py | 36 ++++++++++------ python/dune/perftool/sumfact/vectorization.py | 42 +++++++++++++++---- 4 files changed, 79 insertions(+), 33 deletions(-) diff --git a/python/dune/perftool/loopy/buffer.py b/python/dune/perftool/loopy/buffer.py index 84076012..feef4570 100644 --- a/python/dune/perftool/loopy/buffer.py +++ b/python/dune/perftool/loopy/buffer.py @@ -29,9 +29,9 @@ class FlipFlopBuffer(object): base = self.base_storage[self._current] # Construct a temporary name - name = kwargs.pop("name", get_counted_variable(self.identifier)) + name = kwargs.pop("name", None) if name is None: - from pudb import set_trace; set_trace() + name = get_counted_variable(self.identifier) # Get geometric dimension formdata = get_global_context_value('formdata') diff --git a/python/dune/perftool/sumfact/basis.py b/python/dune/perftool/sumfact/basis.py index fb3c3ff3..8a451a55 100644 --- a/python/dune/perftool/sumfact/basis.py +++ b/python/dune/perftool/sumfact/basis.py @@ -75,14 +75,14 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) # Initialize the buffer for the sum fact kernel shape = (product(mat.cols for mat in a_matrices),) - if index: + if index is not None: shape = shape + (4,) - initialize_buffer(buffer, - base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), - num=2 - ).get_temporary(shape=shape, - name=input, - ) + input = initialize_buffer(buffer, + base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), + num=2 + ).get_temporary(shape=shape, + name=input, + ) # Setup the input! setup_theta(input, element, restriction, component, index) @@ -90,14 +90,22 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) # Add a sum factorization kernel that implements the # evaluation of the gradients of basis functions at quadrature # points (stage 1) - var, _ = sum_factorization_kernel(a_matrices, + if index: + var, _ = sum_factorization_kernel(a_matrices, buffer, 1, preferred_position=i, insn_dep=frozenset({Writes(input)}), + output_name=name, ) - - buffers.append(var) + else: + var, _ = sum_factorization_kernel(a_matrices, + buffer, + 1, + preferred_position=i, + insn_dep=frozenset({Writes(input)}), + ) + buffers.append(var) # TODO this should be quite conditional!!! for i, buf in enumerate(buffers): @@ -139,7 +147,7 @@ def pymbolic_trialfunction(element, restriction, component): # Flip flop buffers for sumfactorization shape = (product(mat.cols for mat in a_matrices),) - if vec: + if index is not None: shape = shape + (4,) initialize_buffer(buffer, base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), diff --git a/python/dune/perftool/sumfact/sumfact.py b/python/dune/perftool/sumfact/sumfact.py index 6cf4bd7b..7f32ed52 100644 --- a/python/dune/perftool/sumfact/sumfact.py +++ b/python/dune/perftool/sumfact/sumfact.py @@ -321,14 +321,20 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): a_matrices, buffer, input, index = get_vectorization_info(a_matrices) # Initialize a base storage for this buffer and get a temporay pointing to it -# shape = product(mat.cols for mat in a_matrices) -# if vec: -# shape = (shape, 4) + shape = tuple(mat.cols for mat in a_matrices) + dim_tags = ",".join(['f'] * dim) + if index is not None: + shape = shape + (4,) + dim_tags = dim_tags + ",c" + index = (index,) + else: + index = () + temp = initialize_buffer(buffer, base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), num=2 - ).get_temporary(shape=(quadrature_points_per_direction(),) * dim, - dim_tags=",".join(['f'] * dim), + ).get_temporary(shape=shape, + dim_tags=dim_tags, name=input, ) @@ -346,7 +352,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): # Issue an instruction in the quadrature loop that fills the buffer # with the evaluation of the contribution at all quadrature points - assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames())) + assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + index) contrib_dep = instruction(assignee=assignee, expression=expression, forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), @@ -403,7 +409,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): transform(nest_quadrature_loops, visitor.inames) -def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False, preferred_position=None): +def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), preferred_position=None, output_name=None): """ Calculate a sum factorization matrix product. @@ -426,9 +432,13 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add if get_global_context_value("dry_run", False): return SumfactKernel(a_matrices, buf, stage, preferred_position), frozenset() - ftags = "f,f" + (",vec" if add_vec_tag else "") - ctags = "c,c" + (",vec" if add_vec_tag else "") - vec_shape = (4,) if add_vec_tag else () + ftags = "f,f" + ctags = "c,c" + vec_shape = () + if isinstance(next(iter(a_matrices)), LargeAMatrix): + ftags = ftags + ",vec" + ctags = ctags + ",vec" + vec_shape = (4,) insn_dep = frozenset({barrier(depends_on=insn_dep, within_inames=additional_inames, @@ -495,13 +505,15 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add out_shape = tuple(mat.rows for mat in a_matrices) dim_tags = ",".join(['f'] * dim) - if add_vec_tag: + if isinstance(next(iter(a_matrices)), LargeAMatrix): out_shape = out_shape + vec_shape dim_tags = dim_tags + ",c" out = get_buffer_temporary(buf, shape=out_shape, - dim_tags=dim_tags) + dim_tags=dim_tags, + name=output_name, + ) silenced_warning('read_no_write({})'.format(out)) return prim.Variable(out), insn_dep diff --git a/python/dune/perftool/sumfact/vectorization.py b/python/dune/perftool/sumfact/vectorization.py index 9e4f2eb8..9155fb27 100644 --- a/python/dune/perftool/sumfact/vectorization.py +++ b/python/dune/perftool/sumfact/vectorization.py @@ -10,8 +10,8 @@ import loopy as lp @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, *args: a) -def vectorization_info(a_matrices, buffer, input, index): - return (a_matrices, buffer, input, index) +def vectorization_info(a_matrices, new_a_matrices, buffer, input, index): + return (new_a_matrices, buffer, input, index) def get_vectorization_info(a_matrices): @@ -20,20 +20,21 @@ def get_vectorization_info(a_matrices): # Return dummy data return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) try: - return vectorization_info(a_matrices, None, None, None) + return vectorization_info(a_matrices, None, None, None, None) except TypeError: raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt") def no_vectorization(sumfacts): for sumf in sumfacts: - vectorization_info(sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) + vectorization_info(sumf.a_matrices, sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) def decide_stage_vectorization_strategy(sumfacts, stage): stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage]) if len(stage_sumfacts) in (3, 4): # Map the sum factorization to their position in the joint kernel + position_mapping = {} available = set(range(4)) for sf in stage_sumfacts: if sf.preferred_position is not None: @@ -41,16 +42,41 @@ def decide_stage_vectorization_strategy(sumfacts, stage): # Later on, more complicated stuff might be necessary here. assert sf.preferred_position in available available.discard(sf.preferred_position) + position_mapping[sf] = sf.preferred_position + + # Choose a position for those that have no preferred one! + for sumf in stage_sumfacts: + if sumf.preferred_position is None: + position_mapping[sumf] = available.pop() # Enable vectorization strategy: input = get_counted_variable("joined_input") buffer = get_counted_variable("joined_buffer") + # Collect the large matrices! + large_a_matrices = [] + for i in range(len(next(iter(stage_sumfacts)).a_matrices)): + # Assert that the matrices of all sum factorizations have the same size + assert len(set(tuple(sf.a_matrices[i].rows for sf in stage_sumfacts))) == 1 + assert len(set(tuple(sf.a_matrices[i].cols for sf in stage_sumfacts))) == 1 + + # Collect the transpose/derivative information + transpose = [False] * 4 + derivative = [False] * 4 + for sf in stage_sumfacts: + transpose[position_mapping[sf]] = sf.a_matrices[i].transpose + derivative[position_mapping[sf]] = sf.a_matrices[i].derivative + + from dune.perftool.sumfact.amatrix import LargeAMatrix + large = LargeAMatrix(rows=next(iter(stage_sumfacts)).a_matrices[i].rows, + cols=next(iter(stage_sumfacts)).a_matrices[i].cols, + transpose=tuple(transpose), + derivative=tuple(derivative), + ) + large_a_matrices.append(large) + for sumf in stage_sumfacts: - pref_pos = sumf.preferred_position - if pref_pos is None: - pref_pos = available.pop() - vectorization_info(sumf.a_matrices, buffer, input, pref_pos) + vectorization_info(sumf.a_matrices, tuple(large_a_matrices), buffer, input, position_mapping[sumf]) else: # Disable vectorization strategy no_vectorization(stage_sumfacts) -- GitLab