Implement large matrix construction

488dabda · Dominic Kempf · 5504e7d7 · 488dabda · 488dabda · 488dabda
Commit 488dabda authored 8 years ago by Dominic Kempf
--- a/python/dune/perftool/loopy/buffer.py
+++ b/python/dune/perftool/loopy/buffer.py
@@ -29,9 +29,9 @@ class FlipFlopBuffer(object):
        base = self.base_storage[self._current]
        # Construct a temporary name
-        name = kwargs.pop("name", get_counted_variable(self.identifier))
+        name = kwargs.pop("name", None)
        if name is None:
-            from pudb import set_trace; set_trace()
+            name = get_counted_variable(self.identifier)
        # Get geometric dimension
        formdata = get_global_context_value('formdata')

--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -75,14 +75,14 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
        # Initialize the buffer for the sum fact kernel
        shape = (product(mat.cols for mat in a_matrices),)
-        if index:
+        if index is not None:
            shape = shape + (4,)
-        initialize_buffer(buffer,
+        input = initialize_buffer(buffer,
-                          base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
+                                  base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
-                          num=2
+                                  num=2
-                          ).get_temporary(shape=shape,
+                                  ).get_temporary(shape=shape,
-                                          name=input,
+                                                  name=input,
-                                          )
+                                                  )
        # Setup the input!
        setup_theta(input, element, restriction, component, index)
@@ -90,14 +90,22 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
        # Add a sum factorization kernel that implements the
        # evaluation of the gradients of basis functions at quadrature
        # points (stage 1)
-        var, _ = sum_factorization_kernel(a_matrices,
+        if index:
+            var, _ = sum_factorization_kernel(a_matrices,
                                          buffer,
                                          1,
                                          preferred_position=i,
                                          insn_dep=frozenset({Writes(input)}),
+                                          output_name=name,
                                          )
+        else:
-        buffers.append(var)
+            var, _ = sum_factorization_kernel(a_matrices,
+                                          buffer,
+                                          1,
+                                          preferred_position=i,
+                                          insn_dep=frozenset({Writes(input)}),
+                                          )
+            buffers.append(var)
    # TODO this should be quite conditional!!!
    for i, buf in enumerate(buffers):
@@ -139,7 +147,7 @@ def pymbolic_trialfunction(element, restriction, component):
    # Flip flop buffers for sumfactorization
    shape = (product(mat.cols for mat in a_matrices),)
-    if vec:
+    if index is not None:
        shape = shape + (4,)
    initialize_buffer(buffer,
                      base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),

--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -321,14 +321,20 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
        a_matrices, buffer, input, index = get_vectorization_info(a_matrices)
        # Initialize a base storage for this buffer and get a temporay pointing to it
-#        shape = product(mat.cols for mat in a_matrices)
+        shape = tuple(mat.cols for mat in a_matrices)
-#        if vec:
+        dim_tags = ",".join(['f'] * dim)
-#            shape = (shape, 4)
+        if index is not None:
+            shape = shape + (4,)
+            dim_tags = dim_tags + ",c"
+            index = (index,)
+        else:
+            index = ()
        temp = initialize_buffer(buffer,
                                 base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
                                 num=2
-                                 ).get_temporary(shape=(quadrature_points_per_direction(),) * dim,
+                                 ).get_temporary(shape=shape,
-                                                 dim_tags=",".join(['f'] * dim),
+                                                 dim_tags=dim_tags,
                                                 name=input,
                                                 )
@@ -346,7 +352,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
        # Issue an instruction in the quadrature loop that fills the buffer
        # with the evaluation of the contribution at all quadrature points
-        assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()))
+        assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + index)
        contrib_dep = instruction(assignee=assignee,
                                  expression=expression,
                                  forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
@@ -403,7 +409,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
        transform(nest_quadrature_loops, visitor.inames)
-def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False, preferred_position=None):
+def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), preferred_position=None, output_name=None):
    """
    Calculate a sum factorization matrix product.
@@ -426,9 +432,13 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
    if get_global_context_value("dry_run", False):
        return SumfactKernel(a_matrices, buf, stage, preferred_position), frozenset()
-    ftags = "f,f" + (",vec" if add_vec_tag else "")
+    ftags = "f,f"
-    ctags = "c,c" + (",vec" if add_vec_tag else "")
+    ctags = "c,c"
-    vec_shape = (4,) if add_vec_tag else ()
+    vec_shape = ()
+    if isinstance(next(iter(a_matrices)), LargeAMatrix):
+        ftags = ftags + ",vec"
+        ctags = ctags + ",vec"
+        vec_shape = (4,)
    insn_dep = frozenset({barrier(depends_on=insn_dep,
                                  within_inames=additional_inames,
@@ -495,13 +505,15 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
    out_shape = tuple(mat.rows for mat in a_matrices)
    dim_tags = ",".join(['f'] * dim)
-    if add_vec_tag:
+    if isinstance(next(iter(a_matrices)), LargeAMatrix):
        out_shape = out_shape + vec_shape
        dim_tags = dim_tags + ",c"
    out = get_buffer_temporary(buf,
                               shape=out_shape,
-                               dim_tags=dim_tags)
+                               dim_tags=dim_tags,
+                               name=output_name,
+                               )
    silenced_warning('read_no_write({})'.format(out))
    return prim.Variable(out), insn_dep
--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
@@ -10,8 +10,8 @@ import loopy as lp
 @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, *args: a)
-def vectorization_info(a_matrices, buffer, input, index):
+def vectorization_info(a_matrices, new_a_matrices, buffer, input, index):
-    return (a_matrices, buffer, input, index)
+    return (new_a_matrices, buffer, input, index)
 def get_vectorization_info(a_matrices):
@@ -20,20 +20,21 @@ def get_vectorization_info(a_matrices):
        # Return dummy data
        return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
    try:
-        return vectorization_info(a_matrices, None, None, None)
+        return vectorization_info(a_matrices, None, None, None, None)
    except TypeError:
        raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt")
 def no_vectorization(sumfacts):
    for sumf in sumfacts:
-        vectorization_info(sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
+        vectorization_info(sumf.a_matrices, sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
 def decide_stage_vectorization_strategy(sumfacts, stage):
    stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage])
    if len(stage_sumfacts) in (3, 4):
        # Map the sum factorization to their position in the joint kernel
+        position_mapping = {}
        available = set(range(4))
        for sf in stage_sumfacts:
            if sf.preferred_position is not None:
@@ -41,16 +42,41 @@ def decide_stage_vectorization_strategy(sumfacts, stage):
                # Later on, more complicated stuff might be necessary here.
                assert sf.preferred_position in available
                available.discard(sf.preferred_position)
+                position_mapping[sf] = sf.preferred_position
+        # Choose a position for those that have no preferred one!
+        for sumf in stage_sumfacts:
+            if sumf.preferred_position is None:
+                position_mapping[sumf] = available.pop()
        # Enable vectorization strategy:
        input = get_counted_variable("joined_input")
        buffer = get_counted_variable("joined_buffer")
+        # Collect the large matrices!
+        large_a_matrices = []
+        for i in range(len(next(iter(stage_sumfacts)).a_matrices)):
+            # Assert that the matrices of all sum factorizations have the same size
+            assert len(set(tuple(sf.a_matrices[i].rows for sf in stage_sumfacts))) == 1
+            assert len(set(tuple(sf.a_matrices[i].cols for sf in stage_sumfacts))) == 1
+            # Collect the transpose/derivative information
+            transpose = [False] * 4
+            derivative = [False] * 4
+            for sf in stage_sumfacts:
+                transpose[position_mapping[sf]] = sf.a_matrices[i].transpose
+                derivative[position_mapping[sf]] = sf.a_matrices[i].derivative
+            from dune.perftool.sumfact.amatrix import LargeAMatrix
+            large = LargeAMatrix(rows=next(iter(stage_sumfacts)).a_matrices[i].rows,
+                                 cols=next(iter(stage_sumfacts)).a_matrices[i].cols,
+                                 transpose=tuple(transpose),
+                                 derivative=tuple(derivative),
+                                 )
+            large_a_matrices.append(large)
        for sumf in stage_sumfacts:
-            pref_pos = sumf.preferred_position
+            vectorization_info(sumf.a_matrices, tuple(large_a_matrices), buffer, input, position_mapping[sumf])
-            if pref_pos is None:
-                pref_pos = available.pop()
-            vectorization_info(sumf.a_matrices, buffer, input, pref_pos)
    else:
        # Disable vectorization strategy
        no_vectorization(stage_sumfacts)