WIP checkpoint 1

733675a9 · Dominic Kempf · 796c8301 · 733675a9 · 733675a9 · 733675a9
Commit 733675a9 authored 8 years ago by Dominic Kempf
--- a/python/dune/perftool/loopy/symbolic.py
+++ b/python/dune/perftool/loopy/symbolic.py
@@ -7,6 +7,7 @@ from pymbolic.mapper.substitutor import make_subst_func

 import loopy as lp
 import pymbolic.primitives as prim
+from pytools import ImmutableRecord
 from six.moves import intern


@@ -15,35 +16,72 @@ from six.moves import intern
 #


-class SumfactKernel(prim.Variable):
+class SumfactKernel(ImmutableRecord, prim.Variable):
    def __init__(self,
-                 a_matrices,
-                 buffer,
-                 stage,
-                 preferred_position,
-                 restriction,
+                 a_matrices=None,
+                 buffer=None,
+                 stage=1,
+                 preferred_position=None,
+                 restriction=0,
+                 within_inames=frozenset(),
+                 input=None,
+                 padding=frozenset(),
+                 index=None,
                 ):
+        # Check the input and apply defaults where necessary
+        assert isinstance(a_matrices, tuple)
+        from dune.perftool.sumfact.amatrix import AMatrixBase
+        assert all(isinstance(m, AMatrixBase) for m in a_matrices)
+
+        assert stage in (1, 3)
+
+        if preferred_position is not None:
+            assert isinstance(preferred_position, int)
+
        if not isinstance(restriction, tuple):
            restriction = (restriction, 0)

-        self.a_matrices = a_matrices
-        self.buffer = buffer
-        self.stage = stage
-        self.preferred_position = preferred_position
-        self.restriction = restriction
+        assert isinstance(within_inames, frozenset)
+
+        ImmutableRecord.__init__(self,
+                                 a_matrices=a_matrices,
+                                 buffer=buffer,
+                                 stage=stage,
+                                 preferred_position=preferred_position,
+                                 restriction=restriction,
+                                 within_inames=within_inames,
+                                 input=input,
+                                 padding=padding,
+                                 index=index,
+                                 )

        prim.Variable.__init__(self, "SUMFACT")

+    #
+    # The methods/fields needed to get a well-formed pymbolic node
+    #
    def __getinitargs__(self):
-        return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction)
+        return (self.a_matrices, self.buffer, self.stage, self.preferred_position, self.restriction, self.within_inames, self.input, self.padding, self.index)

    def stringifier(self):
        return lp.symbolic.StringifyMapper

-    init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction")
+    init_arg_names = ("a_matrices", "buffer", "stage", "preferred_position", "restriction", "within_inames", "input", "padding", "index")

    mapper_method = "map_sumfact_kernel"

+    #
+    # Some convenience methods to extract information about the sum factorization kernel
+    #
+
+    @property
+    def length(self):
+        return len(self.a_matrices)
+
+    @property
+    def vectorized(self):
+        return next(iter(a_matrices)).vectorized
+

 class FusedMultiplyAdd(prim.Expression):
    """ Represents an FMA operation """

--- a/python/dune/perftool/sumfact/amatrix.py
+++ b/python/dune/perftool/sumfact/amatrix.py
@@ -35,15 +35,19 @@ import loopy as lp
 import numpy


-class AMatrix(ImmutableRecord):
+class AMatrixBase(ImmutableRecord):
+    pass
+
+
+class AMatrix(AMatrixBase):
    def __init__(self, rows, cols, transpose=False, derivative=False, face=None):
-        ImmutableRecord.__init__(self,
-                                 rows=rows,
-                                 cols=cols,
-                                 transpose=transpose,
-                                 derivative=derivative,
-                                 face=face,
-                                 )
+        AMatrixBase.__init__(self,
+                             rows=rows,
+                             cols=cols,
+                             transpose=transpose,
+                             derivative=derivative,
+                             face=face,
+                             )

    @property
    def name(self):
@@ -57,16 +61,16 @@ class AMatrix(ImmutableRecord):
        return lp.TaggedVariable(name, "sumfac")


-class LargeAMatrix(ImmutableRecord):
+class LargeAMatrix(AMatrixBase):
    def __init__(self, rows, cols, transpose, derivative, face):
        assert isinstance(derivative, tuple)
-        ImmutableRecord.__init__(self,
-                                 rows=rows,
-                                 cols=cols,
-                                 transpose=transpose,
-                                 derivative=derivative,
-                                 face=face,
-                                 )
+        AMatrixBase.__init__(self,
+                             rows=rows,
+                             cols=cols,
+                             transpose=transpose,
+                             derivative=derivative,
+                             face=face,
+                             )

    @property
    def name(self):

--- a/python/dune/perftool/sumfact/basis.py
+++ b/python/dune/perftool/sumfact/basis.py
@@ -6,6 +6,7 @@ multiplication with the test function is part of the sum factorization kernel.

 from dune.perftool.generation import (backend,
                                      domain,
+                                      get_counted_variable,
                                      get_counter,
                                      get_global_context_value,
                                      iname,
@@ -74,9 +75,26 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
                                                facemod=get_facemod(restriction),
                                                )

-        # Get the vectorization info. If this happens during the dry run, we get dummies
-        from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
+        sf = SumfactKernel(a_matrices=a_matrices,
+                           restriction=restriction,
+                           preferred_position=i,
+                           )
+
+        from dune.perftool.sumfact.vectorization import attach_vectorization_info
+        sf = attach_vectorization_info(sf)
+
+        # Extract again, for compatibility
+        # TODO away!
+        a_matrices = sf.a_matrices
+        buf = sf.buffer
+        inp = sf.input
+        index = sf.index
+        padding = sf.padding
+
+        if buf is None:
+            buf = get_counted_variable("buffer")
+        if inp is None:
+            inp = get_counted_variable("input")

        # Initialize the buffer for the sum fact kernel
        shape = (product(mat.cols for mat in a_matrices),)
@@ -101,7 +119,8 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
        # Add a sum factorization kernel that implements the
        # evaluation of the gradients of basis functions at quadrature
        # points (stage 1)
-        var, insn_dep = sum_factorization_kernel(a_matrices,
+        if not get_global_context_value("dry_run", False):
+            var, insn_dep = sum_factorization_kernel(a_matrices,
                                                 buf,
                                                 1,
                                                 preferred_position=i,
@@ -110,6 +129,9 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
                                                 outshape=tuple(mat.rows for mat in a_matrices if mat.face is None),
                                                 direct_input=direct_input,
                                                 )
+        else:
+            var = sf
+
        buffers.append(var)

    # Check whether we want to return early with something that has the indexing
@@ -144,9 +166,26 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor):
    a_matrices = construct_amatrix_sequence(facedir=get_facedir(restriction),
                                            facemod=get_facemod(restriction),)

-    # Get the vectorization info. If this happens during the dry run, we get dummies
-    from dune.perftool.sumfact.vectorization import get_vectorization_info
-    a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, restriction)
+    sf = SumfactKernel(a_matrices=a_matrices,
+                       restriction=restriction,
+                       )
+
+    # TODO: Move this away!
+    from dune.perftool.sumfact.vectorization import attach_vectorization_info
+    sf = attach_vectorization_info(sf)
+
+    # Extract again, for compatibility
+    # TODO away!
+    a_matrices = sf.a_matrices
+    buf = sf.buffer
+    inp = sf.input
+    index = sf.index
+    padding = sf.padding
+
+    if buf is None:
+        buf = get_counted_variable("buffer")
+    if inp is None:
+        inp = get_counted_variable("input")

    # Flip flop buffers for sumfactorization
    shape = (product(mat.cols for mat in a_matrices),)
@@ -169,7 +208,8 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor):

    # Add a sum factorization kernel that implements the evaluation of
    # the basis functions at quadrature points (stage 1)
-    var, _ = sum_factorization_kernel(a_matrices,
+    if not get_global_context_value("dry_run", False):
+        var, _ = sum_factorization_kernel(a_matrices,
                                      buf,
                                      1,
                                      preferred_position=None,
@@ -178,6 +218,8 @@ def pymbolic_coefficient(element, restriction, component, coeff_func, visitor):
                                      restriction=restriction,
                                      direct_input=direct_input,
                                      )
+    else:
+        var = sf

    if index:
        index = (index,)

--- a/python/dune/perftool/sumfact/sumfact.py
+++ b/python/dune/perftool/sumfact/sumfact.py
@@ -14,6 +14,7 @@ from dune.perftool.generation import (backend,
                                      dump_accumulate_timer,
                                      function_mangler,
                                      generator_factory,
+                                      get_counted_variable,
                                      get_counter,
                                      get_global_context_value,
                                      globalarg,
@@ -143,9 +144,28 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                                facemod=get_facemod(accterm.argument.restriction),
                                                )

-        # Get the vectorization info. If this happens during the dry run, we get dummies
-        from dune.perftool.sumfact.vectorization import get_vectorization_info
-        a_matrices, buf, inp, index, padding = get_vectorization_info(a_matrices, (accterm.argument.restriction, restriction))
+        sf = SumfactKernel(a_matrices=a_matrices,
+                           restriction=(accterm.argument.restriction, restriction),
+                           stage=3,
+                           preferred_position=i if accterm.new_indices else None
+                           )
+
+        # TODO: Move this away!
+        from dune.perftool.sumfact.vectorization import attach_vectorization_info
+        sf = attach_vectorization_info(sf)
+
+        # Extract again, for compatibility
+        # TODO away!
+        a_matrices = sf.a_matrices
+        buf = sf.buffer
+        inp = sf.input
+        index = sf.index
+        padding = sf.padding
+
+        if buf is None:
+            buf = get_counted_variable("buffer")
+        if inp is None:
+            inp = get_counted_variable("input")

        # Initialize a base storage for this buffer and get a temporay pointing to it
        shape = tuple(mat.cols for mat in a_matrices if mat.face is None)
@@ -262,7 +282,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
        # Add a sum factorization kernel that implements the multiplication
        # with the test function (stage 3)
        pref_pos = i if accterm.new_indices else None
-        result, insn_dep = sum_factorization_kernel(a_matrices,
+        if not get_global_context_value("dry_run", False):
+            result, insn_dep = sum_factorization_kernel(a_matrices,
                                                    buf,
                                                    3,
                                                    insn_dep=insn_dep,
@@ -273,7 +294,8 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
                                                    direct_output=direct_output,
                                                    visitor=visitor
                                                    )
-
+        else:
+            result = sf
        # Determine the expression to accumulate with. This depends on the vectorization strategy!
        result = prim.Subscript(result, tuple(prim.Variable(i) for i in inames))
        vecinames = ()

--- a/python/dune/perftool/sumfact/vectorization.py
+++ b/python/dune/perftool/sumfact/vectorization.py
 """ Sum factorization vectorization """

+from dune.perftool.loopy.symbolic import SumfactKernel
 from dune.perftool.generation import (generator_factory,
                                      get_counted_variable,
+                                      get_global_context_value,
                                      )
 from dune.perftool.pdelab.restriction import (Restriction,
                                              restricted_name,
@@ -12,35 +14,27 @@ from dune.perftool.options import get_option
 import loopy as lp


-@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, r, *args: (a, r))
-def vectorization_info(a_matrices, restriction, new_a_matrices, buf, inp, index, padding):
-    return (new_a_matrices, buf, inp, index, padding)
+@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda o, n: o)
+def _cache_vectorization_info(old, new):
+    if new is None:
+        raise PerftoolError("Vectorization info for sum factorization kernel was not gathered correctly!")
+    else:
+        print("Registering old: {}".format(repr(old)))
+        print("Registering new: {}".format(repr(new)))
+    return new


-def get_vectorization_info(a_matrices, restriction):
-    if not isinstance(restriction, tuple):
-        restriction = (restriction, 0)
-    from dune.perftool.generation import get_global_context_value
+def attach_vectorization_info(sf):
+    assert isinstance(sf, SumfactKernel)
    if get_global_context_value("dry_run"):
-        # Return dummy data
-        return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None, frozenset())
-
-    # Try getting the vectorization info collected after dry run
-    vec = vectorization_info(a_matrices, restriction, None, None, None, None, None)
-    if vec[0] is None:
-        raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt")
-    return vec
+        return sf
+    else:
+        return _cache_vectorization_info(sf, None)


 def no_vectorization(sumfacts):
-    for sumf in sumfacts:
-        vectorization_info(sumf.a_matrices,
-                           sumf.restriction,
-                           sumf.a_matrices,
-                           get_counted_variable("buffer"),
-                           get_counted_variable("input"),
-                           None,
-                           frozenset())
+    for sf in sumfacts:
+        _cache_vectorization_info(sf, sf)


 def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
@@ -88,7 +82,13 @@ def decide_stage_vectorization_strategy(sumfacts, stage, restriction):
            large_a_matrices.append(large)

        for sumf in stage_sumfacts:
-            vectorization_info(sumf.a_matrices, sumf.restriction, tuple(large_a_matrices), buf, inp, position_mapping[sumf], frozenset(available))
+            _cache_vectorization_info(sumf,
+                                      sumf.copy(a_matrices=tuple(large_a_matrices),
+                                                buffer=buf,
+                                                input=inp,
+                                                index=position_mapping[sumf],
+                                                padding=frozenset(available))
+                                      )
    else:
        # Disable vectorization strategy
        no_vectorization(stage_sumfacts)