Skip to content
Snippets Groups Projects
Commit ead49c7e authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Merge branch 'feature/tensor-product-sf' into 'master'

Feature/tensor product sf

See merge request !101
parents 3116c6a6 fc427d44
No related branches found
No related tags found
No related merge requests found
......@@ -339,12 +339,37 @@ def sum_factorization_kernel(a_matrices,
restriction=0,
direct_input=None,
):
"""
Calculate a sum factorization matrix product.
"""Create a sum factorization kernel
Sum factorization can be written as
Y = R_{d-1} (A_{d-1} * ... * R_0 (A_0 * X)...)
with:
- X: Input rank d tensor of dimension n_0 x ... x n_{d-1}
- Y: Output rank d tensor of dimension m_0 x ... x m_{d-1}
- A_l: Values of 1D basis evaluations at quadrature points in l
direction, matrix of dimension m_l x n_l
- R_l: Transformation operator that permutes the underlying data
vector of the rank d tensor in such a way that the fastest
direction becomes the slowest direction
In the l'th step we have the following setup:
- A_l: Matrix of dimensions m_l x n_l
- X_l: Rank d tensor of dimensions n_l x ... x n_{d-1} x m_0 x ... x m_{l-1}
- R_l: Transformation operator
Y = A_{d-1}*...*A_0*X
Looking at the indices the following will happen:
X --> [n_l,...,n_{d-1},m_0,...,m_{l-1}]
A_l * X --> [m_l,n_l] * [n_l, ...] = [m_l,n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
R_l (A_l*X) --> [n_{l+1},...,n_{d-1},m_0,...,m_{l-1}]
where X is the input matrix and Y is the output variable.
So the multiplication with A_l is reduction over one index and the
transformation brings the next reduction index in the fastest
position.
Note: In the code below the transformation step is directly done
in the reduction instruction by adapting the assignee!
Arguments:
----------
......@@ -353,22 +378,33 @@ def sum_factorization_kernel(a_matrices,
Order of application is from 0 up.
buf: A string identifying the flip flop buffer in use
for intermediate results. The memory is expected to be
pre-initialized with the input.
insn_dep: an instruction ID that the first issued instruction
pre-initialized with the input or you have to provide
direct_input (FastDGGridOperator).
insn_dep: An instruction ID that the first issued instruction
should depend upon. All following ones will depend on each
other.
additional_inames: Instructions will be executed within those
inames (needed for stage 3 in jacobians).
preferred_position: Will be used in the dry run to order kernels
when doing vectorization e.g. (dx u,dy u,dz u, u).
outshape: Shape of the output.
restriction: Restriction for face values.
direct_input: Global data structure containing input for
sumfactorization (e.g. when using FastDGGridOperator).
"""
if get_global_context_value("dry_run", False):
return SumfactKernel(a_matrices, buf, stage, preferred_position, restriction), frozenset()
ftags = "f,f"
ctags = "c,c"
ftags = ",".join(["f"]*len(a_matrices))
ctags = ",".join(["c"]*len(a_matrices))
vec_shape = ()
if next(iter(a_matrices)).vectorized:
ftags = ftags + ",vec"
ctags = ctags + ",vec"
vec_shape = (4,)
# Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4:
timer_name = assembler_routine_name() + '_kernel' + '_stage{}'.format(stage)
post_include('HP_DECLARE_TIMER({});'.format(timer_name), filetag='operatorfile')
......@@ -377,17 +413,18 @@ def sum_factorization_kernel(a_matrices,
depends_on=insn_dep,
within_inames=additional_inames)})
# Put a barrier before the sumfactorization kernel
insn_dep = frozenset({barrier(depends_on=insn_dep,
within_inames=additional_inames,
)})
# Product of all matrices
for l, a_matrix in enumerate(a_matrices):
# Compute the correct shapes of in- and output matrices of this matrix-matrix multiplication
# and get inames that realize the product.
inp_shape = (a_matrix.cols, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
out_shape = (a_matrix.rows, product(mat.rows for mat in a_matrices[:l]) * product(mat.cols for mat in a_matrices[l + 1:]))
i = sumfact_iname(out_shape[0], "row")
j = sumfact_iname(out_shape[1], "col")
inp_shape = (a_matrix.cols,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
out_shape = (a_matrix.rows,) + tuple(mat.cols for mat in a_matrices[l + 1:]) + tuple(mat.rows for mat in a_matrices[:l])
out_inames = tuple(sumfact_iname(length, "out_inames_" + str(k)) for k, length in enumerate(out_shape))
vec_iname = ()
if a_matrix.vectorized:
iname = sumfact_iname(4, "vec")
......@@ -404,19 +441,20 @@ def sum_factorization_kernel(a_matrices,
else:
k_expr = 0
# Setup the input of the sum factorization kernel. In the first matrix multiplication
# this can be taken from
# Setup the input of the sum factorization kernel. In the
# first matrix multiplication this can be taken from
# * an input temporary (default)
# * a global data structure (if FastDGGridOperator is in use)
# * a value from a global data structure, broadcasted to a vector type (vectorized + FastDGGridOperator)
if l == 0 and direct_input is not None:
globalarg(direct_input, dtype=np.float64, shape=inp_shape)
if a_matrix.vectorized:
input_summand = prim.Call(prim.Variable("Vec4d"), (prim.Subscript(prim.Variable(direct_input),
(k_expr, prim.Variable(j))),))
input_summand = prim.Call(prim.Variable("Vec4d"),
(prim.Subscript(prim.Variable(direct_input),
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:])),))
else:
input_summand = prim.Subscript(prim.Variable(direct_input),
(k_expr, prim.Variable(j)) + vec_iname)
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
else:
# Get a temporary that interprets the base storage of the input
# as a column-major matrix. In later iteration of the amatrix loop
......@@ -429,35 +467,47 @@ def sum_factorization_kernel(a_matrices,
silenced_warning('read_no_write({})'.format(inp))
input_summand = prim.Subscript(prim.Variable(inp),
(k_expr, prim.Variable(j)) + vec_iname)
(k_expr,) + tuple(prim.Variable(j) for j in out_inames[1:]) + vec_iname)
switch_base_storage(buf)
# Get a temporary that interprets the base storage of the output
# as row-major matrix
# Get a temporary that interprets the base storage of the output.
#
# Note: In this step the reordering of the fastest directions
# is happening. The new direction (out_inames[0]) and the
# corresponding shape (out_shape[0]) goes to the end (slowest
# direction) and everything stays column major (ftags->fortran
# style).
out = get_buffer_temporary(buf,
shape=out_shape + vec_shape,
dim_tags=ctags)
shape=tuple(out_shape[1:]) + (out_shape[0],) + vec_shape,
dim_tags=ftags)
# Write the matrix-matrix multiplication expression
matprod = Product((prim.Subscript(prim.Variable(a_matrix.name), (prim.Variable(i), k_expr) + vec_iname),
input_summand
))
# matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
# (prim.Variable(i), k_expr) + vec_iname),
# input_summand))
matprod = Product((prim.Subscript(prim.Variable(a_matrix.name),
(prim.Variable(out_inames[0]), k_expr) + vec_iname),
input_summand))
# ... which may be a reduction, if k>0
if a_matrix.cols != 1:
# ... which may be a reduction, if k>0
matprod = lp.Reduction("sum", k, matprod)
# Here we also move the new direction (out_inames[0]) to the end
assignee = prim.Subscript(prim.Variable(out), tuple(prim.Variable(i) for i in out_inames[1:]) + (prim.Variable(out_inames[0]),) + vec_iname)
# Issue the reduction instruction that implements the multiplication
# at the same time store the instruction ID for the next instruction to depend on
assignee = prim.Subscript(prim.Variable(out), (prim.Variable(i), prim.Variable(j)) + vec_iname)
insn_dep = frozenset({instruction(assignee=assignee,
expression=matprod,
forced_iname_deps=frozenset({i, j}).union(additional_inames),
forced_iname_deps=frozenset([iname for iname in out_inames]).union(additional_inames),
forced_iname_deps_is_final=True,
depends_on=insn_dep,
)
})
# Measure times and count operations in c++ code
if get_option("instrumentation_level") >= 4:
insn_dep = frozenset({instruction(code="HP_TIMER_STOP({});".format(timer_name),
depends_on=insn_dep,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment