Skip to content
Snippets Groups Projects
Commit 8cb22cf0 authored by Dominic Kempf's avatar Dominic Kempf
Browse files

WIP

parent 15b6bf4c
No related branches found
No related tags found
No related merge requests found
......@@ -116,7 +116,7 @@ class DuneCExpressionToCodeMapper(CExpressionToCodeMapper):
x = self.rec(expr.numerator, PREC_PRODUCT)
return "{} & {}".format(x, n - 1)
else:
return CExpressionToCodeMapper.map_remainder(expr, enclosing_prec)
return CExpressionToCodeMapper.map_remainder(self, expr, enclosing_prec)
class DuneASTBuilder(CASTBuilder):
......
......@@ -67,7 +67,7 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
# if the positioning within a SIMD vectors coincides with the index!
direct_indexing_is_possible = True
buffers = []
expressions = []
insn_dep = frozenset()
for i in range(world_dimension()):
# Construct the matrix sequence for this sum factorization
......@@ -97,7 +97,7 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
from dune.perftool.sumfact.realization import realize_sum_factorization_kernel
var, insn_dep = realize_sum_factorization_kernel(vsf.copy(insn_dep=vsf.insn_dep.union(insn_dep)))
buffers.append(var)
expressions.append(prim.Subscript(var, vsf.quadrature_index(sf)))
# Check whether we want to return early with something that has the indexing
# already handled! This happens with vectorization when the index coincides
......@@ -107,12 +107,11 @@ def pymbolic_coefficient_gradient(element, restriction, component, coeff_func, v
return maybe_wrap_subscript(var, vsf.quadrature_index(sf, visitor.indices)), None
# TODO this should be quite conditional!!!
for i, buf in enumerate(buffers):
for i, expr in enumerate(expressions):
# Write solution from sumfactorization to gradient variable
assignee = prim.Subscript(prim.Variable(name), i)
expression = prim.Subscript(buf, vsf.quadrature_index(sf))
instruction(assignee=assignee,
expression=expression,
expression=expr,
forced_iname_deps=frozenset(get_backend("quad_inames")()),
forced_iname_deps_is_final=True,
)
......
......@@ -412,7 +412,10 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
@property
def padded_indices(self):
indices = set(range(self.horizontal_width)) - set(range(len(self.kernels)))
indices = set(range(self.vector_width))
for h in range(len(self.kernels)):
for v in range(self.vertical_width):
indices.discard(h * self.horizontal_width + v)
return tuple(self.kernels[0].quadrature_index(None) + (i,) for i in indices)
@property
......@@ -439,10 +442,11 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
return True
def horizontal_index(self, sf):
try:
return self.kernels.index(sf)
except ValueError:
return 0
key = tuple(mat.derivative for mat in sf.matrix_sequence)
for i, k in enumerate(self.kernels):
if tuple(mat.derivative for mat in k.matrix_sequence) == key:
return i
return 0
def _sliced_expr(self):
quad_inames = quadrature_inames()
......@@ -486,7 +490,7 @@ class VectorizedSumfactKernel(SumfactKernelBase, ImmutableRecord, prim.Variable)
sliced = prim.Variable(quad_inames[i])
i = i + 1
return self.horizontal_index(sf) * self.vertical_width + sliced % self.vertical_width
return self.horizontal_index(sf) + prim.Remainder(sliced, self.vertical_width)
def vec_index_tuple(self, sf):
return (self.vec_index(sf),)
......
......@@ -47,37 +47,51 @@ def no_vectorization(sumfacts):
def vertical_vectorization_strategy(sumfact, depth):
# Assert that this is not already sliced
assert all(mat.slice_size is None for mat in sumfact.matrix_sequence)
result = {}
# Determine which of the matrices in the kernel should be sliced
def determine_slice_direction():
for i, mat in enumerate(sumfact.matrix_sequence):
def determine_slice_direction(sf):
for i, mat in enumerate(sf.matrix_sequence):
if mat.quadrature_size % depth == 0:
return i
elif mat.quadrature_size != 1:
raise PerftoolError("Vertical vectorization is not possible!")
sliced = determine_slice_direction()
kernels = []
oldtab = sumfact.matrix_sequence[sliced]
for i in range(depth):
seq = list(sumfact.matrix_sequence)
seq[sliced] = oldtab.copy(slice_size=depth,
slice_index=i)
kernels.append(sumfact.copy(matrix_sequence=tuple(seq)))
buffer = get_counted_variable("vertical_buffer")
input = get_counted_variable("vertical_input")
if isinstance(sumfact, SumfactKernel):
kernels = [sumfact]
else:
assert isinstance(sumfact, VectorizedSumfactKernel)
kernels = sumfact.kernels
newkernels = []
for sf in kernels:
sliced = determine_slice_direction(sf)
oldtab = sf.matrix_sequence[sliced]
for i in range(depth):
seq = list(sf.matrix_sequence)
seq[sliced] = oldtab.copy(slice_size=depth,
slice_index=i)
newkernels.append(sf.copy(matrix_sequence=tuple(seq)))
if isinstance(sumfact, SumfactKernel):
buffer = get_counted_variable("vertical_buffer")
input = get_counted_variable("vertical_input")
result[sumfact] = VectorizedSumfactKernel(kernels=tuple(newkernels),
buffer=buffer,
input=input,
vertical_width=depth,
)
else:
assert isinstance(sumfact, VectorizedSumfactKernel)
for sf in kernels:
result[sf] = sumfact.copy(kernels=tuple(newkernels),
vertical_width=depth,
)
vsf = VectorizedSumfactKernel(kernels=tuple(kernels),
buffer=buffer,
input=input,
vertical_width=depth,
)
return {sumfact: vsf}
return result
def horizontal_vectorization_strategy(sumfacts, width):
def horizontal_vectorization_strategy(sumfacts, width, allow_padding=1):
result = {}
todo = set(sumfacts)
......@@ -107,20 +121,31 @@ def horizontal_vectorization_strategy(sumfacts, width):
input = get_counted_variable("joined_input")
for sumf in kernels:
if len(kernels) in (width, width - 1):
if len(kernels) in range(width - allow_padding, width + 1):
result[sumf] = VectorizedSumfactKernel(kernels=kernels,
horizontal_width=width,
buffer=buffer,
input=input,
)
else:
result[sumf] = no_vec(sumf)
return result
def diagonal_vectorization_stratget(sumfacts, width):
return horizontal_vectorization_strategy(sumfacts, width)
def diagonal_vectorization_strategy(sumfacts, width):
if width == 4:
horizontal, vertical = 2, 2
else:
raise NotImplementedError
result = {}
horizontal_kernels = horizontal_vectorization_strategy(sumfacts, horizontal, allow_padding=0)
for sf in horizontal_kernels:
vert = vertical_vectorization_strategy(horizontal_kernels[sf], width // horizontal_kernels[sf].horizontal_width)
for k in vert:
result[k] = vert[k]
return result
def decide_vectorization_strategy():
......@@ -145,11 +170,14 @@ def decide_vectorization_strategy():
width = get_vcl_type_size(np.float64)
sfdict.update(**vertical_vectorization_strategy(sumfact, width))
elif get_option("vectorize_diagonal"):
width = get_vcl_type_size(np.float64)
sfdict.update(**diagonal_vectorization_stragegy(sumfact, width))
inputkeys = set(sf.input_key for sf in sumfacts)
for inputkey in inputkeys:
width = get_vcl_type_size(np.float64)
sumfact_filter = [sf for sf in sumfacts if sf.input_key == inputkey]
sfdict.update(**diagonal_vectorization_strategy(sumfact_filter, width))
else:
sfdict.update(**no_vectorization(sumfacts))
# Register the results
for old, new in sfdict.items():
_cache_vectorization_info(old, new)
for sf in sumfacts:
_cache_vectorization_info(sf, sfdict.get(sf, no_vec(sf)))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment