Skip to content
Snippets Groups Projects
Commit 488dabda authored by Dominic Kempf's avatar Dominic Kempf
Browse files

Implement large matrix construction

parent 5504e7d7
No related branches found
No related tags found
No related merge requests found
...@@ -29,9 +29,9 @@ class FlipFlopBuffer(object): ...@@ -29,9 +29,9 @@ class FlipFlopBuffer(object):
base = self.base_storage[self._current] base = self.base_storage[self._current]
# Construct a temporary name # Construct a temporary name
name = kwargs.pop("name", get_counted_variable(self.identifier)) name = kwargs.pop("name", None)
if name is None: if name is None:
from pudb import set_trace; set_trace() name = get_counted_variable(self.identifier)
# Get geometric dimension # Get geometric dimension
formdata = get_global_context_value('formdata') formdata = get_global_context_value('formdata')
......
...@@ -75,14 +75,14 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) ...@@ -75,14 +75,14 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
# Initialize the buffer for the sum fact kernel # Initialize the buffer for the sum fact kernel
shape = (product(mat.cols for mat in a_matrices),) shape = (product(mat.cols for mat in a_matrices),)
if index: if index is not None:
shape = shape + (4,) shape = shape + (4,)
initialize_buffer(buffer, input = initialize_buffer(buffer,
base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
num=2 num=2
).get_temporary(shape=shape, ).get_temporary(shape=shape,
name=input, name=input,
) )
# Setup the input! # Setup the input!
setup_theta(input, element, restriction, component, index) setup_theta(input, element, restriction, component, index)
...@@ -90,14 +90,22 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component) ...@@ -90,14 +90,22 @@ def sumfact_evaluate_coefficient_gradient(element, name, restriction, component)
# Add a sum factorization kernel that implements the # Add a sum factorization kernel that implements the
# evaluation of the gradients of basis functions at quadrature # evaluation of the gradients of basis functions at quadrature
# points (stage 1) # points (stage 1)
var, _ = sum_factorization_kernel(a_matrices, if index:
var, _ = sum_factorization_kernel(a_matrices,
buffer, buffer,
1, 1,
preferred_position=i, preferred_position=i,
insn_dep=frozenset({Writes(input)}), insn_dep=frozenset({Writes(input)}),
output_name=name,
) )
else:
buffers.append(var) var, _ = sum_factorization_kernel(a_matrices,
buffer,
1,
preferred_position=i,
insn_dep=frozenset({Writes(input)}),
)
buffers.append(var)
# TODO this should be quite conditional!!! # TODO this should be quite conditional!!!
for i, buf in enumerate(buffers): for i, buf in enumerate(buffers):
...@@ -139,7 +147,7 @@ def pymbolic_trialfunction(element, restriction, component): ...@@ -139,7 +147,7 @@ def pymbolic_trialfunction(element, restriction, component):
# Flip flop buffers for sumfactorization # Flip flop buffers for sumfactorization
shape = (product(mat.cols for mat in a_matrices),) shape = (product(mat.cols for mat in a_matrices),)
if vec: if index is not None:
shape = shape + (4,) shape = shape + (4,)
initialize_buffer(buffer, initialize_buffer(buffer,
base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
......
...@@ -321,14 +321,20 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -321,14 +321,20 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
a_matrices, buffer, input, index = get_vectorization_info(a_matrices) a_matrices, buffer, input, index = get_vectorization_info(a_matrices)
# Initialize a base storage for this buffer and get a temporay pointing to it # Initialize a base storage for this buffer and get a temporay pointing to it
# shape = product(mat.cols for mat in a_matrices) shape = tuple(mat.cols for mat in a_matrices)
# if vec: dim_tags = ",".join(['f'] * dim)
# shape = (shape, 4) if index is not None:
shape = shape + (4,)
dim_tags = dim_tags + ",c"
index = (index,)
else:
index = ()
temp = initialize_buffer(buffer, temp = initialize_buffer(buffer,
base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices), base_storage_size=product(max(mat.rows, mat.cols) for mat in a_matrices),
num=2 num=2
).get_temporary(shape=(quadrature_points_per_direction(),) * dim, ).get_temporary(shape=shape,
dim_tags=",".join(['f'] * dim), dim_tags=dim_tags,
name=input, name=input,
) )
...@@ -346,7 +352,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -346,7 +352,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
# Issue an instruction in the quadrature loop that fills the buffer # Issue an instruction in the quadrature loop that fills the buffer
# with the evaluation of the contribution at all quadrature points # with the evaluation of the contribution at all quadrature points
assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames())) assignee = Subscript(Variable(temp), tuple(Variable(i) for i in quadrature_inames()) + index)
contrib_dep = instruction(assignee=assignee, contrib_dep = instruction(assignee=assignee,
expression=expression, expression=expression,
forced_iname_deps=frozenset(quadrature_inames() + visitor.inames), forced_iname_deps=frozenset(quadrature_inames() + visitor.inames),
...@@ -403,7 +409,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id): ...@@ -403,7 +409,7 @@ def generate_accumulation_instruction(visitor, accterm, measure, subdomain_id):
transform(nest_quadrature_loops, visitor.inames) transform(nest_quadrature_loops, visitor.inames)
def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), add_vec_tag=False, preferred_position=None): def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), additional_inames=frozenset({}), preferred_position=None, output_name=None):
""" """
Calculate a sum factorization matrix product. Calculate a sum factorization matrix product.
...@@ -426,9 +432,13 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add ...@@ -426,9 +432,13 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
if get_global_context_value("dry_run", False): if get_global_context_value("dry_run", False):
return SumfactKernel(a_matrices, buf, stage, preferred_position), frozenset() return SumfactKernel(a_matrices, buf, stage, preferred_position), frozenset()
ftags = "f,f" + (",vec" if add_vec_tag else "") ftags = "f,f"
ctags = "c,c" + (",vec" if add_vec_tag else "") ctags = "c,c"
vec_shape = (4,) if add_vec_tag else () vec_shape = ()
if isinstance(next(iter(a_matrices)), LargeAMatrix):
ftags = ftags + ",vec"
ctags = ctags + ",vec"
vec_shape = (4,)
insn_dep = frozenset({barrier(depends_on=insn_dep, insn_dep = frozenset({barrier(depends_on=insn_dep,
within_inames=additional_inames, within_inames=additional_inames,
...@@ -495,13 +505,15 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add ...@@ -495,13 +505,15 @@ def sum_factorization_kernel(a_matrices, buf, stage, insn_dep=frozenset({}), add
out_shape = tuple(mat.rows for mat in a_matrices) out_shape = tuple(mat.rows for mat in a_matrices)
dim_tags = ",".join(['f'] * dim) dim_tags = ",".join(['f'] * dim)
if add_vec_tag: if isinstance(next(iter(a_matrices)), LargeAMatrix):
out_shape = out_shape + vec_shape out_shape = out_shape + vec_shape
dim_tags = dim_tags + ",c" dim_tags = dim_tags + ",c"
out = get_buffer_temporary(buf, out = get_buffer_temporary(buf,
shape=out_shape, shape=out_shape,
dim_tags=dim_tags) dim_tags=dim_tags,
name=output_name,
)
silenced_warning('read_no_write({})'.format(out)) silenced_warning('read_no_write({})'.format(out))
return prim.Variable(out), insn_dep return prim.Variable(out), insn_dep
...@@ -10,8 +10,8 @@ import loopy as lp ...@@ -10,8 +10,8 @@ import loopy as lp
@generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, *args: a) @generator_factory(item_tags=("vecinfo", "dryrundata"), cache_key_generator=lambda a, *args: a)
def vectorization_info(a_matrices, buffer, input, index): def vectorization_info(a_matrices, new_a_matrices, buffer, input, index):
return (a_matrices, buffer, input, index) return (new_a_matrices, buffer, input, index)
def get_vectorization_info(a_matrices): def get_vectorization_info(a_matrices):
...@@ -20,20 +20,21 @@ def get_vectorization_info(a_matrices): ...@@ -20,20 +20,21 @@ def get_vectorization_info(a_matrices):
# Return dummy data # Return dummy data
return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) return (a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
try: try:
return vectorization_info(a_matrices, None, None, None) return vectorization_info(a_matrices, None, None, None, None)
except TypeError: except TypeError:
raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt") raise PerftoolError("Sumfact Vectorization data should have been collected in dry run, but wasnt")
def no_vectorization(sumfacts): def no_vectorization(sumfacts):
for sumf in sumfacts: for sumf in sumfacts:
vectorization_info(sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None) vectorization_info(sumf.a_matrices, sumf.a_matrices, get_counted_variable("buffer"), get_counted_variable("input"), None)
def decide_stage_vectorization_strategy(sumfacts, stage): def decide_stage_vectorization_strategy(sumfacts, stage):
stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage]) stage_sumfacts = frozenset([sf for sf in sumfacts if sf.stage == stage])
if len(stage_sumfacts) in (3, 4): if len(stage_sumfacts) in (3, 4):
# Map the sum factorization to their position in the joint kernel # Map the sum factorization to their position in the joint kernel
position_mapping = {}
available = set(range(4)) available = set(range(4))
for sf in stage_sumfacts: for sf in stage_sumfacts:
if sf.preferred_position is not None: if sf.preferred_position is not None:
...@@ -41,16 +42,41 @@ def decide_stage_vectorization_strategy(sumfacts, stage): ...@@ -41,16 +42,41 @@ def decide_stage_vectorization_strategy(sumfacts, stage):
# Later on, more complicated stuff might be necessary here. # Later on, more complicated stuff might be necessary here.
assert sf.preferred_position in available assert sf.preferred_position in available
available.discard(sf.preferred_position) available.discard(sf.preferred_position)
position_mapping[sf] = sf.preferred_position
# Choose a position for those that have no preferred one!
for sumf in stage_sumfacts:
if sumf.preferred_position is None:
position_mapping[sumf] = available.pop()
# Enable vectorization strategy: # Enable vectorization strategy:
input = get_counted_variable("joined_input") input = get_counted_variable("joined_input")
buffer = get_counted_variable("joined_buffer") buffer = get_counted_variable("joined_buffer")
# Collect the large matrices!
large_a_matrices = []
for i in range(len(next(iter(stage_sumfacts)).a_matrices)):
# Assert that the matrices of all sum factorizations have the same size
assert len(set(tuple(sf.a_matrices[i].rows for sf in stage_sumfacts))) == 1
assert len(set(tuple(sf.a_matrices[i].cols for sf in stage_sumfacts))) == 1
# Collect the transpose/derivative information
transpose = [False] * 4
derivative = [False] * 4
for sf in stage_sumfacts:
transpose[position_mapping[sf]] = sf.a_matrices[i].transpose
derivative[position_mapping[sf]] = sf.a_matrices[i].derivative
from dune.perftool.sumfact.amatrix import LargeAMatrix
large = LargeAMatrix(rows=next(iter(stage_sumfacts)).a_matrices[i].rows,
cols=next(iter(stage_sumfacts)).a_matrices[i].cols,
transpose=tuple(transpose),
derivative=tuple(derivative),
)
large_a_matrices.append(large)
for sumf in stage_sumfacts: for sumf in stage_sumfacts:
pref_pos = sumf.preferred_position vectorization_info(sumf.a_matrices, tuple(large_a_matrices), buffer, input, position_mapping[sumf])
if pref_pos is None:
pref_pos = available.pop()
vectorization_info(sumf.a_matrices, buffer, input, pref_pos)
else: else:
# Disable vectorization strategy # Disable vectorization strategy
no_vectorization(stage_sumfacts) no_vectorization(stage_sumfacts)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment