diff --git a/python/dune/codegen/loopy/transformations/performance.py b/python/dune/codegen/loopy/transformations/performance.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8995e32687078aa8272086f210b6da5e088e916
--- /dev/null
+++ b/python/dune/codegen/loopy/transformations/performance.py
@@ -0,0 +1,7 @@
+from dune.codegen.options import get_form_option
+from dune.codegen.sumfact.transformations import sumfact_performance_transformations
+
+def performance_transformations(kernel, signature):
+    if get_form_option("sumfact"):
+        kernel = sumfact_performance_transformations(kernel, signature)
+    return kernel
diff --git a/python/dune/codegen/pdelab/localoperator.py b/python/dune/codegen/pdelab/localoperator.py
index 70ab9f4793a6163559d070e33e3600f9cd8c5578..3c06fa2bc0dcd14f68248a4c063dc0ba906b2568 100644
--- a/python/dune/codegen/pdelab/localoperator.py
+++ b/python/dune/codegen/pdelab/localoperator.py
@@ -593,6 +593,10 @@ def extract_kernel_from_cache(tag, name, signature, wrap_in_cgen=True, add_timin
     for trafo in transformations:
         kernel = trafo[0](kernel, *trafo[1], **trafo[2])
 
+    # Apply performance transformations
+    from dune.codegen.loopy.transformations.performance import performance_transformations
+    kernel = performance_transformations(kernel, signature)
+
     from dune.codegen.loopy import heuristic_duplication
     kernel = heuristic_duplication(kernel)
 
diff --git a/python/dune/codegen/sumfact/accumulation.py b/python/dune/codegen/sumfact/accumulation.py
index 8a22a1d8f45c79d90b9a6e9fed643bdc30787c59..c1ccd13a5552980db0be3788651065dc8a39654c 100644
--- a/python/dune/codegen/sumfact/accumulation.py
+++ b/python/dune/codegen/sumfact/accumulation.py
@@ -548,9 +548,6 @@ def generate_accumulation_instruction(expr, visitor):
     from dune.codegen.sumfact.vectorization import attach_vectorization_info
     vsf = attach_vectorization_info(sf)
 
-    from dune.codegen.sumfact.transformations import attach_transformations
-    vsf = attach_transformations(sf, vsf)
-
     # Make sure we have a buffer that we can set up the input with
     buffer = vsf.buffer
     if buffer is None:
diff --git a/python/dune/codegen/sumfact/basis.py b/python/dune/codegen/sumfact/basis.py
index 57287efeb4aaef4d0243799e3561731c98aa2272..757d51870aa7ccf690b9362b9b0522a4892f792b 100644
--- a/python/dune/codegen/sumfact/basis.py
+++ b/python/dune/codegen/sumfact/basis.py
@@ -220,9 +220,6 @@ class SumfactBasisMixin(GenericBasisMixin):
         from dune.codegen.sumfact.vectorization import attach_vectorization_info
         vsf = attach_vectorization_info(sf)
 
-        from dune.codegen.sumfact.transformations import attach_transformations
-        vsf = attach_transformations(sf, vsf)
-
         self.indices = None
 
         # If this sum factorization kernel was not used in the dry run we
diff --git a/python/dune/codegen/sumfact/geometry.py b/python/dune/codegen/sumfact/geometry.py
index acb3599b0ffce6d4f393d90d6249f3083328cb56..8ac8aaa4faa1a4849c3b88fb54f87b0f87ca7b2b 100644
--- a/python/dune/codegen/sumfact/geometry.py
+++ b/python/dune/codegen/sumfact/geometry.py
@@ -224,9 +224,6 @@ class SumfactMultiLinearGeometryMixin(GenericPDELabGeometryMixin):
         from dune.codegen.sumfact.vectorization import attach_vectorization_info
         vsf = attach_vectorization_info(sf)
 
-        from dune.codegen.sumfact.transformations import attach_transformations
-        vsf = attach_transformations(sf, vsf)
-
         # If this sum factorization kernel was not used in the dry run we
         # just return 0
         if vsf == 0:
@@ -545,9 +542,6 @@ def _name_jacobian(i, j, restriction, visitor):
     from dune.codegen.sumfact.vectorization import attach_vectorization_info
     vsf = attach_vectorization_info(sf)
 
-    from dune.codegen.sumfact.transformations import attach_transformations
-    vsf = attach_transformations(sf, vsf)
-
     # If this sum factorization kernel was not used in the dry run we
     # just return 0
     if vsf == 0:
diff --git a/python/dune/codegen/sumfact/transformations.py b/python/dune/codegen/sumfact/transformations.py
index f059c4c567852da01f116eeeb158eb2a208b6f89..ffe5ca4181843233b9693ddda9bc117afae2d54f 100644
--- a/python/dune/codegen/sumfact/transformations.py
+++ b/python/dune/codegen/sumfact/transformations.py
@@ -1,19 +1,22 @@
+import re
+
 import loopy as lp
 import pymbolic.primitives as prim
 import islpy as isl
 
 from dune.codegen.generation import get_global_context_value
 from dune.codegen.loopy.transformations.remove_reductions import remove_all_reductions
-from dune.codegen.options import get_form_option
+from dune.codegen.options import get_form_option, get_option
 from dune.codegen.pdelab.geometry import world_dimension
+from dune.codegen.error import CodegenAutotuneError
 
-def move_zero_assignment_up(knl, move_up_inames):
+def move_zero_assignment_up(kernel, move_up_inames):
     if len(move_up_inames) == 0:
-        return knl
+        return kernel
 
     # Find the instruction we want to move around
     cond = lp.match.Tagged('set_zero')
-    instructions = lp.find_instructions(knl, cond)
+    instructions = lp.find_instructions(kernel, cond)
     move_iname_set = set(map(lambda x: prim.Variable(x), move_up_inames))
     instr = None
     for i in instructions:
@@ -25,12 +28,12 @@ def move_zero_assignment_up(knl, move_up_inames):
     assert (instr!=None)
 
     # Remove it
-    knl = lp.remove_instructions(knl, set([instr.id]))
+    kernel = lp.remove_instructions(kernel, set([instr.id]))
 
     # Create loop domains: In order to move it upwards we need to create
     # additional loops
     iname_appendix = '_move_up'
-    domains = knl.domains
+    domains = kernel.domains
     for iname in move_up_inames:
         # Find loop bound for this iname
         for dom in domains:
@@ -66,12 +69,12 @@ def move_zero_assignment_up(knl, move_up_inames):
     instructions = []
     instructions.append(instr.copy(assignee=assignee,
                                    within_inames=frozenset(within_inames)))
-    knl = knl.copy(instructions=knl.instructions + instructions,
+    kernel = kernel.copy(instructions=kernel.instructions + instructions,
                    domains=domains)
 
     # Add dependency to inner assignment instructions
     cond = lp.match.Tagged('assignment')
-    assignment_instructions = lp.find_instructions(knl, cond)
+    assignment_instructions = lp.find_instructions(kernel, cond)
     instr = None
     for i in assignment_instructions:
         instr_iname_set = set(i.assignee.index_tuple)
@@ -82,12 +85,12 @@ def move_zero_assignment_up(knl, move_up_inames):
     id_zero = instructions[0].id
 
     cond = lp.match.Id(instr.id)
-    knl = lp.add_dependency(knl, cond, id_zero)
+    kernel = lp.add_dependency(kernel, cond, id_zero)
 
-    return knl
+    return kernel
 
 
-def reorder_loops_in_tensor_contraction(knl, iname_order):
+def reorder_loops_in_tensor_contraction(kernel, iname_order):
     """Reorder the loop nest of the tensor contractions
 
     iname_order is a string that specifies the loop order. We use the following convention:
@@ -119,7 +122,7 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
     # needs a rework anyway so I just do the 3D case first.
     assert dim==3
 
-    knl = remove_all_reductions(knl)
+    kernel = remove_all_reductions(kernel)
 
     # TODO: Doc after rewrite
     reduction_iname = 'j'
@@ -132,7 +135,7 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
 
     # cond = lp.match.Tagged('set_zero')
     cond = lp.match.Tagged('assignment')
-    instructions = lp.find_instructions(knl, cond)
+    instructions = lp.find_instructions(kernel, cond)
     for instr in instructions:
         inames = tuple(map(lambda x: x.name, instr.assignee.index_tuple))
         current_move_up_inames = []
@@ -141,12 +144,11 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
                 if i.find(j) >= 0:
                     current_move_up_inames.append(i)
 
-        knl = move_zero_assignment_up(knl, current_move_up_inames)
+        kernel = move_zero_assignment_up(kernel, current_move_up_inames)
 
         # TODO: There should be a better method than searching the string for
         # 'sf_red'. Unfortunately there are sometimes Call instructions due to
        # broadcasts. That makes different ways difficult.
-        import re
        regex = re.compile('sf_red_([0-9]*)')
        reduction_index = set(regex.findall(str(instr)))
        assert len(reduction_index) == 1
@@ -162,90 +164,21 @@ def reorder_loops_in_tensor_contraction(knl, iname_order):
         for i in current_move_up_inames:
             prefered_iname_order.append(i)
         prefered_iname_order = tuple(prefered_iname_order)
-        knl = lp.prioritize_loops(knl, prefered_iname_order)
-
-    return knl
-
-
-class SumfactKernelFunctionTransformation(object):
-    def kernel_transformation(self):
-        """Transformation that will be applied to sumfact kernel function"""
-        raise NotImplementedError
-
-    def name_appendix(self):
-        """Name will be appended to name of sumfact kernel function"""
-        raise NotImplementedError
-
-
-class LoopOrderTransformation(SumfactKernelFunctionTransformation):
-    def __init__(self, order):
-        self.order = order
-
-    def kernel_transformation(self):
-        return (reorder_loops_in_tensor_contraction, self.order, {})
-
-    def name_appendix(self):
-        return 'looporder{}'.format(self.order)
-
-
-def autotune_loop_order(sf):
-    # Baseline is the non transformed kernel
-    from dune.codegen.sumfact.autotune import autotune_realization
-    best_order = None
-    best_cost = autotune_realization(sf)
-
-    # Go through all loop orderings. Note: Due to reduction removal there can
-    # be differences even in the case where the loop order here is the same as
-    # above.
-    import itertools as it
-    loop_orders = [''.join(p) for p in it.permutations('lkij')]
-    for order in loop_orders:
-        from dune.codegen.sumfact.transformations import reorder_loops_in_tensor_contraction
-        trafo = LoopOrderTransformation(order)
-        transformed_sf = sf.copy(transformations=sf.transformations + (trafo,))
-        cost = autotune_realization(transformed_sf)
-        if cost < best_cost:
-            best_cost = cost
-            best_order = order
-
-    return best_order
-
-def attach_transformations(sf, vsf):
-    if vsf == 0:
-        return 0
-
-    if get_global_context_value("dry_run") == None:
-
-        from dune.codegen.options import set_form_option, set_option
-        # TODO
-        #
-        # set_form_option("sumfact_optimize_loop_order", True)
-        # set_option("autotune_google_benchmark", True)
-        if get_form_option("sumfact_optimize_loop_order"):
-            if get_global_context_value("integral_type") == 'cell':
-                # Find best loop order and store transformation in sumfact kernel
-                loop_order = autotune_loop_order(vsf)
-                print("palpo loop_order: {}".format(loop_order))
-                if loop_order != None:
-                    trafo = LoopOrderTransformation(loop_order)
-                    vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-        # Map the original kernel to the transformed kernel
-        from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-        _cache_vectorization_info(sf, vsf)
-
-        # TODO
-        #
-        # Exapmle for such an transformation
-        # if get_global_context_value("integral_type") == 'cell':
-        #     loop_order = 'lkji'
-        #     trafo = LoopOrderTransformation(loop_order)
-        #     vsf = vsf.copy(transformations=vsf.transformations + (trafo,))
-
-        #     # Map the original kernel to the transformed kernel
-        #     from dune.codegen.sumfact.vectorization import _cache_vectorization_info
-        #     _cache_vectorization_info(sf, vsf)
-
-        return vsf
-
-    return sf
+        kernel = lp.prioritize_loops(kernel, prefered_iname_order)
+
+    return kernel
+
+
+def sumfact_performance_transformations(kernel, signature):
+    if kernel.name.startswith('sfimpl'):
+        # from dune.codegen.loopy.transformations.matchfma import match_fused_multiply_add
+        # kernel = match_fused_multiply_add(kernel)
+        # kernel = reorder_loops_in_tensor_contraction(kernel, 'ijlk')
+
+        # from dune.codegen.sumfact.autotune import autotune_realization
+        # test = autotune_realization(kernel=kernel, signature=signature)
+
+        # from pudb import set_trace; set_trace()
+
+        pass
+    return kernel
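Editor's note (not part of the patch): with this change, performance transformations run on the finished loopy kernel inside extract_kernel_from_cache, where the C function signature is available, instead of being attached to SumfactKernel objects while the form is visited. The new sumfact_performance_transformations is still a stub that matches kernels named 'sfimpl...' and returns them unchanged; its commented lines sketch the intended use. Below is a minimal sketch of a filled-in body, assuming a hypothetical form option "sumfact_performance_loop_order" that this patch does not introduce; get_form_option and reorder_loops_in_tensor_contraction are already in scope in transformations.py, where this body would live.

def sumfact_performance_transformations(kernel, signature):
    # Sketch only, not part of the patch: sum factorization kernel
    # functions are the ones named 'sfimpl...'.
    if kernel.name.startswith('sfimpl'):
        # Hypothetical option, not defined anywhere in this patch.
        order = get_form_option("sumfact_performance_loop_order")
        if order:
            # A permutation of 'lkij', following the convention documented
            # on reorder_loops_in_tensor_contraction ('j' is the reduction
            # iname).
            kernel = reorder_loops_in_tensor_contraction(kernel, order)
    return kernel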
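Design note: because the transformation now happens after vectorization is finished, the removed attach_transformations no longer needs to re-cache the transformed kernel via _cache_vectorization_info, and the hook receives the signature argument that the commented-out autotune_realization(kernel=kernel, signature=signature) call expects.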