diff --git a/python/dune/codegen/sumfact/autotune.py b/python/dune/codegen/sumfact/autotune.py index 8537a819a2e3ed67d66bb67aa8ccb91c93ee0b34..a1d7442addc4e2d1d720b1cc81d8a914c1f75532 100644 --- a/python/dune/codegen/sumfact/autotune.py +++ b/python/dune/codegen/sumfact/autotune.py @@ -115,11 +115,12 @@ def write_setup_code(sf, filename, define_thetas=True): constructor_knl = lp.get_one_scheduled_kernel(constructor_knl) # Allocate buffers + alignment = get_option("max_vector_width") // 8 size = max(product(m.quadrature_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width, product(m.basis_size for m in sf.matrix_sequence_quadrature_permuted) * sf.vector_width) size = int(size * (get_option("precision_bits") / 8)) - f.writelines([" char buffer0[{}] __attribute__ ((aligned (32)));\n".format(size), - " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), + f.writelines([" char buffer0[{}] __attribute__ ((aligned ({})));\n".format(size, alignment), + " char buffer1[{}] __attribute__ ((aligned ({})));\n".format(size, alignment), ]) # Setup fastdg inputs @@ -128,7 +129,7 @@ def write_setup_code(sf, filename, define_thetas=True): f.write("{} = 0;\n".format(arg)) else: size = sf.interface.fastdg_interface_object_size - f.write(" RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], size)) + f.write(" RF {}[{}] __attribute__ ((aligned ({})));\n".format(arg.split()[-1], size, alignment)) # Write stuff into the input buffer f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real),