diff --git a/python/dune/perftool/sumfact/autotune.py b/python/dune/perftool/sumfact/autotune.py index a9517c43b9144aa19a200e10931d8dc455928b81..1b48dffc14871588d19b278fa77c004f92079984 100644 --- a/python/dune/perftool/sumfact/autotune.py +++ b/python/dune/perftool/sumfact/autotune.py @@ -64,6 +64,7 @@ def generate_standalone_code(sf, filename, logname): "#include<dune/perftool/common/tsc.hh>\n", "#include<dune/perftool/common/vectorclass.hh>\n", "#include<dune/perftool/sumfact/onedquadrature.hh>\n", + "#include<dune/perftool/sumfact/horizontaladd.hh>\n", "#include<random>\n", "#include<fstream>\n", "\n" @@ -98,6 +99,16 @@ def generate_standalone_code(sf, filename, logname): " char buffer1[{}] __attribute__ ((aligned (32)));\n".format(size), ]) + # Set up fastdg inputs: args containing "jacobian" are initialised to 0, every other arg becomes an aligned RF array of basis_size entries (squared when sf.within_inames is set) + for arg in sf.interface.signature_args: + if "jacobian" in arg: + f.write("{} = 0;\n".format(arg)) + else: + basis_size = product(m.basis_size for m in sf.matrix_sequence) + if sf.within_inames: + basis_size = basis_size * basis_size + f.write("RF {}[{}] __attribute__ ((aligned (32)));\n".format(arg.split()[-1], basis_size)) + # Write stuff into the input buffer f.writelines([" {0} *input = ({0} *)buffer0;\n".format(real), " {0} *output = ({0} *)buffer{1};\n".format(real, sf.length % 2), @@ -129,7 +140,7 @@ def generate_standalone_code(sf, filename, logname): # Add setup code for theta matrices. We add some lines not necessary, # but it would be more work to remove them than keeping them. for line in lp.generate_body(constructor_knl).split("\n")[1:-1]: - if "gfsu" not in line and "meshwidth" not in line: + if "gfsu" not in line and "meshwidth" not in line and "geometry" not in line: f.write(" {}\n".format(line)) # INtroduces a variable that makes sure that the kernel cannot be optimized away