diff --git a/bin/barplot_coarse.py b/bin/barplot_coarse.py index 67de3aefb4e9bb0d4ef42a2b61a51fd28ddc873d..47906024390a599c882437cf439a50685c45d430 100644 --- a/bin/barplot_coarse.py +++ b/bin/barplot_coarse.py @@ -19,19 +19,15 @@ opframe = pandas.read_csv("./operations.csv", av = opframe[opframe.what == "alpha_volume_kernel"] ask = opframe[opframe.what == "alpha_skeleton_kernel"] -ab = opframe[opframe.what == "alpha_boundary_kernel"] deg = [i - 0.3 for i in sorted(av['degree'])] _, y1 = list(itertools.izip(*sorted(itertools.izip(av['degree'], av['ops'])))) _, y2 = list(itertools.izip(*sorted(itertools.izip(ask['degree'], ask['ops'])))) y2 = [a+b for a, b in zip(y1, y2)] -_, y3 = list(itertools.izip(*sorted(itertools.izip(ab['degree'], ab['ops'])))) -y3 = [a+b for a, b in zip(y2, y3)] -y4 = [1.0] * len(deg) +y3 = [1.0] * len(deg) -r4 = ax.bar(deg, y4, width, color="grey") -r3 = ax.bar(deg, y3, width, color="green") -r2 = ax.bar(deg, y2, width, color="blue") +r3 = ax.bar(deg, y3, width, color="grey") +r2 = ax.bar(deg, y2, width, color="blue") r1 = ax.bar(deg, y1, width, color="red") ax.set_ylabel("Percentage") @@ -46,19 +42,15 @@ timeframe = pandas.read_csv("./timeratios.csv", av = timeframe[timeframe.what == "alpha_volume_kernel"] ask = timeframe[timeframe.what == "alpha_skeleton_kernel"] -ab = timeframe[timeframe.what == "alpha_boundary_kernel"] deg = [i + 0.3 for i in sorted(av['degree'])] _, y1 = list(itertools.izip(*sorted(itertools.izip(av['degree'], av['ops'])))) _, y2 = list(itertools.izip(*sorted(itertools.izip(ask['degree'], ask['ops'])))) y2 = [a+b for a, b in zip(y1, y2)] -_, y3 = list(itertools.izip(*sorted(itertools.izip(ab['degree'], ab['ops'])))) -y3 = [a+b for a, b in zip(y2, y3)] -y4 = [1.0] * len(deg) +y3 = [1.0] * len(deg) -r4 = ax.bar(deg, y4, width, color="grey", label="PDELab overhead") -r3 = ax.bar(deg, y3, width, color="green", label="Boundary integrals") -r2 = ax.bar(deg, y2, width, color="blue", label="Skeleton integrals") +r3 = ax.bar(deg, y3, width, color="grey", label="PDELab overhead") +r2 = ax.bar(deg, y2, width, color="blue", label="Skeleton integrals") r1 = ax.bar(deg, y1, width, color="red", label="Volume integrals") ax.legend(loc=3, diff --git a/bin/barplot_fine.py b/bin/barplot_fine.py index 4e050c80ef775780c559ed82bfa6f4a2acba1564..47f4514da839afc43d7c588ce4a7158e5d08e5b0 100644 --- a/bin/barplot_fine.py +++ b/bin/barplot_fine.py @@ -28,7 +28,8 @@ av3 = opframe[opframe.what == "alpha_volume_kernel_stage3"] ask1 = opframe[opframe.what == "alpha_skeleton_kernel_stage1"] ask2 = opframe[opframe.what == "alpha_skeleton_kernel_quadratureloop"] ask3 = opframe[opframe.what == "alpha_skeleton_kernel_stage3"] -ab = opframe[opframe.what == "alpha_boundary_kernel"] +setup1 = opframe[opframe.what == "alpha_volume_kernel_setup"] +setup2 = opframe[opframe.what == "alpha_skeleton_kernel_setup"] deg = [i - width/1.8 for i in sorted(av1['degree'])] @@ -38,23 +39,26 @@ def update(frame, result=None): return y else: return [a+b for a, b in zip(y, result)] - + + y1 = update(av1) y2 = update(av2, y1) y3 = update(av3, y2) -y4 = update(ask1, y3) -y5 = update(ask2, y4) -y6 = update(ask3, y5) -y7 = update(ab, y6) -y8 = [1.0] * len(deg) - -r8 = ax.bar(deg, y8, width, color="grey") -r7 = ax.bar(deg, y7, width, color="green") -r6 = ax.bar(deg, y6, width, color="xkcd:light blue") -r5 = ax.bar(deg, y5, width, color="xkcd:blue") -r4 = ax.bar(deg, y4, width, color="xkcd:dark blue") -r3 = ax.bar(deg, y3, width, color="yellow") -r2 = ax.bar(deg, y2, width, color="orange") +y4 = update(setup1, y3) +y5 = update(ask1, y4) +y6 = update(ask2, y5) +y7 = update(ask3, y6) +y8 = update(setup2, y7) +y9 = [1.0] * len(deg) + +r9 = ax.bar(deg, y9, width, color="grey") +r8 = ax.bar(deg, y8, width, color="xkcd:light green") +r7 = ax.bar(deg, y7, width, color="xkcd:light blue") +r6 = ax.bar(deg, y6, width, color="xkcd:blue") +r5 = ax.bar(deg, y5, width, color="xkcd:dark blue") +r4 = ax.bar(deg, y4, width, color="green") +r3 = ax.bar(deg, y3, width, color="yellow") +r2 = ax.bar(deg, y2, width, color="orange") r1 = ax.bar(deg, y1, width, color="red") ax.set_ylabel("Percentage") @@ -73,25 +77,28 @@ av3 = timeframe[timeframe.what == "alpha_volume_kernel_stage3"] ask1 = timeframe[timeframe.what == "alpha_skeleton_kernel_stage1"] ask2 = timeframe[timeframe.what == "alpha_skeleton_kernel_quadratureloop"] ask3 = timeframe[timeframe.what == "alpha_skeleton_kernel_stage3"] -ab = timeframe[timeframe.what == "alpha_boundary_kernel"] +setup1 = timeframe[timeframe.what == "alpha_volume_kernel_setup"] +setup2 = timeframe[timeframe.what == "alpha_skeleton_kernel_setup"] deg = [i + width/1.8 for i in sorted(av1['degree'])] y1 = update(av1) y2 = update(av2, y1) y3 = update(av3, y2) -y4 = update(ask1, y3) -y5 = update(ask2, y4) -y6 = update(ask3, y5) -y7 = update(ab, y6) -y8 = [1.0] * len(deg) - -r8 = ax.bar(deg, y8, width, label="PDELab overhead", color="grey") -r7 = ax.bar(deg, y7, width, label="Boundary", color="green") -r6 = ax.bar(deg, y6, width, label="Skeleton, backward SF", color="xkcd:light blue") -r5 = ax.bar(deg, y5, width, label="Skeleton, Quadrature", color="xkcd:blue") -r4 = ax.bar(deg, y4, width, label="Skeleton, forward SF", color="xkcd:dark blue") -r3 = ax.bar(deg, y3, width, label="Volume, backward SF", color="yellow") -r2 = ax.bar(deg, y2, width, label="Volume, Quadrature", color="orange") +y4 = update(setup1, y3) +y5 = update(ask1, y4) +y6 = update(ask2, y5) +y7 = update(ask3, y6) +y8 = update(setup2, y7) +y9 = [1.0] * len(deg) + +r9 = ax.bar(deg, y9, width, label="PDELab overhead", color="grey") +r8 = ax.bar(deg, y8, width, label="Geometry evaluations Skeleton", color="xkcd:light green") +r7 = ax.bar(deg, y7, width, label="Skeleton, backward SF", color="xkcd:light blue") +r6 = ax.bar(deg, y6, width, label="Skeleton, Quadrature", color="xkcd:blue") +r5 = ax.bar(deg, y5, width, label="Skeleton, forward SF", color="xkcd:dark blue") +r4 = ax.bar(deg, y4, width, label="Geometry evaluations Volume", color="green") +r3 = ax.bar(deg, y3, width, label="Volume, backward SF", color="yellow") +r2 = ax.bar(deg, y2, width, label="Volume, Quadrature", color="orange") r1 = ax.bar(deg, y1, width, label="Volume, forward SF", color="red") lgd = ax.legend(loc=3, diff --git a/bin/knltimings.sh b/bin/knltimings.sh index e299237fa622cc5c6f7022365c0d98cb60e886c2..4da92dca0743450a78f36f568ea7d50d5cb0d64e 100755 --- a/bin/knltimings.sh +++ b/bin/knltimings.sh @@ -7,12 +7,12 @@ then fi # Search for runnable executables -FILES=$(ls *.ini) +FILES=$(ls *.ini | grep -v '^verify') for inifile in $FILES do line=$(grep ^"opcounter = " $inifile) extract=${line##opcounter = } - UPPER=1 + UPPER=10 if [ $extract -eq 1 ] then UPPER=1 diff --git a/bin/process_measurements.py b/bin/process_measurements.py index 0d45c7293befca1c166617746c13069c3e9c57cc..53fb0873dee3a3cf3719c2faf5f98bc2a2925a18 100755 --- a/bin/process_measurements.py +++ b/bin/process_measurements.py @@ -22,10 +22,27 @@ def get_reference_kernel(kernel): return kernel +def get_reference_integral_kernel(kernel): + if kernel.startswith("alpha") and len(kernel.split("_")) == 4: + return "_".join(kernel.split("_")[:-1]) + if kernel.startswith("jacobian_apply") and len(kernel.split("_")) == 5: + return "_".join(kernel.split("_")[:-1]) + return kernel + + +def get_siblings_kernel(kernel): + s = kernel.split("_") + if (kernel.startswith("alpha") and len(s) == 4) or (kernel.startswith("jacobian_apply") and len(s) == 5): + for suffix in ["setup", "stage1", "quadratureloop", "stage3"]: + yield "_".join(s[:-1] + [suffix]) + else: + yield kernel + def calculate_operations_percentage(): - frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ') + frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ') ops = frame[frame.what != "time"] ops = ops.groupby(('rank', 'ident', 'kernel'))['value'].max().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max() + with open('operations.csv', 'w') as out: for key in ops.keys(): ident, kernel = key @@ -34,18 +51,26 @@ def calculate_operations_percentage(): def calculate_times_percentage(): - frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ') - time = frame[frame.what == "time"] - time = time.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max() + frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ') + + time4 = frame[(frame.what == "time") & (frame.level == 4)] + time4 = time4.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max() + time3 = frame[(frame.what == "time") & (frame.level == 3)] + time3 = time3.groupby(('rank', 'ident', 'kernel'))['value'].min().to_frame().reset_index().groupby(('ident', 'kernel'))['value'].max() + with open('timeratios.csv', 'w') as out: - for key in time.keys(): + for key in time4.keys(): ident, kernel = key degree = re.match(".*deg([0-9]*).*", ident).group(1) - out.write(" ".join([ident, degree, kernel, str(time[ident][kernel] / time[ident][get_reference_kernel(kernel)]) + "\n"])) + stage = time4[ident][kernel] + sum4 = sum(time4[ident][k] for k in get_siblings_kernel(kernel)) + int3 = time3[ident][get_reference_integral_kernel(kernel)] + ref = time3[ident][get_reference_kernel(kernel)] + out.write(" ".join([ident, degree, kernel, str(float(stage*int3) / float(sum4*ref)) + "\n"])) def calculate_floprate(): - frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ') + frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ') time = frame[frame.what == "time"] ops = frame[frame.what != "time"] @@ -56,11 +81,12 @@ def calculate_floprate(): for key in time.keys(): ident, kernel = key degree = re.match(".*deg([0-9]*).*", ident).group(1) - out.write(" ".join([ident, degree, kernel, str((ops[ident][kernel] / time[ident][kernel]) / 1e9)]) + "\n") + operations = ops[ident][kernel] + out.write(" ".join([ident, degree, kernel, str((operations / time[ident][kernel]) / 1e9)]) + "\n") def calculate_doftimes(): - frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'ident', 'kernel', 'what', 'value'), delimiter=' ') + frame = pandas.read_csv('timings.csv', header=None, names=('rank', 'level', 'ident', 'kernel', 'what', 'value'), delimiter=' ') dofs = frame[frame.what == "dofs"] time = frame[frame.what == "time"] diff --git a/dune/perftool/common/opcounter.hh b/dune/perftool/common/opcounter.hh index 9bcd4ce2fc252d6fde7b2c50339904d6cffc6ae4..5a0115e6b5649d45b86db1432d83a66330e00e0e 100644 --- a/dune/perftool/common/opcounter.hh +++ b/dune/perftool/common/opcounter.hh @@ -158,22 +158,22 @@ namespace oc { blend_count = 0; } - template<typename Stream> - void reportOperations(Stream& os, std::string exec, std::string kernel, bool doReset = false) + template<typename Stream, typename T> + void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false) { auto total = addition_count + multiplication_count + division_count + exp_count + pow_count + sin_count + sqrt_count + comparison_count; if (total == 0) return; - os << exec << " " << kernel << " additions " << addition_count << std::endl - << exec << " " << kernel << " multiplications " << multiplication_count << std::endl - << exec << " " << kernel << " divisions " << division_count << std::endl - << exec << " " << kernel << " exp " << exp_count << std::endl - << exec << " " << kernel << " pow " << pow_count << std::endl - << exec << " " << kernel << " sin " << sin_count << std::endl - << exec << " " << kernel << " sqrt " << sqrt_count << std::endl - << exec << " " << kernel << " comparisons " << comparison_count << std::endl - << exec << " " << kernel << " blends " << blend_count << std::endl - << exec << " " << kernel << " totalops " << total << std::endl; + os << level << " " << exec << " " << kernel << " additions " << addition_count << std::endl + << level << " " << exec << " " << kernel << " multiplications " << multiplication_count << std::endl + << level << " " << exec << " " << kernel << " divisions " << division_count << std::endl + << level << " " << exec << " " << kernel << " exp " << exp_count << std::endl + << level << " " << exec << " " << kernel << " pow " << pow_count << std::endl + << level << " " << exec << " " << kernel << " sin " << sin_count << std::endl + << level << " " << exec << " " << kernel << " sqrt " << sqrt_count << std::endl + << level << " " << exec << " " << kernel << " comparisons " << comparison_count << std::endl + << level << " " << exec << " " << kernel << " blends " << blend_count << std::endl + << level << " " << exec << " " << kernel << " totalops " << total << std::endl; if (doReset) reset(); @@ -240,10 +240,10 @@ namespace oc { counters.reset(); } - template<typename Stream> - static void reportOperations(Stream& os, std::string exec, std::string kernel, bool doReset = false) + template<typename Stream, typename T> + static void reportOperations(Stream& os, T level, std::string exec, std::string kernel, bool doReset = false) { - counters.reportOperations(os, exec, kernel, doReset); + counters.reportOperations(os, level, exec, kernel, doReset); } static Counters counters; diff --git a/dune/perftool/common/timer_chrono.hh b/dune/perftool/common/timer_chrono.hh index 6e2098a10a2433d0a3ffdf108d8d226ed82cf3f0..d1b0bd9b5d50e43a32a07623ef417c76337d8af6 100644 --- a/dune/perftool/common/timer_chrono.hh +++ b/dune/perftool/common/timer_chrono.hh @@ -79,35 +79,35 @@ #ifdef ENABLE_COUNTER -#define DUMP_TIMER(name,os,reset)\ +#define DUMP_TIMER(level,name,os,reset)\ if (HP_TIMER_ELAPSED(name) > 1e-12) \ - os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset); + os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ + HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) \ +#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) \ if (HP_TIMER_ELAPSED(name) > 1e-12) \ - os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ time += HP_TIMER_ELAPSED(name); \ ops += HP_TIMER_OPCOUNTERS(name); \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset); + HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); #elif defined ENABLE_HP_TIMERS -#define DUMP_TIMER(name,os,reset) \ +#define DUMP_TIMER(level,name,os,reset) \ if (HP_TIMER_ELAPSED(name) > 1e-12) \ - os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ if (reset) HP_TIMER_RESET(name); -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) \ +#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) \ if (HP_TIMER_ELAPSED(name) > 1e-12) \ - os << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << HP_TIMER_ELAPSED(name) << std::endl; \ time += HP_TIMER_ELAPSED(name); \ if (reset) HP_TIMER_RESET(name); #else -#define DUMP_TIMER(name,os,reset) -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) +#define DUMP_TIMER(level,name,os,reset) +#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) #endif diff --git a/dune/perftool/common/timer_tsc.hh b/dune/perftool/common/timer_tsc.hh index a893fdae507fc2319ce16206b0baefe30e305b81..449badc271f35c81cef500b234eb2d6eb315eaaf 100644 --- a/dune/perftool/common/timer_tsc.hh +++ b/dune/perftool/common/timer_tsc.hh @@ -77,35 +77,35 @@ #ifdef ENABLE_COUNTER -#define DUMP_TIMER(name,os,reset)\ +#define DUMP_TIMER(level,name,os,reset)\ if (HP_TIMER_DURATION(name) > 1e-12) \ - os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset); + os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) \ +#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) \ if (HP_TIMER_DURATION(name) > 1e-12) \ - os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ time += HP_TIMER_DURATION(name); \ ops += HP_TIMER_OPCOUNTERS(name); \ - HP_TIMER_OPCOUNTERS(name).reportOperations(os,ident,#name,reset); + HP_TIMER_OPCOUNTERS(name).reportOperations(os,#level,ident,#name,reset); #elif defined ENABLE_HP_TIMERS -#define DUMP_TIMER(name,os,reset) \ +#define DUMP_TIMER(level,name,os,reset) \ if (HP_TIMER_DURATION(name) > 1e-12) \ - os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ if (reset) HP_TIMER_RESET(name); -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) \ +#define DUMP_AND_ACCUMULATE_TIMER(level, name,os,reset,time,ops) \ if (HP_TIMER_DURATION(name) > 1e-12) \ - os << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ + os << #level << " " << ident << " " << #name << " time " << Dune::PDELab::TSC::seconds(HP_TIMER_DURATION(name)) << std::endl; \ time += HP_TIMER_DURATION(name); \ if (reset) HP_TIMER_RESET(name); #else -#define DUMP_TIMER(name,os,reset) -#define DUMP_AND_ACCUMULATE_TIMER(name,os,reset,time,ops) +#define DUMP_TIMER(level,name,os,reset) +#define DUMP_AND_ACCUMULATE_TIMER(level,name,os,reset,time,ops) #endif diff --git a/dune/perftool/sumfact/transposereg.hh b/dune/perftool/sumfact/transposereg.hh index 9d924bf900afc5fc1865b142cb0501d8a78b7a34..6a40a8ea466f1c36fdbe18d5b7557441589cefdf 100644 --- a/dune/perftool/sumfact/transposereg.hh +++ b/dune/perftool/sumfact/transposereg.hh @@ -33,15 +33,30 @@ void transpose_reg(Vec4d& a0, Vec4d& a1) void transpose_reg(Vec8d& a0, Vec8d& a1, Vec8d& a2, Vec8d& a3) { - Vec8d b0, b1, b2, b3; - b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1); - b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1); - b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3); - b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3); - a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2); - a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2); - a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3); - a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3); + auto ac = _mm512_castpd_si512(a); + auto bc = _mm512_castpd_si512(b); + auto cc = _mm512_castpd_si512(c); + auto dc = _mm512_castpd_si512(d); + + auto t1 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(ac, 0xCC, bc, ac, 6)); + auto t2 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(bc, 0x33, bc, ac, 2)); + auto t3 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(cc, 0xCC, dc, cc, 6)); + auto t4 = _mm512_castsi512_pd(_mm512_mask_alignr_epi64(dc, 0x33, dc, cc, 2)); + + a = _mm512_insertf64x4(t1, _mm512_extractf64x4_pd(t3, 0), 1); + c = _mm512_insertf64x4(t3, _mm512_extractf64x4_pd(t1, 1), 0); + b = _mm512_insertf64x4(t2, _mm512_extractf64x4_pd(t4, 0), 1); + d = _mm512_insertf64x4(t4, _mm512_extractf64x4_pd(t2, 1), 0); +// +// Vec8d b0, b1, b2, b3; +// b0 = blend8d<0,1,8,9,2,3,10,11>(a0, a1); +// b1 = blend8d<4,5,12,13,6,7,14,15>(a0, a1); +// b2 = blend8d<0,1,8,9,2,3,10,11>(a2, a3); +// b3 = blend8d<4,5,12,13,6,7,14,15>(a2, a3); +// a0 = blend8d<0,1,2,3,8,9,10,11>(b0, b2); +// a1 = blend8d<4,5,6,7,12,13,14,15>(b0, b2); +// a2 = blend8d<0,1,2,3,8,9,10,11>(b1, b3); +// a3 = blend8d<4,5,6,7,12,13,14,15>(b1, b3); } void transpose_reg (Vec8d& a0, Vec8d& a1) diff --git a/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch b/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch new file mode 100644 index 0000000000000000000000000000000000000000..10f1cd949df56339c25682fb578343e16774ceef --- /dev/null +++ b/patches/vectorclass/0001-Add-an-alternative-implementation-of-horizontal_add-.patch @@ -0,0 +1,41 @@ +From d46b04686c44bea537773c99e5001482972a454a Mon Sep 17 00:00:00 2001 +From: Dominic Kempf <dominic.kempf@iwr.uni-heidelberg.de> +Date: Thu, 12 Oct 2017 15:14:48 +0200 +Subject: [PATCH] Add an alternative implementation of horizontal_add for Vec8d + +--- + vectorf512.h | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/vectorf512.h b/vectorf512.h +index 0845d12..fb1a4d4 100644 +--- a/vectorf512.h ++++ b/vectorf512.h +@@ -1339,13 +1339,19 @@ static inline Vec8d if_mul (Vec8db const & f, Vec8d const & a, Vec8d const & b) + + + // General arithmetic functions, etc. ++extern __inline double ++__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) ++_mm512_cvtsd_f64 (__m512d __A) ++{ ++ return __A[0]; ++} + + // Horizontal add: Calculates the sum of all vector elements. +-static inline double horizontal_add (Vec8d const & a) { +-#if defined(__INTEL_COMPILER) +- return _mm512_reduce_add_pd(a); +-#else +- return horizontal_add(a.get_low() + a.get_high()); ++static inline double horizontal_add (Vec8d const & x) { ++ __m512d intermediate = _mm512_add_pd(x, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(x), _mm512_castpd_si512(x), 1)$ ++ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castp$ ++ intermediate = _mm512_add_pd(intermediate, _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(intermediate), _mm512_castp$ ++ return _mm512_cvtsd_f64(intermediate); + #endif + } + +-- +2.1.4 + diff --git a/python/dune/perftool/generation/cpp.py b/python/dune/perftool/generation/cpp.py index 0db90171a584a2237314fbf399e89656c7154a21..0f44b6950ef9b111f90f0d7a922835912c5b4500 100644 --- a/python/dune/perftool/generation/cpp.py +++ b/python/dune/perftool/generation/cpp.py @@ -2,7 +2,7 @@ Define some generators based on the caching mechanism that are commonly needed for code generation """ - +from dune.perftool.options import get_option from dune.perftool.generation import generator_factory from dune.perftool.cgen.clazz import AccessModifier, BaseClass, ClassMember @@ -48,5 +48,5 @@ def dump_accumulate_timer(name): # reset = name_time_dumper_reset() reset = 'false' - code = "DUMP_TIMER({},{},{});".format(name, os, reset) + code = "DUMP_TIMER({},{},{},{});".format(get_option("instrumentation_level"), name, os, reset) return code diff --git a/python/dune/perftool/pdelab/driver/__init__.py b/python/dune/perftool/pdelab/driver/__init__.py index 2c38881f4a7351a199bc43f1515d8fc55c9914d2..46ced0b36a5032a32f1aa2ca59e9879e998a96c8 100644 --- a/python/dune/perftool/pdelab/driver/__init__.py +++ b/python/dune/perftool/pdelab/driver/__init__.py @@ -295,7 +295,7 @@ def generate_driver(formdatas, data): post_include("HP_DECLARE_TIMER(driver);\n", filetag="driver") contents.insert(0, Line(text="HP_TIMER_START(driver);\n")) contents.insert(len(contents) - 1, Line(text="HP_TIMER_STOP(driver);\n")) - contents.insert(len(contents) - 1, Line(text="DUMP_TIMER(driver, {}, true);\n".format(timestream))) + contents.insert(len(contents) - 1, Line(text="DUMP_TIMER({}, driver, {}, true);\n".format(get_option("instrumentation_level"), timestream))) contents.insert(0, Line(text="\n")) driver_body = Block(contents) driver = FunctionBody(driver_signature, driver_body) diff --git a/python/dune/perftool/pdelab/driver/solve.py b/python/dune/perftool/pdelab/driver/solve.py index 389a10c4c350801d71b7035e7a8115de8f8cff16..8042492c357580f23270f01718d02fc2823d99bd 100644 --- a/python/dune/perftool/pdelab/driver/solve.py +++ b/python/dune/perftool/pdelab/driver/solve.py @@ -67,7 +67,7 @@ def dune_solve(): solve = ["HP_TIMER_START(solve);", "{}".format(solve), "HP_TIMER_STOP(solve);", - "DUMP_TIMER(solve, {}, true);".format(timestream), + "DUMP_TIMER({}, solve, {}, true);".format(get_option("instrumentation_level"), timestream), ] if get_option('instrumentation_level') >= 3: solve.extend(print_times) diff --git a/python/dune/perftool/pdelab/driver/timings.py b/python/dune/perftool/pdelab/driver/timings.py index d420e61eba8adfcad4188f3cdf9e7441de610ba1..5c43a67a1550749dd92776688940744d711d2731 100644 --- a/python/dune/perftool/pdelab/driver/timings.py +++ b/python/dune/perftool/pdelab/driver/timings.py @@ -11,7 +11,9 @@ from dune.perftool.pdelab.driver import (get_formdata, name_initree, name_mpihelper, ) -from dune.perftool.pdelab.driver.gridfunctionspace import (name_trial_gfs, +from dune.perftool.pdelab.driver.gridfunctionspace import (name_leafview, + name_trial_gfs, + type_leafview, ) from dune.perftool.pdelab.driver.gridoperator import (name_gridoperator, name_localoperator, @@ -37,9 +39,16 @@ def name_timing_identifier(): @preamble def dump_dof_numbers(stream): ident = name_timing_identifier() - return "{} << {} << \" dofs dofs \" << {}.size() << std::endl;".format(stream, - ident, - name_trial_gfs()) + level = get_option("instrumentation_level") + include_file("dune/pdelab/common/partitionviewentityset.hh", filetag="driver") + gvt = type_leafview() + gv = name_leafview() + return ["Dune::PDELab::NonOverlappingEntitySet<{}> es({});".format(gvt, gv), + "{} << \"{} \" << {} << \" dofs dofs \" << {}.maxLocalSize() * es.size(0) << std::endl;".format(stream, + level, + ident, + name_trial_gfs()) + ] @preamble @@ -98,7 +107,7 @@ def evaluate_residual_timer(): evaluation = ["HP_TIMER_START(residual_evaluation);", "{}.residual({}, r);".format(n_go, v), "HP_TIMER_STOP(residual_evaluation);", - "DUMP_TIMER(residual_evaluation, {}, true);".format(timestream)] + "DUMP_TIMER({}, residual_evaluation, {}, true);".format(get_option("instrumentation_level"), timestream)] evaluation.extend(print_times) else: evaluation = ["{}.residual({}, r);".format(n_go, v)] @@ -137,7 +146,7 @@ def apply_jacobian_timer(): evaluation = ["HP_TIMER_START(apply_jacobian);", "{}.jacobian_apply({}, j);".format(n_go, v), "HP_TIMER_STOP(apply_jacobian);", - "DUMP_TIMER(apply_jacobian, {}, true);".format(timestream)] + "DUMP_TIMER({}, apply_jacobian, {}, true);".format(get_option("instrumentation_level"), timestream)] evaluation.extend(print_times) else: evaluation = ["{}.jacobian_apply({}, j);".format(n_go, v)] @@ -174,7 +183,7 @@ def assemble_matrix_timer(): assembly = ["HP_TIMER_START(matrix_assembly);", "{}.jacobian({},m);".format(n_go, v), "HP_TIMER_STOP(matrix_assembly);", - "DUMP_TIMER(matrix_assembly, {}, true);".format(timestream)] + "DUMP_TIMER({}, matrix_assembly, {}, true);".format(get_option("instrumentation_level"), timestream)] assembly.extend(print_times) else: assembly = ["{}.jacobian({},m);".format(n_go, v)] diff --git a/python/dune/perftool/ufl/visitor.py b/python/dune/perftool/ufl/visitor.py index 972a4bb7ac39c847c841993b12165b8264a36f73..7a3b15d201c5968b0eb4524964e4270523d9be1b 100644 --- a/python/dune/perftool/ufl/visitor.py +++ b/python/dune/perftool/ufl/visitor.py @@ -55,8 +55,8 @@ class UFL2LoopyVisitor(ModifiedTerminalTracker): self.current_info = info expr = self._call(o, False) if expr != 0: - from dune.perftool.sympy import simplify_pymbolic_expression - expr = simplify_pymbolic_expression(expr) + # from dune.perftool.sympy import simplify_pymbolic_expression + # expr = simplify_pymbolic_expression(expr) self.interface.generate_accumulation_instruction(expr, self) def _call(self, o, do_predicates):