diff --git a/.gitignore b/.gitignore index 202c75c..779591a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /ntl-9.6.2.tar.gz /ntl-9.6.4.tar.gz /ntl-9.7.0.tar.gz +/ntl-9.8.0.tar.gz diff --git a/ntl-loadtime-cpu.patch b/ntl-loadtime-cpu.patch new file mode 100644 index 0000000..117711f --- /dev/null +++ b/ntl-loadtime-cpu.patch @@ -0,0 +1,2286 @@ +--- doc/config.txt.orig 2016-04-26 05:40:15.000000000 -0600 ++++ doc/config.txt 2016-04-26 18:22:38.925719916 -0600 +@@ -291,6 +291,7 @@ NTL_GF2X_NOINLINE=off + NTL_GF2X_ALTCODE=off + NTL_GF2X_ALTCODE1=off + NTL_PCLMUL=off ++NTL_LOADTIME_CPU=off + + GMP_INCDIR=$(GMP_PREFIX)/include + GMP_LIBDIR=$(GMP_PREFIX)/lib +@@ -638,6 +639,10 @@ NTL_PCLMUL=off + # switch to enable the PCLMUL instruction on x86 machines for faster arithmetic + # over GF(2)[X] (without relying on the gf2x package) + ++NTL_LOADTIME_CPU=off ++ ++# switch to check CPU characteristics at load time and use routines ++# optimized for the executing CPU. + + + ########## More GMP Options: +--- include/NTL/config.h.orig 2016-04-26 05:40:16.000000000 -0600 ++++ include/NTL/config.h 2016-04-26 18:22:38.929719590 -0600 +@@ -616,6 +616,23 @@ using the configure script. + #endif + + ++#if 0 ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. ++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif ++ + + + +--- include/NTL/ctools.h.orig 2016-04-26 05:40:15.000000000 -0600 ++++ include/NTL/ctools.h 2016-04-26 18:22:38.930719509 -0600 +@@ -422,6 +422,137 @@ void _ntl_swap(T*& a, T*& b) + // this should be big enough to satisfy any SIMD instructions, + // and it should also be as big as a cache line + ++/* Determine CPU characteristics at runtime */ ++#ifdef NTL_LOADTIME_CPU ++#if !defined(__x86_64__) ++#error Runtime CPU support is only available on x86_64. ++#endif ++#ifndef __GNUC__ ++#error Runtime CPU support is only available with GCC. ++#endif ++#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6) ++#error Runtime CPU support is only available with GCC 4.6 or later. 
++#endif ++ ++#include ++#ifndef bit_PCLMUL ++#define bit_PCLMUL (1 << 1) ++#endif ++#ifndef bit_AVX ++#define bit_AVX (1 << 28) ++#endif ++#ifndef bit_FMA ++#define bit_FMA (1 << 12) ++#endif ++#ifndef bit_AVX2 ++#define bit_AVX2 (1 << 5) ++#endif ++ ++#define BASE_FUNC(type,name) static type name##_base ++#define TARGET_FUNC(arch,suffix,type,name) \ ++ static type __attribute__((target (arch))) name##_##suffix ++#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name) ++#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name) ++#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name) ++#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name) ++#define PCLMUL_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ if (have_avx) return (void (*)(void))&name##_avx; \ ++ if (have_pclmul) return (void (*)(void))&name##_pclmul; \ ++ return (void (*)(void))&name##_base; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ return have_avx \ ++ ? (void (*)(void))&name##_avx \ ++ : (void (*)(void))&name##_base; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define FMA_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ return have_fma \ ++ ? 
(void (*)(void))&name##_fma \ ++ : (void (*)(void))&name##_avx; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX2_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_avx2, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \ ++ have_avx2 = ((ebx & bit_AVX2) != 0); \ ++ } else { \ ++ have_avx2 = 0; \ ++ } \ ++ } \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ unsigned int eax, ebx, ecx, edx; \ ++ return have_avx2 \ ++ ? (void (*)(void))&name##_avx2 \ ++ : (void (*)(void))&name##_fma; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#endif + + + +--- include/NTL/def_config.h.orig 2016-04-26 05:40:15.000000000 -0600 ++++ include/NTL/def_config.h 2016-04-26 18:22:38.930719509 -0600 +@@ -616,6 +616,22 @@ using the configure script. + #endif + + ++#if 0 ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. ++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif + + + +--- src/cfile.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/cfile 2016-04-26 18:22:38.931719428 -0600 +@@ -616,6 +616,23 @@ using the configure script. + #endif + + ++#if @{NTL_LOADTIME_CPU} ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. 
++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif ++ + + @{WIZARD_HACK} + +--- src/DispSettings.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/DispSettings.c 2016-04-26 18:22:38.931719428 -0600 +@@ -186,6 +186,9 @@ cout << "Performance Options:\n"; + cout << "NTL_PCLMUL\n"; + #endif + ++#ifdef NTL_LOADTIME_CPU ++ cout << "NTL_LOADTIME_CPU\n"; ++#endif + + cout << "\n\n"; + +--- src/DoConfig.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/DoConfig 2016-04-26 18:24:47.237292382 -0600 +@@ -1,7 +1,7 @@ + # This is a perl script, invoked from a shell + + # use warnings; # this doesn't work on older versions of perl +- ++use Config; + + + %MakeFlag = ( +@@ -82,6 +82,7 @@ + 'NTL_RANGE_CHECK' => 'off', + 'NTL_FFT_BIGTAB' => 'off', + 'NTL_FFT_LAZYMUL' => 'off', ++'NTL_LOADTIME_CPU' => 'off', + + ); + +@@ -148,6 +149,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' & + } + + ++# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE ++ ++if ($ConfigFlag{'NTL_LOADTIME_CPU'} eq 'on') { ++ if (!$Config{archname} =~ /x86_64/) { ++ die "Error: NTL_LOADTIME_CPU currently only available with x86_64...sorry\n"; ++ } ++ $ConfigFlag{'NTL_GF2X_NOINLINE'} = 'on'; ++} ++ + + # some special MakeVal values that are determined by SHARED + +--- src/GF2X1.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/GF2X1.c 2016-04-26 18:22:38.933719265 -0600 +@@ -19,7 +19,7 @@ + // simple scaling factor for some crossover points: + // we use a lower crossover of the underlying multiplication + // is faster +-#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL)) ++#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL) || defined (NTL_LOADTIME_CPU)) + #define XOVER_SCALE (1L) + #else + #define XOVER_SCALE (2L) +--- src/GF2X.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/GF2X.c 2016-04-26 18:22:38.933719265 -0600 +@@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned + __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); + _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0)); + } ++#elif defined (NTL_LOADTIME_CPU) ++ ++#include ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; ++ ++#define NTL_INLINE inline ++ ++#define pclmul_mul1(c,a,b) do { \ ++ __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \ ++ __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \ ++ _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \ ++} while (0) ++ + #else + + +@@ -579,6 +595,27 @@ void add(GF2X& x, const GF2X& a, const G + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ NTL_EFF_BB_MUL_CODE0 ++} ++ ++PCLMUL_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} ++ ++AVX_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} ++ ++PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); ++ ++#else ++ + static + void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) + { +@@ -592,6 +629,7 @@ NTL_EFF_BB_MUL_CODE0 + + } + ++#endif + + #ifdef NTL_GF2X_NOINLINE + +@@ -616,6 +654,51 @@ NTL_EFF_BB_MUL_CODE0 + #endif + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_BB_MUL_CODE1 ++} ++ ++PCLMUL_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long 
carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++AVX_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++PCLMUL_RESOLVER(static void,Mul1, ++ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); ++ ++#else ++ + static + void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) + { +@@ -643,6 +726,53 @@ NTL_EFF_BB_MUL_CODE1 + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_BB_MUL_CODE2 ++} ++ ++PCLMUL_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] ^= carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] ^= carry; ++} ++ ++AVX_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] ^= carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] ^= carry; ++} ++ ++PCLMUL_RESOLVER(static void,AddMul1, ++ (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)); ++ ++#else ++ + static + void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) + { +@@ -671,6 +801,52 @@ NTL_EFF_BB_MUL_CODE2 + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_SHORT_BB_MUL_CODE1 ++} ++ ++PCLMUL_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++AVX_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++PCLMUL_RESOLVER(static void,Mul1_short, ++ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); ++ ++#else + + static + void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +@@ -699,9 +875,29 @@ NTL_EFF_SHORT_BB_MUL_CODE1 + + } + ++#endif ++ ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ NTL_EFF_HALF_BB_MUL_CODE0 ++} ++ ++PCLMUL_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} + ++AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} + ++PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); + ++#else + + static + void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +@@ -716,6 +912,8 @@ NTL_EFF_HALF_BB_MUL_CODE0 + + } + ++#endif ++ + + // mul2...mul8 hard-code 2x2...8x8 word multiplies. + // I adapted these routines from LiDIA (except mul3, see below). 
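For orientation, the following is a minimal, self-contained sketch -- with illustrative names, not code taken from the patch -- of the dispatch pattern that the BASE_FUNC/PCLMUL_FUNC/PCLMUL_RESOLVER macros expand to: a portable baseline clone, a target("pclmul") clone, and a GCC ifunc resolver that queries CPUID once and binds whichever clone the running CPU supports.

// Hypothetical standalone example (not part of the patch).
#include <cpuid.h>        // __get_cpuid
#include <wmmintrin.h>    // _mm_clmulepi64_si128 (PCLMUL)

#ifndef bit_PCLMUL
#define bit_PCLMUL (1 << 1)
#endif

// Portable fallback: shift-and-xor carry-less multiply of two 64-bit words
// (x86_64 assumed, so unsigned long is 64 bits).
static void mul1_base(unsigned long *c, unsigned long a, unsigned long b)
{
   unsigned long lo = 0, hi = 0;
   for (int i = 0; i < 64; i++)
      if ((b >> i) & 1) {
         lo ^= a << i;
         if (i) hi ^= a >> (64 - i);
      }
   c[0] = lo; c[1] = hi;
}

// PCLMUL clone: the target attribute makes the intrinsic usable even though
// the file as a whole is compiled for a baseline x86_64 CPU.
static void __attribute__((target("pclmul")))
mul1_pclmul(unsigned long *c, unsigned long a, unsigned long b)
{
   __m128i aa = _mm_set_epi64x(0, (long long) a);
   __m128i bb = _mm_set_epi64x(0, (long long) b);
   _mm_storeu_si128((__m128i *) c, _mm_clmulepi64_si128(aa, bb, 0));
}

// The resolver runs once, when the dynamic linker binds mul1 (at load time
// with BIND_NOW, otherwise on the first call); no per-call branch remains.
extern "C" {
static void (*resolve_mul1(void))(unsigned long *, unsigned long, unsigned long)
{
   unsigned int eax, ebx, ecx, edx;
   if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_PCLMUL))
      return mul1_pclmul;
   return mul1_base;
}
}

void mul1(unsigned long *c, unsigned long a, unsigned long b)
   __attribute__((ifunc("resolve_mul1")));

The ifunc mechanism is a GNU extension for ELF targets, which is consistent with the DoConfig change above restricting NTL_LOADTIME_CPU to x86_64 and the ctools.h check requiring a sufficiently new GCC.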
+@@ -1627,6 +1825,77 @@ static const _ntl_ulong sqrtab[256] = { + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) { ++ _ntl_ulong *c = cp + (i << 1); ++ _ntl_ulong a = ap[i]; ++ _ntl_ulong hi, lo; ++ ++ NTL_BB_SQR_CODE ++ ++ c[0] = lo; ++ c[1] = hi; ++ } ++ ++ c.normalize(); ++ return; ++} ++ ++PCLMUL_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) ++ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); ++ ++ c.normalize(); ++ return; ++} ++ ++AVX_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) ++ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); ++ ++ c.normalize(); ++ return; ++} ++ ++PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a)); ++ ++#else ++ + static inline + void sqr1(_ntl_ulong *c, _ntl_ulong a) + { +@@ -1667,6 +1936,7 @@ void sqr(GF2X& c, const GF2X& a) + return; + } + ++#endif + + + void LeftShift(GF2X& c, const GF2X& a, long n) +--- src/InitSettings.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/InitSettings.c 2016-04-26 20:06:27.078557786 -0600 +@@ -150,6 +150,11 @@ int main() + cout << "NTL_RANGE_CHECK=0\n"; + #endif + ++#ifdef NTL_LOADTIME_CPU ++ cout << "NTL_LOADTIME_CPU=1\n"; ++#else ++ cout << "NTL_LOADTIME_CPU=0\n"; ++#endif + + // the following is synthetically defined + #ifdef NTL_LONGLONG_SP_MULMOD +--- src/lzz_pX1.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/lzz_pX1.c 2016-04-26 18:22:38.934719184 -0600 +@@ -4,6 +4,12 @@ + + #ifdef NTL_HAVE_AVX + #include ++#elif defined(NTL_LOADTIME_CPU) ++#include ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; + #endif + + +@@ -1076,6 +1082,175 @@ void Comp3Mod(zz_pX& x1, zz_pX& x2, zz_p + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,build) ++(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) ++{ ++ altH.orig = &H; ++ altH.mem.kill(); ++ altH.row.kill(); ++ ++ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } ++ ++ altH.n = F.n; ++ altH.m = H.H.length()-1; ++ ++ long p = zz_p::modulus(); ++ long n = altH.n; ++ long m = altH.m; ++ ++ ++ if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) && ++ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { ++ altH.strategy = 1; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else { ++ altH.strategy = 2; ++ altH.pinv_LL = make_sp_ll_reduce_struct(p); ++ } ++ ++ ++ if (altH.strategy == 1 || altH.strategy == 2) { ++ ++ altH.row.SetLength(n); ++ long **row = altH.row.elts(); ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + m - 1)/m; ++ long NumBlocks = (n + BlockSize - 1)/BlockSize; ++ ++ altH.mem.SetLength(NumBlocks); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(n, first + BlockSize); ++ altH.mem[i].SetLength((last-first)*m); ++ for (long j = first; j < last; j++) { ++ row[j] = altH.mem[i].elts() + (j-first)*m; ++ } ++ } ++ ++ for (long i = 0; i < 
m; i++) { ++ const zz_p* ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ row[j][i] = rep(ptr[j]); ++ for (long j = len; j < n; j++) ++ row[j][i] = 0; ++ } ++ } ++} ++ ++AVX_FUNC(void,build) ++(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) ++{ ++ altH.orig = &H; ++ altH.mem.kill(); ++ altH.row.kill(); ++ altH.dmem.kill(); ++ altH.drow.kill(); ++ ++ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } ++ ++ altH.n = F.n; ++ altH.m = H.H.length()-1; ++ ++ long p = zz_p::modulus(); ++ long n = altH.n; ++ long m = altH.m; ++ ++ if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) && ++ m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) { ++ altH.strategy = 3; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) && ++ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { ++ altH.strategy = 1; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else { ++ altH.strategy = 2; ++ altH.pinv_LL = make_sp_ll_reduce_struct(p); ++ } ++ ++ ++ if (altH.strategy == 1 || altH.strategy == 2) { ++ ++ altH.row.SetLength(n); ++ long **row = altH.row.elts(); ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + m - 1)/m; ++ long NumBlocks = (n + BlockSize - 1)/BlockSize; ++ ++ altH.mem.SetLength(NumBlocks); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(n, first + BlockSize); ++ altH.mem[i].SetLength((last-first)*m); ++ for (long j = first; j < last; j++) { ++ row[j] = altH.mem[i].elts() + (j-first)*m; ++ } ++ } ++ ++ for (long i = 0; i < m; i++) { ++ const zz_p* ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ row[j][i] = rep(ptr[j]); ++ for (long j = len; j < n; j++) ++ row[j][i] = 0; ++ } ++ } else { ++ ++ // sanity check ++ if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow"); ++ ++ long npanels = (n+15)/16; ++ long panel_size = 16*m; ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + panel_size - 1)/panel_size; ++ long NumBlocks = (npanels + BlockSize - 1)/BlockSize; ++ ++ altH.dmem.SetLength(NumBlocks); ++ altH.drow.SetLength(npanels); ++ double **drow = altH.drow.elts(); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(npanels, first + BlockSize); ++ altH.dmem[i].SetLength((last-first)*panel_size); ++ ++ double *ptr = altH.dmem[i].get(); ++ ++ for (long j = first; j < last; j++) ++ drow[j] = ptr + (j-first)*panel_size; ++ } ++ ++ for (long i = 0; i < m; i++) { ++ const zz_p *ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ drow[j/16][(i*16) + (j%16)] = rep(ptr[j]); ++ for (long j = len; j < npanels*16; j++) ++ drow[j/16][(i*16) + (j%16)] = 0; ++ } ++ } ++} ++ ++AVX_RESOLVER(void,build, ++ (zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)); ++ ++#else ++ + void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) + { + altH.orig = &H; +@@ -1194,11 +1369,82 @@ void build(zz_pXAltArgument& altH, const + #endif + } + ++#endif ++ + + #ifdef NTL_HAVE_LL_TYPE + ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,mul16rowsD) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ 
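++   // mul16rowsD computes x[0..15] = A*b for a 16 x n block A stored as
++   // consecutive 16-double columns: each iteration broadcasts b[i], loads
++   // the i-th column of A as four 4-double vectors, and accumulates into
++   // acc0..acc3. This is the plain-AVX clone; the FMA clone below is
++   // identical except that it fuses the multiply-add via _mm256_fmadd_pd.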
__m256d bvec; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); ++ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); ++ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); ++ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++} ++ ++FMA_FUNC(void,mul16rowsD) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d bvec; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); ++ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); ++ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); ++ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++} ++ ++FMA_RESOLVER(static void,mul16rowsD, ++ (double *x, const double *a, const double *b, long n)); ++ ++#elif defined(NTL_HAVE_AVX) + +-#ifdef NTL_HAVE_AVX + static + void mul16rowsD(double *x, const double *a, const double *b, long n) + { +@@ -1243,6 +1489,114 @@ void mul16rowsD(double *x, const double + _mm256_store_pd(x + 3*4, acc3); + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,mul16rows2D) ++(double *x, double *x_, const double *a, const double *b, const double *b_, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d acc0_ = _mm256_setzero_pd(); ++ __m256d acc1_ = _mm256_setzero_pd(); ++ __m256d acc2_ = _mm256_setzero_pd(); ++ __m256d acc3_ = _mm256_setzero_pd(); ++ ++ ++ __m256d bvec; ++ __m256d bvec_; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ bvec_ = _mm256_broadcast_sd(&b_[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); ++ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); ++ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); ++ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); ++ ++ acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_); ++ acc1_ = _mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_); ++ acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_); ++ acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ ++ _mm256_store_pd(x_ + 0*4, acc0_); ++ _mm256_store_pd(x_ + 1*4, acc1_); ++ _mm256_store_pd(x_ + 2*4, acc2_); ++ _mm256_store_pd(x_ + 3*4, acc3_); ++} ++ ++FMA_FUNC(void,mul16rows2D) ++(double *x, double *x_, const 
double *a, const double *b, const double *b_, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d acc0_ = _mm256_setzero_pd(); ++ __m256d acc1_ = _mm256_setzero_pd(); ++ __m256d acc2_ = _mm256_setzero_pd(); ++ __m256d acc3_ = _mm256_setzero_pd(); ++ ++ ++ __m256d bvec; ++ __m256d bvec_; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ bvec_ = _mm256_broadcast_sd(&b_[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); ++ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); ++ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); ++ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); ++ ++ acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_); ++ acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_); ++ acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_); ++ acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ ++ _mm256_store_pd(x_ + 0*4, acc0_); ++ _mm256_store_pd(x_ + 1*4, acc1_); ++ _mm256_store_pd(x_ + 2*4, acc2_); ++ _mm256_store_pd(x_ + 3*4, acc3_); ++} ++ ++FMA_RESOLVER(static void,mul16rows2D, ++ (double *x, double *x_, const double *a, const double *b, const double *b_, long n)); ++ ++#elif defined(NTL_HAVE_AVX) + static + void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n) + { +@@ -1309,6 +1663,7 @@ void mul16rows2D(double *x, double *x_, + _mm256_store_pd(x_ + 3*4, acc3_); + } + ++#endif + + #endif + +@@ -1422,7 +1777,7 @@ void CompMod_L(zz_pX& x, const zz_pX& g, + } + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + + static + void InnerProduct_AVX(zz_pX& x, const Vec& v, long low, long high, +@@ -1534,7 +1889,6 @@ void CompMod_AVX(zz_pX& x, const zz_pX& + + x = t; + } +-#endif + + + +@@ -1570,6 +1924,14 @@ void CompMod(zz_pX& x, const zz_pX& g, c + break; + + #endif ++#ifdef NTL_LOADTIME_CPU ++ case 3: ++ if (have_avx) { ++ CompMod_AVX(x, g, A, F); ++ break; ++ } ++ /* FALLTHRU */ ++#endif + + default: + LogicError("CompMod: bad strategy"); +--- src/mat_lzz_p.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/mat_lzz_p.c 2016-04-26 18:22:38.937718940 -0600 +@@ -10,6 +10,15 @@ + + #ifdef NTL_HAVE_AVX + #include ++#define AVX_ACTIVE 1 ++#elif defined(NTL_LOADTIME_CPU) ++#include ++#define AVX_ACTIVE have_avx ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; ++static int have_avx2 = -1; + #endif + + NTL_START_IMPL +@@ -632,7 +641,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A, + + #ifdef NTL_HAVE_LL_TYPE + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + + #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1) + // max int representable exactly as a double +@@ -646,19 +655,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A, + + + // MUL_ADD(a, b, c): a += b*c ++#define FMA_MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) ++#define AVX_MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) + #ifdef NTL_HAVE_FMA +-#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) ++#define MUL_ADD(a, b, c) FMA_MUL_ADD(a, b, c) + #else +-#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) ++#define 
MUL_ADD(a, b, c) AVX_MUL_ADD(a, b, c) + #endif + +-#if 0 +-static +-void muladd1_by_32(double *x, const double *a, const double *b, long n) ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd1_by_32) ++(double *x, const double *a, const double *b, long n) + { +- __m256d avec, bvec; ++ __m256d acc0=_mm256_load_pd(x + 0*4); ++ __m256d acc1=_mm256_load_pd(x + 1*4); ++ __m256d acc2=_mm256_load_pd(x + 2*4); ++ __m256d acc3=_mm256_load_pd(x + 3*4); ++ __m256d acc4=_mm256_load_pd(x + 4*4); ++ __m256d acc5=_mm256_load_pd(x + 5*4); ++ __m256d acc6=_mm256_load_pd(x + 6*4); ++ __m256d acc7=_mm256_load_pd(x + 7*4); ++ ++ long i = 0; ++ for (; i <= n-4; i +=4) { ++ ++ // the following code sequences are a bit faster than ++ // just doing 4 _mm256_broadcast_sd's ++ // it requires a to point to aligned storage, however ++ ++#if 1 ++ // this one seems slightly faster ++ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); ++ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); ++#else ++ __m256d avec = _mm256_load_pd(a); ++ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); ++ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); ++ ++#endif ++ ++ __m256d avec0 = _mm256_permute_pd(a0101, 0); ++ __m256d avec1 = _mm256_permute_pd(a0101, 0xf); ++ __m256d avec2 = _mm256_permute_pd(a2323, 0); ++ __m256d avec3 = _mm256_permute_pd(a2323, 0xf); ++ ++ a += 4; ++ ++ __m256d bvec; + ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec2, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec3, bvec); ++ 
bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec3, bvec); ++ } ++ ++ for (; i < n; i++) { ++ __m256d avec = _mm256_broadcast_sd(a); a++; ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ _mm256_store_pd(x + 4*4, acc4); ++ _mm256_store_pd(x + 5*4, acc5); ++ _mm256_store_pd(x + 6*4, acc6); ++ _mm256_store_pd(x + 7*4, acc7); ++} + ++FMA_FUNC(void,muladd1_by_32) ++(double *x, const double *a, const double *b, long n) ++{ + __m256d acc0=_mm256_load_pd(x + 0*4); + __m256d acc1=_mm256_load_pd(x + 1*4); + __m256d acc2=_mm256_load_pd(x + 2*4); +@@ -668,19 +778,82 @@ void muladd1_by_32(double *x, const doub + __m256d acc6=_mm256_load_pd(x + 6*4); + __m256d acc7=_mm256_load_pd(x + 7*4); + ++ long i = 0; ++ for (; i <= n-4; i +=4) { + +- for (long i = 0; i < n; i++) { +- avec = _mm256_broadcast_sd(a); a++; ++ // the following code sequences are a bit faster than ++ // just doing 4 _mm256_broadcast_sd's ++ // it requires a to point to aligned storage, however ++ ++#if 1 ++ // this one seems slightly faster ++ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); ++ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); ++#else ++ __m256d avec = _mm256_load_pd(a); ++ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); ++ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); + ++#endif + +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec); ++ __m256d avec0 = _mm256_permute_pd(a0101, 0); ++ __m256d avec1 = _mm256_permute_pd(a0101, 0xf); ++ __m256d avec2 = _mm256_permute_pd(a2323, 0); ++ __m256d avec3 = _mm256_permute_pd(a2323, 0xf); ++ ++ a += 4; ++ ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec0, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, 
avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec1, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec2, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec3, bvec); ++ } ++ ++ for (; i < n; i++) { ++ __m256d avec = _mm256_broadcast_sd(a); a++; ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec, bvec); + } + + +@@ -694,6 +867,9 @@ void muladd1_by_32(double *x, const doub + _mm256_store_pd(x + 7*4, acc7); + } + ++FMA_RESOLVER(static void,muladd1_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ + #else + + static +@@ -800,7 +976,164 @@ void muladd1_by_32(double *x, const doub + #endif + + // experiment: process two rows at a time +-#if 1 ++#ifdef NTL_LOADTIME_CPU ++AVX_FUNC(void,muladd2_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ ++ ++ // round 0 ++ ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); ++ bvec = 
_mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++} ++ ++FMA_FUNC(void,muladd2_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ ++ ++ // round 0 ++ ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x 
+ 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++} ++ ++FMA_RESOLVER(static void,muladd2_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ ++#else ++ + static + void muladd2_by_32(double *x, const double *a, const double *b, long n) + { +@@ -876,96 +1209,217 @@ void muladd2_by_32(double *x, const doub + _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); + + } ++#endif + +-#else + +-static +-void muladd2_by_32(double *x, const double *a, const double *b, long n) ++ ++// experiment: process three rows at a time ++// NOTE: this makes things slower on an AVX1 platform --- not enough registers ++// it could be faster on AVX2/FMA, where there should be enough registers ++ ++#ifdef NTL_LOADTIME_CPU ++FMA_FUNC(void,muladd3_by_32) ++(double *x, const double *a, const double *b, long n) + { +- long i, j; +- __m256d bvec; ++ __m256d avec0, avec1, avec2, bvec; + __m256d acc00, acc01, acc02, acc03; + __m256d acc10, acc11, acc12, acc13; ++ __m256d acc20, acc21, acc22, acc23; ++ + ++ // round 0 + +- for (j = 0; j < 2; j++) { ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + +- acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); + +- acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); ++ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); ++ 
acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ); + +- for (i = 0; i <= n-4; i+=4) { +- __m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0)); +- __m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2)); +- __m256d avec00 = _mm256_permute_pd(a0_0101, 0); +- __m256d avec01 = _mm256_permute_pd(a0_0101, 0xf); +- __m256d avec02 = _mm256_permute_pd(a0_2323, 0); +- __m256d avec03 = _mm256_permute_pd(a0_2323, 0xf); ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + +- __m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ)); +- __m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ)); +- __m256d avec10 = _mm256_permute_pd(a1_0101, 0); +- __m256d avec11 = _mm256_permute_pd(a1_0101, 0xf); +- __m256d avec12 = _mm256_permute_pd(a1_2323, 0); +- __m256d avec13 = _mm256_permute_pd(a1_2323, 0xf); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } + +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec); + +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec); ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); + +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec); ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 
1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); + +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec); +- } ++ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); + +- for (; i < n; i++) { +- __m256d avec0 = _mm256_broadcast_sd(&a[i]); +- __m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ // round 1 + +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); +- } ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); + +- _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00); +- _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01); +- _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02); +- _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03); ++ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); + +- _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10); +- _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11); +- _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12); +- _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13); ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); + } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 
+ 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); ++ + } +-#endif + ++AVX2_FUNC(void,muladd3_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ __m256d acc20, acc21, acc22, acc23; ++ ++ ++ // round 0 + ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + +-// experiment: process three rows at a time +-// NOTE: this makes things slower on an AVX1 platform --- not enough registers +-// it could be faster on AVX2/FMA, where there should be enough registers ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); ++ 
acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); ++ ++} ++ ++AVX2_RESOLVER(static void,muladd3_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ ++#else + + static + void muladd3_by_32(double *x, const double *a, const double *b, long n) +@@ -1066,6 +1520,32 @@ void muladd3_by_32(double *x, const doub + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++static inline ++void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) ++{ ++ long i = first; ++ ++ if (have_fma) { ++ // processing three rows at a time is faster ++ for (; i <= last-3; i+=3) ++ muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ for (; i < last; i++) ++ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ } else { ++ // process only two rows at a time: not enough registers :-( ++ for (; i <= last-2; i+=2) ++ muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ for (; i < last; i++) ++ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ } ++} ++ ++#else ++ + static inline + void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) + { +@@ -1085,8 +1565,79 @@ void muladd_all_by_32(long first, long l + #endif + } + ++#endif ++ + + // this assumes n is a multiple of 16 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ ++ __m256d cvec = _mm256_broadcast_sd(&c); ++ ++ for (long i = 0; i < n; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ AVX_MUL_ADD(xvec1, yvec1, cvec); ++ AVX_MUL_ADD(xvec2, yvec2, cvec); ++ AVX_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ 
_mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++} ++ ++FMA_FUNC(void,muladd_interval) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ ++ __m256d cvec = _mm256_broadcast_sd(&c); ++ ++ for (long i = 0; i < n; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ FMA_MUL_ADD(xvec1, yvec1, cvec); ++ FMA_MUL_ADD(xvec2, yvec2, cvec); ++ FMA_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++} ++ ++FMA_RESOLVER(static void,muladd_interval, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1117,9 +1668,109 @@ void muladd_interval(double * NTL_RESTRI + _mm256_store_pd(x + 3*4, xvec3); + } + } ++#endif + + // this one is more general: does not assume that n is a + // multiple of 16 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval1) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ __m256d cvec; ++ ++ if (n >= 4) ++ cvec = _mm256_broadcast_sd(&c); ++ ++ long i=0; ++ for (; i <= n-16; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ AVX_MUL_ADD(xvec1, yvec1, cvec); ++ AVX_MUL_ADD(xvec2, yvec2, cvec); ++ AVX_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++ ++ for (; i <= n-4; i += 4, x += 4, y += 4) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ yvec0 = _mm256_load_pd(y+0*4); ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ _mm256_store_pd(x + 0*4, xvec0); ++ } ++ ++ for (; i < n; i++, x++, y++) { ++ *x += (*y)*c; ++ } ++} ++ ++FMA_FUNC(void,muladd_interval1) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ __m256d cvec; ++ ++ if (n >= 4) ++ cvec = _mm256_broadcast_sd(&c); ++ ++ long i=0; ++ for (; i <= n-16; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ FMA_MUL_ADD(xvec1, yvec1, cvec); ++ FMA_MUL_ADD(xvec2, yvec2, cvec); ++ FMA_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++ ++ for (; i <= n-4; i += 4, x += 4, y += 4) { ++ xvec0 = 
_mm256_load_pd(x+0*4); ++ yvec0 = _mm256_load_pd(y+0*4); ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ _mm256_store_pd(x + 0*4, xvec0); ++ } ++ ++ for (; i < n; i++, x++, y++) { ++ *x += (*y)*c; ++ } ++} ++ ++FMA_RESOLVER(static void,muladd_interval1, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1165,11 +1816,74 @@ void muladd_interval1(double * NTL_RESTR + *x += (*y)*c; + } + } ++#endif + + #define AVX_PD_SZ (4) + + // experimental: assumes n is a multiple of 4 in the range [0..32] +-#if 1 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval2) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ n /= 4; ++ if (n <= 0 || n > 8) return; ++ ++ x += n*4; ++ y += n*4; ++ ++ // n in [1..8] ++ ++ __m256d xvec, yvec, cvec; ++ ++ cvec = _mm256_broadcast_sd(&c); ++ ++ switch (n) { ++ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); ++ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); ++ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); ++ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); ++ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); ++ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); ++ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); ++ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); ++ } ++ ++} ++ ++FMA_FUNC(void,muladd_interval2) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ n /= 4; ++ if (n <= 0 || n > 8) return; ++ ++ x += n*4; ++ y += n*4; ++ ++ // n in [1..8] ++ ++ __m256d xvec, yvec, cvec; ++ ++ cvec = _mm256_broadcast_sd(&c); ++ ++ switch (n) { ++ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); ++ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); ++ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); ++ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); ++ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); ++ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); ++ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); ++ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); ++ } ++ ++} ++ ++FMA_RESOLVER(static void,muladd_interval2, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + 
void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1197,13 +1911,6 @@ void muladd_interval2(double * NTL_RESTR + } + + } +-#else +-static inline +-void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +-{ +- for (long i = 0; i < n; i++) +- x[i] += y[i]*c; +-} + #endif + + #endif +@@ -2245,10 +2952,10 @@ void alt_mul_LL(const mat_window_zz_p& X + } + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_mul_DD(const mat_window_zz_p& X, ++static void __attribute__((target ("avx,pclmul"))) ++blk_mul_DD(const mat_window_zz_p& X, + const const_mat_window_zz_p& A, const const_mat_window_zz_p& B) + { + long n = A.NumRows(); +@@ -2615,8 +3322,9 @@ void mul_base (const mat_window_zz_p& X, + + long V = MAT_BLK_SZ*4; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined (NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -2696,7 +3404,8 @@ void mul_strassen(const mat_window_zz_p& + // this code determines if mul_base triggers blk_mul_DD, + // in which case a higher crossover is used + +-#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX)) ++#if (defined(NTL_HAVE_LL_TYPE) && (defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU))) ++ if (AVX_ACTIVE) + { + long V = MAT_BLK_SZ*4; + long p = zz_p::modulus(); +@@ -3196,10 +3905,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) ++static void __attribute__((target ("avx,pclmul"))) ++alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) + { + long n = A.NumRows(); + +@@ -3365,10 +4074,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) ++static void __attribute__((target ("avx,pclmul"))) ++blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) + { + long n = A.NumRows(); + +@@ -4126,8 +4835,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c + else if (n/MAT_BLK_SZ < 4) { + long V = 64; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -4152,8 +4862,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c + else { + long V = 4*MAT_BLK_SZ; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -4559,10 +5270,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p& + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, ++static void __attribute__((target ("avx,pclmul"))) ++alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, + vec_zz_p *xp, bool trans, bool relax) + { + long n = A.NumRows(); +@@ -4749,10 +5460,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p& + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_tri_DD(zz_p& d, const mat_zz_p& A, const 
vec_zz_p *bp,
++static void __attribute__((target ("avx,pclmul")))
++blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
+ vec_zz_p *xp, bool trans, bool relax)
+ {
+ long n = A.NumRows();
+@@ -5563,8 +6274,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+ else if (n/MAT_BLK_SZ < 4) {
+ long V = 64;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+@@ -5589,8 +6301,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+ else {
+ long V = 4*MAT_BLK_SZ;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+@@ -5836,7 +6549,7 @@ long elim_basic(const mat_zz_p& A, mat_z
+ #ifdef NTL_HAVE_LL_TYPE
+
+
+-#ifdef NTL_HAVE_AVX
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+
+
+ static inline
+@@ -7289,8 +8002,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
+ else {
+ long V = 4*MAT_BLK_SZ;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+--- src/QuickTest.c.orig	2016-04-26 05:40:15.000000000 -0600
++++ src/QuickTest.c	2016-04-26 18:22:38.938718859 -0600
+@@ -335,6 +335,9 @@ cerr << "Performance Options:\n";
+ cerr << "NTL_PCLMUL\n";
+ #endif
+
++#ifdef NTL_LOADTIME_CPU
++ cerr << "NTL_LOADTIME_CPU\n";
++#endif
+
+ cerr << "\n\n";
+
+--- src/WizardAux.orig	2016-04-26 05:40:15.000000000 -0600
++++ src/WizardAux	2016-04-26 18:22:38.938718859 -0600
+@@ -94,6 +94,7 @@ system("make InitSettings");
+ 'NTL_PCLMUL' => 0,
+ 'NTL_FFT_BIGTAB' => 0,
+ 'NTL_FFT_LAZYMUL' => 0,
++'NTL_LOADTIME_CPU' => 0,
+
+ 'WIZARD_HACK' => '#define NTL_WIZARD_HACK',
+
diff --git a/ntl.spec b/ntl.spec
index 2a8f5fc..89f1902 100644
--- a/ntl.spec
+++ b/ntl.spec
@@ -10,15 +10,18 @@ Summary: High-performance algorithms for vectors, matrices, and polynomials
 Name: ntl
-Version: 9.7.0
+Version: 9.8.0
 Release: 1%{?dist}
 License: GPLv2+
 URL: http://shoup.net/ntl/
-Group: System Environment/Libraries
 Source0: http://shoup.net/ntl/%{name}-%{version}.tar.gz
 Source1: multilib_template.h
+# Detect CPU at load time, optionally use PCLMUL, AVX, FMA, and AVX2 features.
+# This patch was sent upstream, but upstream prefers that the entire library
+# be built for a specific CPU, which we cannot do in Fedora.
+Patch0: %{name}-loadtime-cpu.patch
 BuildRequires: gcc-c++
 %if 0%{?gf2x}
@@ -47,14 +50,12 @@ NTL provides high quality implementations of state-of-the-art algorithms for:
 %package devel
 Summary: Development files for %{name}
-Group: Development/Libraries
 Requires: %{name}%{?_isa} = %{version}-%{release}
 %description devel
 %{summary}.
 %package static
 Summary: Static libraries for %{name}
-Group: Development/Libraries
 Requires: %{name}-devel%{?_isa} = %{version}-%{release}
 #Requires: gmp-devel
 %description static
@@ -62,7 +63,8 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release}
 %prep
-%setup -q
+%setup -q
+%patch0
 %build
@@ -78,6 +80,11 @@ pushd src
 NATIVE=off \
 %{?gf2x:NTL_GF2X_LIB=on} \
 NTL_PCLMUL=off \
+ NTL_DISABLE_TLS_HACK=on \
+%ifarch x86_64
+ NTL_LOADTIME_CPU=on \
+ WIZARD=off \
+%endif
 SHARED=on
 popd
@@ -86,8 +93,10 @@ make -C src V=1
 %check
-# skip by default, takes a *long, long, long* (days?) time -- Rex
-%{?_with_check:make -C src check}
+# skip on non-x86_64, takes a *long, long, long* (days?) time -- Rex
+%ifarch x86_64
+make -C src check
+%endif
 %install
@@ -127,7 +136,7 @@ done
 %files
 %doc README
 %license doc/copying.txt
-%{_libdir}/libntl.so.22*
+%{_libdir}/libntl.so.24*
 %files devel
 %doc doc/*
@@ -141,6 +150,11 @@ done
 %changelog
+* Fri Apr 29 2016 Jerry James - 9.8.0-1
+- ntl-9.8.0
+- Add -loadtime-cpu patch
+- Enable the check script on x86_64
+
 * Sat Mar 19 2016 Jerry James - 9.7.0-1
 - ntl-9.7.0
diff --git a/sources b/sources
index c422b75..d04a3a8 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-02f6076903f482d4bf15f62fb7ee260f ntl-9.7.0.tar.gz
+a7e87d859511c15023169fa0fcf9903b ntl-9.8.0.tar.gz
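
The AVX2_RESOLVER and FMA_RESOLVER declarations above all expand to the same GCC ifunc idiom, which is easy to lose inside the macros. The stand-alone sketch below shows that idiom in miniature. It is illustrative only and not NTL code: the names hello, hello_avx, and resolve_hello are hypothetical, and it assumes x86_64 with GCC 4.8 or later and an ifunc-capable toolchain. The CPUID test is the same feature-bit check the patch's resolvers perform.

// Illustrative sketch only; hypothetical names, not part of the patch.
#include <cpuid.h>
#include <cstdio>

#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif

static int have_avx = -1;   // -1 = not probed yet

static void hello_base(void) { std::puts("generic implementation"); }
static void __attribute__((target ("avx"))) hello_avx(void)
{ std::puts("AVX implementation"); }

extern "C" {
   // The resolver runs once, when the dynamic linker binds the ifunc symbol.
   static void (*resolve_hello(void))(void)
   {
      if (have_avx < 0) {
         unsigned int eax, ebx, ecx, edx;
         have_avx = __get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_AVX);
      }
      return have_avx ? &hello_avx : &hello_base;
   }
}

// Callers see a single symbol; the CPU-specific body is chosen at load time.
void __attribute__((ifunc ("resolve_hello"))) hello(void);

int main() { hello(); return 0; }

The real resolvers extend this two-way choice to the PCLMUL, AVX, FMA, and AVX2 ladder and cache the CPUID bits in have_fma and its sibling flags, so the probe is performed only once even though many symbols are resolved this way.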