--- doc/config.txt.orig	2016-11-18 11:39:17.000000000 -0700
+++ doc/config.txt	2017-01-12 15:07:36.859377026 -0700
@@ -300,6 +300,7 @@ NTL_AVOID_BRANCHING=off
 NTL_GF2X_NOINLINE=off
 NTL_GF2X_ALTCODE=off
 NTL_GF2X_ALTCODE1=off
+NTL_LOADTIME_CPU=off
 
 GMP_INCDIR=$(GMP_PREFIX)/include
 GMP_LIBDIR=$(GMP_PREFIX)/lib
@@ -597,6 +598,10 @@ NTL_GF2X_ALTCODE1=off
 
 # Yet another alternative implementation for GF2X multiplication.
 
+NTL_LOADTIME_CPU=off
+
+# Switch to detect CPU characteristics at load time and use routines
+# optimized for the CPU that is actually executing the code.
 
 
 ########## More GMP Options:
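
NTL_LOADTIME_CPU keeps a generic baseline build and selects PCLMUL/AVX/FMA/
AVX2 code paths when the library is loaded, based on what the executing CPU
reports through CPUID.  Like the other performance flags it is normally set
on the configure command line, e.g. "./configure NTL_LOADTIME_CPU=on".  The
stand-alone sketch below (illustrative only, not taken from the patch) shows
the run-time detection idea; the same leaf-1 bits are tested by the resolvers
added to ctools.h further down.

   // Query CPUID leaf 1 on the running CPU and report whether AVX is
   // available.  x86_64 with GCC or Clang assumed.
   #include <cpuid.h>
   #include <cstdio>

   static bool cpu_has_avx()
   {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
      return (ecx & (1u << 28)) != 0;   // ECX bit 28 = AVX
   }

   int main()
   {
      std::printf("AVX at run time: %s\n", cpu_has_avx() ? "yes" : "no");
      return 0;
   }
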
--- include/NTL/config.h.orig	2016-11-18 11:39:17.000000000 -0700
+++ include/NTL/config.h	2017-01-12 15:07:36.860377023 -0700
@@ -475,6 +475,20 @@ using the configure script.
 #endif
 
 
+#if 0
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time.  This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ */
+
+#endif
+
 
 
 
--- include/NTL/ctools.h.orig	2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/ctools.h	2017-01-12 15:07:36.861377020 -0700
@@ -447,6 +447,136 @@ char *_ntl_make_aligned(char *p, long al
 // and it should also be as big as a cache line
 
 
+/* Determine CPU characteristics at runtime */
+#ifdef NTL_LOADTIME_CPU
+#if !defined(__x86_64__)
+#error Runtime CPU support is only available on x86_64.
+#endif
+#ifndef __GNUC__
+#error Runtime CPU support is only available with GCC.
+#endif
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#error Runtime CPU support is only available with GCC 4.6 or later.
+#endif
+
+#include <cpuid.h>
+#ifndef bit_PCLMUL
+#define bit_PCLMUL	(1 << 1)
+#endif
+#ifndef bit_AVX
+#define bit_AVX		(1 << 28)
+#endif
+#ifndef bit_FMA
+#define bit_FMA		(1 << 12)
+#endif
+#ifndef bit_AVX2
+#define bit_AVX2	(1 << 5)
+#endif
+
+#define BASE_FUNC(type,name) static type name##_base
+#define TARGET_FUNC(arch,suffix,type,name)                              \
+   static type __attribute__((target (arch))) name##_##suffix
+#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name)
+#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name)
+#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name)
+#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name)
+#define PCLMUL_RESOLVER(type,name,params)                               \
+   extern "C" {                                                         \
+      static void __attribute__((optimize ("O0")))                      \
+         (*resolve_##name (void))(void) {                               \
+         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
+            unsigned int eax, ebx, ecx, edx;                            \
+            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
+               have_pclmul = ((ecx & bit_PCLMUL) != 0);                 \
+               have_avx = ((ecx & bit_AVX) != 0);                       \
+               have_fma = ((ecx & bit_FMA) != 0);                       \
+            } else {                                                    \
+               have_pclmul = 0;                                         \
+               have_avx = 0;                                            \
+               have_fma = 0;                                            \
+            }                                                           \
+         }                                                              \
+         if (have_avx) return (void (*)(void))&name##_avx;              \
+         if (have_pclmul) return (void (*)(void))&name##_pclmul;        \
+         return (void (*)(void))&name##_base;                           \
+      }                                                                 \
+   }                                                                    \
+   type __attribute__((ifunc ("resolve_" #name))) name params
+#define AVX_RESOLVER(type,name,params)                                  \
+   extern "C" {                                                         \
+      static void __attribute__((optimize ("O0")))                      \
+         (*resolve_##name (void))(void) {                               \
+         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
+            unsigned int eax, ebx, ecx, edx;                            \
+            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
+               have_pclmul = ((ecx & bit_PCLMUL) != 0);                 \
+               have_avx = ((ecx & bit_AVX) != 0);                       \
+               have_fma = ((ecx & bit_FMA) != 0);                       \
+            } else {                                                    \
+               have_pclmul = 0;                                         \
+               have_avx = 0;                                            \
+               have_fma = 0;                                            \
+            }                                                           \
+         }                                                              \
+         return have_avx                                                \
+               ? (void (*)(void))&name##_avx                            \
+               : (void (*)(void))&name##_base;                          \
+      }                                                                 \
+   }                                                                    \
+   type __attribute__((ifunc ("resolve_" #name))) name params
+#define FMA_RESOLVER(type,name,params)                                  \
+   extern "C" {                                                         \
+      static void __attribute__((optimize ("O0")))                      \
+         (*resolve_##name (void))(void) {                               \
+         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
+            unsigned int eax, ebx, ecx, edx;                            \
+            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
+               have_pclmul = ((ecx & bit_PCLMUL) != 0);                 \
+               have_avx = ((ecx & bit_AVX) != 0);                       \
+               have_fma = ((ecx & bit_FMA) != 0);                       \
+            } else {                                                    \
+               have_pclmul = 0;                                         \
+               have_avx = 0;                                            \
+               have_fma = 0;                                            \
+            }                                                           \
+         }                                                              \
+         return have_fma                                                \
+               ? (void (*)(void))&name##_fma                            \
+               : (void (*)(void))&name##_avx;                           \
+      }                                                                 \
+   }                                                                    \
+   type __attribute__((ifunc ("resolve_" #name))) name params
+#define AVX2_RESOLVER(type,name,params)                                 \
+   extern "C" {                                                         \
+      static void __attribute__((optimize ("O0")))                      \
+         (*resolve_##name (void))(void) {                               \
+         if (__builtin_expect(have_avx2, 0) < 0) {                      \
+            unsigned int eax, ebx, ecx, edx;                            \
+            if (__get_cpuid_max(0, 0) >= 7) {                           \
+               /* leaf 7 needs subleaf 0 in ECX for the AVX2 bit */     \
+               __cpuid_count(7, 0, eax, ebx, ecx, edx);                 \
+               have_avx2 = ((ebx & bit_AVX2) != 0);                     \
+            } else have_avx2 = 0;                                       \
+         }                                                              \
+         if (__builtin_expect(have_pclmul, 0) < 0) {                    \
+            unsigned int eax, ebx, ecx, edx;                            \
+            if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {               \
+               have_pclmul = ((ecx & bit_PCLMUL) != 0);                 \
+               have_avx = ((ecx & bit_AVX) != 0);                       \
+               have_fma = ((ecx & bit_FMA) != 0);                       \
+            } else {                                                    \
+               have_pclmul = 0;                                         \
+               have_avx = 0;                                            \
+               have_fma = 0;                                            \
+            }                                                           \
+         }                                                              \
+         return have_avx2                                               \
+               ? (void (*)(void))&name##_avx2                           \
+               : (void (*)(void))&name##_fma;                           \
+      }                                                                 \
+   }                                                                    \
+   type __attribute__((ifunc ("resolve_" #name))) name params
+#endif
 
 #ifdef NTL_HAVE_BUILTIN_CLZL
 
--- include/NTL/def_config.h.orig	2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/def_config.h	2017-01-12 15:07:36.861377020 -0700
@@ -475,6 +475,19 @@ using the configure script.
 #endif
 
 
+#if 0
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time.  This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ */
+
+#endif
 
 
 
--- include/NTL/MatPrime.h.orig	2016-11-18 11:39:16.000000000 -0700
+++ include/NTL/MatPrime.h	2017-01-12 16:15:17.307205250 -0700
@@ -20,7 +20,7 @@ NTL_OPEN_NNS
 
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 #define NTL_MatPrime_NBITS (23)
 #else
 #define NTL_MatPrime_NBITS NTL_SP_NBITS
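
The resolver macros added to ctools.h above are thin wrappers around GCC's
"target" and "ifunc" function attributes: each routine gets a baseline body
plus one body per instruction-set level, and a resolver that runs once at
load time to pick the address the dynamic linker binds the public symbol to.
A minimal stand-alone sketch of the same pattern, with hypothetical names
(assumes x86_64, ELF, and GCC 4.6 or later):

   #include <cpuid.h>

   // Baseline and AVX-targeted variants of the same routine.
   static double sum4_base(const double *v)
   { return v[0] + v[1] + v[2] + v[3]; }

   static double __attribute__((target("avx"))) sum4_avx(const double *v)
   { return v[0] + v[1] + v[2] + v[3]; }

   // The resolver runs when the object is loaded and returns the address
   // that the symbol "sum4" should be bound to.
   extern "C" {
      static double (*resolve_sum4(void))(const double *)
      {
         unsigned int eax, ebx, ecx, edx;
         if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & (1u << 28)))
            return &sum4_avx;                    // ECX bit 28 = AVX
         return &sum4_base;
      }
   }

   // Callers see a single symbol; the selection above is invisible to them.
   double sum4(const double *v) __attribute__((ifunc("resolve_sum4")));
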
--- src/cfile.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/cfile	2017-01-12 15:07:36.862377017 -0700
@@ -475,6 +475,20 @@ using the configure script.
 #endif
 
 
+#if @{NTL_LOADTIME_CPU}
+#define NTL_LOADTIME_CPU
+
+/*
+ * With this flag enabled, detect advanced CPU features at load time instead
+ * of at compile time.  This flag is intended for distributions, so that they
+ * can compile for the lowest common denominator CPU, but still support newer
+ * CPUs.
+ *
+ * This flag is useful only on x86_64 platforms with gcc 4.8 or later.
+ */
+
+#endif
+
 
 @{WIZARD_HACK}
 
--- src/DispSettings.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/DispSettings.cpp	2017-01-12 15:07:36.863377014 -0700
@@ -164,6 +164,10 @@ cout << "Performance Options:\n";
    cout << "NTL_GF2X_NOINLINE\n";
 #endif
 
+#ifdef NTL_LOADTIME_CPU
+   cout << "NTL_LOADTIME_CPU\n";
+#endif
+
 
    cout << "***************************/\n";
    cout << "\n\n";
--- src/DoConfig.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/DoConfig	2017-01-12 15:07:36.864377011 -0700
@@ -1,7 +1,7 @@
 # This is a perl script, invoked from a shell
 
 # use warnings;  # this doesn't work on older versions of perl
-
+use Config;
 
 
 %MakeFlag = (
@@ -82,6 +82,7 @@
 'NTL_GF2X_NOINLINE'       => 'off',
 'NTL_GF2X_ALTCODE'        => 'off',
 'NTL_GF2X_ALTCODE1'       => 'off',
+'NTL_LOADTIME_CPU'        => 'off',
 
 
 );
@@ -191,6 +192,15 @@ if ($ConfigFlag{'NTL_THREAD_BOOST'} eq '
 }
 
 
+# special processing: NTL_LOADTIME_CPU is x86_64-only and implies NTL_GF2X_NOINLINE
+
+if ($ConfigFlag{'NTL_LOADTIME_CPU'} eq 'on') {
+   if ($Config{archname} !~ /x86_64/) {
+      die "Error: NTL_LOADTIME_CPU currently only available with x86_64...sorry\n";
+   }
+   $ConfigFlag{'NTL_GF2X_NOINLINE'} = 'on';
+}
+
 
 # some special MakeVal values that are determined by SHARED
 
--- src/GF2X1.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X1.cpp	2017-01-12 15:07:36.866377005 -0700
@@ -19,7 +19,7 @@
 // simple scaling factor for some crossover points:
 // we use a lower crossover of the underlying multiplication
 // is faster  
-#if (defined(NTL_GF2X_LIB) || defined(NTL_HAVE_PCLMUL))
+#if (defined(NTL_GF2X_LIB) || defined(NTL_HAVE_PCLMUL) || defined(NTL_LOADTIME_CPU))
 #define XOVER_SCALE (1L)
 #else
 #define XOVER_SCALE (2L)
--- src/GF2X.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/GF2X.cpp	2017-01-12 15:07:36.867377002 -0700
@@ -28,6 +28,22 @@ pclmul_mul1 (unsigned long *c, unsigned
    _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0));
 }
 
+#elif defined (NTL_LOADTIME_CPU)
+
+#include <wmmintrin.h>
+
+static int have_pclmul = -1;
+static int have_avx = -1;
+static int have_fma = -1;
+
+#define NTL_INLINE inline
+
+#define pclmul_mul1(c,a,b) do {                                          \
+   __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \
+   __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \
+   _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0));     \
+} while (0)
+
 #else
 
 
@@ -576,6 +592,27 @@ void add(GF2X& x, const GF2X& a, const G
 
 
 
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   NTL_EFF_BB_MUL_CODE0
+}
+
+PCLMUL_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   pclmul_mul1(c, a, b);
+}
+
+AVX_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   pclmul_mul1(c, a, b);
+}
+
+PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
+
+#else
+
 static NTL_INLINE
 void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
 {
@@ -588,6 +625,7 @@ NTL_EFF_BB_MUL_CODE0
 
 }
 
+#endif
 
 #ifdef NTL_GF2X_NOINLINE
 
@@ -612,6 +650,51 @@ NTL_EFF_BB_MUL_CODE0
 #endif
 
 
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,Mul1)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   NTL_EFF_BB_MUL_CODE1
+}
+
+PCLMUL_FUNC(void,Mul1)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] = carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] = carry;
+}
+
+AVX_FUNC(void,Mul1)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] = carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] = carry;
+}
+
+PCLMUL_RESOLVER(static void,Mul1,
+    (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
+
+#else
+
 static 
 void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
 {
@@ -639,6 +722,53 @@ NTL_EFF_BB_MUL_CODE1
 
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,AddMul1)
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
+{
+   NTL_EFF_BB_MUL_CODE2
+}
+
+PCLMUL_FUNC(void,AddMul1)
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] ^= carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] ^= carry;
+}
+
+AVX_FUNC(void,AddMul1)
+(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] ^= carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] ^= carry;
+}
+
+PCLMUL_RESOLVER(static void,AddMul1,
+    (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a));
+
+#else
+
 static 
 void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)
 {
@@ -667,6 +797,52 @@ NTL_EFF_BB_MUL_CODE2
 
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,Mul1_short)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   NTL_EFF_SHORT_BB_MUL_CODE1
+}
+
+PCLMUL_FUNC(void,Mul1_short)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] = carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] = carry;
+}
+
+AVX_FUNC(void,Mul1_short)
+(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
+{
+   long i;
+   unsigned long carry, prod[2];
+
+   carry = 0;
+   for (i = 0; i < sb; i++) {
+      pclmul_mul1(prod, bp[i], a);
+      cp[i] = carry ^ prod[0];
+      carry = prod[1];
+   }
+
+   cp[sb] = carry;
+}
+
+PCLMUL_RESOLVER(static void,Mul1_short,
+    (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a));
+
+#else
 
 static 
 void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)
@@ -695,10 +871,31 @@ NTL_EFF_SHORT_BB_MUL_CODE1
 
 }
 
+#endif
 
 
 
+#ifdef NTL_LOADTIME_CPU
 
+BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   NTL_EFF_HALF_BB_MUL_CODE0
+}
+
+PCLMUL_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   pclmul_mul1(c, a, b);
+}
+ 
+AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
+{
+   pclmul_mul1(c, a, b);
+}
+ 
+PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b));
+ 
+#else
+ 
 static 
 void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)
 {
@@ -712,6 +909,7 @@ NTL_EFF_HALF_BB_MUL_CODE0
 
 }
 
+#endif
 
 // mul2...mul8 hard-code 2x2...8x8 word multiplies.
 // I adapted these routines from LiDIA (except mul3, see below).
@@ -1623,6 +1821,77 @@ static const _ntl_ulong sqrtab[256] = {
 
 
 
+#ifdef NTL_LOADTIME_CPU
+
+BASE_FUNC(void,sqr)(GF2X& c, const GF2X& a)
+{
+   long sa = a.xrep.length();
+   if (sa <= 0) {
+      clear(c);
+      return;
+   }
+
+   c.xrep.SetLength(sa << 1);
+   _ntl_ulong *cp = c.xrep.elts();
+   const _ntl_ulong *ap = a.xrep.elts();
+
+   for (long i = sa-1; i >= 0; i--) {
+      _ntl_ulong *c = cp + (i << 1);
+      _ntl_ulong a = ap[i];
+      _ntl_ulong hi, lo;
+
+      NTL_BB_SQR_CODE
+
+      c[0] = lo;
+      c[1] = hi;
+   }
+
+   c.normalize();
+   return;
+}
+
+PCLMUL_FUNC(void,sqr)(GF2X& c, const GF2X& a)
+{
+   long sa = a.xrep.length();
+   if (sa <= 0) {
+      clear(c);
+      return;
+   }
+
+   c.xrep.SetLength(sa << 1);
+   _ntl_ulong *cp = c.xrep.elts();
+   const _ntl_ulong *ap = a.xrep.elts();
+
+   for (long i = sa-1; i >= 0; i--)
+      pclmul_mul1 (cp + (i << 1), ap[i], ap[i]);
+
+   c.normalize();
+   return;
+}
+
+AVX_FUNC(void,sqr)(GF2X& c, const GF2X& a)
+{
+   long sa = a.xrep.length();
+   if (sa <= 0) {
+      clear(c);
+      return;
+   }
+
+   c.xrep.SetLength(sa << 1);
+   _ntl_ulong *cp = c.xrep.elts();
+   const _ntl_ulong *ap = a.xrep.elts();
+
+   for (long i = sa-1; i >= 0; i--)
+      pclmul_mul1 (cp + (i << 1), ap[i], ap[i]);
+
+   c.normalize();
+   return;
+}
+
+PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a));
+
+#else
+
 static inline
 void sqr1(_ntl_ulong *c, _ntl_ulong a)
 {
@@ -1663,6 +1932,7 @@ void sqr(GF2X& c, const GF2X& a)
    return;
 }
 
+#endif
 
 
 void LeftShift(GF2X& c, const GF2X& a, long n)
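
The GF2X.cpp changes above give each word-level kernel (mul1, Mul1, AddMul1,
Mul1_short, mul_half, sqr) a generic base body plus PCLMUL and AVX bodies,
selected through PCLMUL_RESOLVER.  When validating the PCLMUL path, a small
self-check such as the following can help; it is not part of the patch and
simply compares _mm_clmulepi64_si128 against a portable shift-and-xor
carry-less multiply (assumes a PCLMUL-capable CPU; build with -mpclmul):

   #include <wmmintrin.h>
   #include <cassert>
   #include <cstdio>

   // 64x64 -> 128 bit carry-less multiply via PCLMULQDQ.
   static void clmul64(unsigned long c[2], unsigned long a, unsigned long b)
   {
      __m128i aa = _mm_set_epi64x(0, (long long) a);
      __m128i bb = _mm_set_epi64x(0, (long long) b);
      _mm_storeu_si128((__m128i *) c, _mm_clmulepi64_si128(aa, bb, 0));
   }

   // Same product computed bit by bit, with no special instructions.
   static void slow64(unsigned long c[2], unsigned long a, unsigned long b)
   {
      c[0] = c[1] = 0;
      for (int i = 0; i < 64; i++)
         if ((b >> i) & 1) {
            c[0] ^= a << i;
            if (i) c[1] ^= a >> (64 - i);
         }
   }

   int main()
   {
      unsigned long x[2], y[2];
      unsigned long a = 0x9d2c5680a5f152ebUL, b = 0xdeadbeef12345678UL;
      clmul64(x, a, b);
      slow64(y, a, b);
      assert(x[0] == y[0] && x[1] == y[1]);
      std::printf("PCLMUL product matches the portable GF(2) multiply\n");
      return 0;
   }
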
--- src/InitSettings.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/InitSettings.cpp	2017-01-12 15:07:36.867377002 -0700
@@ -148,6 +148,11 @@ int main()
    cout << "NTL_RANGE_CHECK=0\n";
 #endif
 
+#ifdef NTL_LOADTIME_CPU
+   cout << "NTL_LOADTIME_CPU=1\n";
+#else
+   cout << "NTL_LOADTIME_CPU=0\n";
+#endif
 
 // the following are not actual config flags, but help
 // in the Wizard logic
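
The mat_lzz_p.cpp changes below split the old MUL_ADD macro into AVX_MUL_ADD
(multiply then add) and FMA_MUL_ADD (_mm256_fmadd_pd), build separate AVX,
FMA and AVX2 block kernels, and have muladd_all_by_32/_by_16 choose the
three-row FMA kernels when have_fma is set, falling back to the two-row AVX
kernels otherwise.  A compact sketch of that decision, with hypothetical
names and __builtin_cpu_supports (GCC 4.8+) standing in for the raw CPUID
test used by the patch:

   #include <immintrin.h>

   // AVX version: separate multiply and add.
   static void __attribute__((target("avx")))
   axpy4_avx(double *x, const double *y, double c)
   {
      __m256d cv = _mm256_broadcast_sd(&c);
      __m256d xv = _mm256_load_pd(x);
      __m256d yv = _mm256_load_pd(y);
      _mm256_store_pd(x, _mm256_add_pd(xv, _mm256_mul_pd(yv, cv)));
   }

   // FMA version: one fused multiply-add per vector.
   static void __attribute__((target("fma,avx")))
   axpy4_fma(double *x, const double *y, double c)
   {
      __m256d cv = _mm256_broadcast_sd(&c);
      __m256d xv = _mm256_load_pd(x);
      __m256d yv = _mm256_load_pd(y);
      _mm256_store_pd(x, _mm256_fmadd_pd(yv, cv, xv));   // x += y*c
   }

   // x and y must be 32-byte aligned, as the aligned loads/stores assume.
   void axpy4(double *x, const double *y, double c)
   {
      if (__builtin_cpu_supports("fma"))      axpy4_fma(x, y, c);
      else if (__builtin_cpu_supports("avx")) axpy4_avx(x, y, c);
      else for (int i = 0; i < 4; i++)        x[i] += y[i] * c;
   }
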
--- src/mat_lzz_p.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/mat_lzz_p.cpp	2017-01-12 21:47:53.774949563 -0700
@@ -10,6 +10,15 @@
 
 #ifdef NTL_HAVE_AVX
 #include <immintrin.h>
+#define AVX_ACTIVE 1
+#elif defined(NTL_LOADTIME_CPU)
+#include <immintrin.h>
+#define AVX_ACTIVE have_avx
+
+static int have_pclmul = -1;
+static int have_avx = -1;
+static int have_fma = -1;
+static int have_avx2 = -1;
 #endif
 
 NTL_START_IMPL
@@ -626,7 +635,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
 
 #ifdef NTL_HAVE_LL_TYPE
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
 #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1)
 // max int representable exactly as a double
@@ -640,18 +649,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A,
 
 
 // MUL_ADD(a, b, c): a += b*c
+#define FMA_MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) 
+#define AVX_MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
 #ifdef NTL_HAVE_FMA
-#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) 
+#define MUL_ADD(a, b, c) FMA_MUL_ADD(a, b, c)
 #else
-#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c))
+#define MUL_ADD(a, b, c) AVX_MUL_ADD(a, b, c)
 #endif
 
-static
-void muladd1_by_32(double *x, const double *a, const double *b, long n)
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd1_by_32)
+(double *x, const double *a, const double *b, long n)
 {
-   __m256d avec, bvec;
+   __m256d acc0=_mm256_load_pd(x + 0*4);
+   __m256d acc1=_mm256_load_pd(x + 1*4);
+   __m256d acc2=_mm256_load_pd(x + 2*4);
+   __m256d acc3=_mm256_load_pd(x + 3*4);
+   __m256d acc4=_mm256_load_pd(x + 4*4);
+   __m256d acc5=_mm256_load_pd(x + 5*4);
+   __m256d acc6=_mm256_load_pd(x + 6*4);
+   __m256d acc7=_mm256_load_pd(x + 7*4);
+
+   long i = 0;
+   for (; i <= n-4; i +=4) {
 
+      // the following code sequences are a bit faster than
+      // just doing 4 _mm256_broadcast_sd's
+      // it requires a to point to aligned storage, however
+
+#if 1
+     // this one seems slightly faster
+      __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
+      __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
+#else
+      __m256d avec = _mm256_load_pd(a);
+      __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
+      __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
+
+#endif
 
+      __m256d avec0 = _mm256_permute_pd(a0101, 0);
+      __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
+      __m256d avec2 = _mm256_permute_pd(a2323, 0);
+      __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
+
+      a += 4;
+
+      __m256d bvec;
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec2, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec3, bvec);
+   }
+
+   for (; i < n; i++) {
+      __m256d avec = _mm256_broadcast_sd(a); a++;
+      __m256d bvec;
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4, acc0);
+   _mm256_store_pd(x + 1*4, acc1);
+   _mm256_store_pd(x + 2*4, acc2);
+   _mm256_store_pd(x + 3*4, acc3);
+   _mm256_store_pd(x + 4*4, acc4);
+   _mm256_store_pd(x + 5*4, acc5);
+   _mm256_store_pd(x + 6*4, acc6);
+   _mm256_store_pd(x + 7*4, acc7);
+}
+
+FMA_FUNC(void,muladd1_by_32)
+(double *x, const double *a, const double *b, long n)
+{
    __m256d acc0=_mm256_load_pd(x + 0*4);
    __m256d acc1=_mm256_load_pd(x + 1*4);
    __m256d acc2=_mm256_load_pd(x + 2*4);
@@ -661,10 +772,179 @@ void muladd1_by_32(double *x, const doub
    __m256d acc6=_mm256_load_pd(x + 6*4);
    __m256d acc7=_mm256_load_pd(x + 7*4);
 
+   long i = 0;
+   for (; i <= n-4; i +=4) {
 
-   for (long i = 0; i < n; i++) {
-      avec = _mm256_broadcast_sd(a); a++;
+      // the following code sequences are a bit faster than
+      // just doing 4 _mm256_broadcast_sd's
+      // it requires a to point to aligned storage, however
 
+#if 1
+     // this one seems slightly faster
+      __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
+      __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
+#else
+      __m256d avec = _mm256_load_pd(a);
+      __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
+      __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
+
+#endif
+
+      __m256d avec0 = _mm256_permute_pd(a0101, 0);
+      __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
+      __m256d avec2 = _mm256_permute_pd(a2323, 0);
+      __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
+
+      a += 4;
+
+      __m256d bvec;
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec0, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec1, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec2, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec3, bvec);
+   }
+
+   for (; i < n; i++) {
+      __m256d avec = _mm256_broadcast_sd(a); a++;
+      __m256d bvec;
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4, acc0);
+   _mm256_store_pd(x + 1*4, acc1);
+   _mm256_store_pd(x + 2*4, acc2);
+   _mm256_store_pd(x + 3*4, acc3);
+   _mm256_store_pd(x + 4*4, acc4);
+   _mm256_store_pd(x + 5*4, acc5);
+   _mm256_store_pd(x + 6*4, acc6);
+   _mm256_store_pd(x + 7*4, acc7);
+}
+
+FMA_RESOLVER(static void,muladd1_by_32,
+   (double *x, const double *a, const double *b, long n));
+
+#else
+
+static
+void muladd1_by_32(double *x, const double *a, const double *b, long n)
+{
+   __m256d acc0=_mm256_load_pd(x + 0*4);
+   __m256d acc1=_mm256_load_pd(x + 1*4);
+   __m256d acc2=_mm256_load_pd(x + 2*4);
+   __m256d acc3=_mm256_load_pd(x + 3*4);
+   __m256d acc4=_mm256_load_pd(x + 4*4);
+   __m256d acc5=_mm256_load_pd(x + 5*4);
+   __m256d acc6=_mm256_load_pd(x + 6*4);
+   __m256d acc7=_mm256_load_pd(x + 7*4);
+
+   long i = 0;
+   for (; i <= n-4; i +=4) {
+
+      // the following code sequences are a bit faster than
+      // just doing 4 _mm256_broadcast_sd's
+      // it requires a to point to aligned storage, however
+
+#if 1
+     // this one seems slightly faster
+      __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0));
+      __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2));
+#else
+      __m256d avec = _mm256_load_pd(a);
+      __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0);
+      __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11);
+
+#endif
+
+      __m256d avec0 = _mm256_permute_pd(a0101, 0);
+      __m256d avec1 = _mm256_permute_pd(a0101, 0xf);
+      __m256d avec2 = _mm256_permute_pd(a2323, 0);
+      __m256d avec3 = _mm256_permute_pd(a2323, 0xf);
+
+      a += 4;
+
+      __m256d bvec;
+
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec0, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec0, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec1, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec1, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec2, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec2, bvec);
+
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec3, bvec);
+      bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec3, bvec);
+   }
+
+   for (; i < n; i++) {
+      __m256d avec = _mm256_broadcast_sd(a); a++;
+      __m256d bvec;
 
       bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec);
       bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec);
@@ -687,6 +967,75 @@ void muladd1_by_32(double *x, const doub
    _mm256_store_pd(x + 7*4, acc7);
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd1_by_16)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec, bvec;
+
+
+   __m256d acc0=_mm256_load_pd(x + 0*4);
+   __m256d acc1=_mm256_load_pd(x + 1*4);
+   __m256d acc2=_mm256_load_pd(x + 2*4);
+   __m256d acc3=_mm256_load_pd(x + 3*4);
+
+
+   for (long i = 0; i < n; i++) {
+      avec = _mm256_broadcast_sd(a); a++;
+
+
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec);
+      b += 16;
+   }
+
+
+   _mm256_store_pd(x + 0*4, acc0);
+   _mm256_store_pd(x + 1*4, acc1);
+   _mm256_store_pd(x + 2*4, acc2);
+   _mm256_store_pd(x + 3*4, acc3);
+}
+
+FMA_FUNC(void,muladd1_by_16)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec, bvec;
+
+
+   __m256d acc0=_mm256_load_pd(x + 0*4);
+   __m256d acc1=_mm256_load_pd(x + 1*4);
+   __m256d acc2=_mm256_load_pd(x + 2*4);
+   __m256d acc3=_mm256_load_pd(x + 3*4);
+
+
+   for (long i = 0; i < n; i++) {
+      avec = _mm256_broadcast_sd(a); a++;
+
+
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec);
+      bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec);
+      b += 16;
+   }
+
+
+   _mm256_store_pd(x + 0*4, acc0);
+   _mm256_store_pd(x + 1*4, acc1);
+   _mm256_store_pd(x + 2*4, acc2);
+   _mm256_store_pd(x + 3*4, acc3);
+}
+
+FMA_RESOLVER(static void,muladd1_by_16,
+   (double *x, const double *a, const double *b, long n));
+
+#else
+
 static
 void muladd1_by_16(double *x, const double *a, const double *b, long n)
 {
@@ -717,6 +1066,165 @@ void muladd1_by_16(double *x, const doub
    _mm256_store_pd(x + 3*4, acc3);
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+AVX_FUNC(void,muladd2_by_32)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec0, avec1, bvec;
+   __m256d acc00, acc01, acc02, acc03;
+   __m256d acc10, acc11, acc12, acc13;
+ 
+
+   // round 0
+
+   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
+
+   // round 1
+
+   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec);
+   }
+
+
+   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+}
+
+FMA_FUNC(void,muladd2_by_32)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec0, avec1, bvec;
+   __m256d acc00, acc01, acc02, acc03;
+   __m256d acc10, acc11, acc12, acc13;
+ 
+
+   // round 0
+
+   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
+
+   // round 1
+
+   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec);
+   }
+
+
+   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+}
+
+FMA_RESOLVER(static void,muladd2_by_32,
+   (double *x, const double *a, const double *b, long n));
+
+#else
 
 // experiment: process two rows at a time
 static
@@ -795,6 +1303,211 @@ void muladd2_by_32(double *x, const doub
 
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+FMA_FUNC(void,muladd3_by_32)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec0, avec1, avec2, bvec;
+   __m256d acc00, acc01, acc02, acc03;
+   __m256d acc10, acc11, acc12, acc13;
+   __m256d acc20, acc21, acc22, acc23;
+ 
+
+   // round 0
+
+   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
+
+   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
+   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
+   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
+   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
+
+   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
+   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
+   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
+   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
+
+   // round 1
+
+   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
+
+   acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
+   acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
+   acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
+   acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+   }
+
+
+   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+   _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
+   _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
+   _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
+   _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
+
+}
+
+AVX2_FUNC(void,muladd3_by_32)
+(double *x, const double *a, const double *b, long n)
+{
+   __m256d avec0, avec1, avec2, bvec;
+   __m256d acc00, acc01, acc02, acc03;
+   __m256d acc10, acc11, acc12, acc13;
+   __m256d acc20, acc21, acc22, acc23;
+ 
+
+   // round 0
+
+   acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ);
+
+   acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ);
+   acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ);
+   acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ);
+   acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+   }
+
+
+   _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13);
+
+   _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20);
+   _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21);
+   _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22);
+   _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23);
+
+   // round 1
+
+   acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ);
+   acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ);
+   acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ);
+   acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ);
+
+   acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ);
+   acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ);
+   acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ);
+   acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ);
+
+   acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ);
+   acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ);
+   acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ);
+   acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ);
+
+   for (long i = 0; i < n; i++) {
+      avec0 = _mm256_broadcast_sd(&a[i]); 
+      avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
+      avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
+
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
+   }
+
+
+   _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00);
+   _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01);
+   _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02);
+   _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03);
+
+   _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10);
+   _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11);
+   _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12);
+   _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13);
+
+   _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20);
+   _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21);
+   _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22);
+   _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23);
+
+}
+
+AVX2_RESOLVER(static void,muladd3_by_32,
+   (double *x, const double *a, const double *b, long n));
+
+#else
 
 // experiment: process three rows at a time
 // NOTE: this makes things slower on an AVX1 platform --- not enough registers
@@ -899,8 +1612,10 @@ void muladd3_by_32(double *x, const doub
 
 }
 
-static
-void muladd2_by_16(double *x, const double *a, const double *b, long n)
+#endif
+
+static void __attribute__((target ("avx,pclmul")))
+muladd2_by_16(double *x, const double *a, const double *b, long n)
 {
    __m256d avec0, avec1, bvec;
    __m256d acc00, acc01, acc02, acc03;
@@ -923,10 +1638,10 @@ void muladd2_by_16(double *x, const doub
       avec0 = _mm256_broadcast_sd(&a[i]); 
       avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
 
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec);
    }
 
 
@@ -942,8 +1657,8 @@ void muladd2_by_16(double *x, const doub
 
 }
 
-static
-void muladd3_by_16(double *x, const double *a, const double *b, long n)
+static void __attribute__((target ("fma,pclmul")))
+muladd3_by_16(double *x, const double *a, const double *b, long n)
 {
    __m256d avec0, avec1, avec2, bvec;
    __m256d acc00, acc01, acc02, acc03;
@@ -973,10 +1688,10 @@ void muladd3_by_16(double *x, const doub
       avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); 
       avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); 
 
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); MUL_ADD(acc20, avec2, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); MUL_ADD(acc21, avec2, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); MUL_ADD(acc22, avec2, bvec);
-      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); MUL_ADD(acc23, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec);
+      bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec);
    }
 
 
@@ -997,6 +1712,30 @@ void muladd3_by_16(double *x, const doub
 
 }
 
+#ifdef NTL_LOADTIME_CPU
+
+static inline
+void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
+{
+   long i = first;
+
+   if (have_fma) {
+      // processing three rows at a time is faster
+      for (; i <= last-3; i+=3)
+         muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+      for (; i < last; i++)
+         muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+   } else {
+      // process only two rows at a time: not enough registers :-(
+      for (; i <= last-2; i+=2)
+         muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+      for (; i < last; i++)
+         muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+   }
+}
+
+#else
+
 static inline
 void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n)
 {
@@ -1016,6 +1755,30 @@ void muladd_all_by_32(long first, long l
 #endif
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+
+static inline
+void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
+{
+   long i = first;
+   if (have_fma) {
+      // processing three rows at a time is faster
+      for (; i <= last-3; i+=3)
+         muladd3_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+      for (; i < last; i++)
+         muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+   } else {
+      // process only two rows at a time: not enough registers :-(
+      for (; i <= last-2; i+=2)
+         muladd2_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+      for (; i < last; i++)
+         muladd1_by_16(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n);
+   }
+}
+
+#else
 
 static inline
 void muladd_all_by_16(long first, long last, double *x, const double *a, const double *b, long n)
@@ -1036,6 +1799,8 @@ void muladd_all_by_16(long first, long l
 #endif
 }
 
+#endif
+
 static inline
 void muladd_all_by_32_width(long first, long last, double *x, const double *a, const double *b, long n, long width)
 {
@@ -1045,7 +1810,74 @@ void muladd_all_by_32_width(long first,
       muladd_all_by_16(first, last, x, a, b, n);
 }
 
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+   __m256d xvec0, xvec1, xvec2, xvec3;
+   __m256d yvec0, yvec1, yvec2, yvec3;
+
+   __m256d cvec = _mm256_broadcast_sd(&c);
+
+   for (long i = 0; i < n; i += 16, x += 16, y += 16) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      xvec1 = _mm256_load_pd(x+1*4);
+      xvec2 = _mm256_load_pd(x+2*4);
+      xvec3 = _mm256_load_pd(x+3*4);
+
+      yvec0 = _mm256_load_pd(y+0*4);
+      yvec1 = _mm256_load_pd(y+1*4);
+      yvec2 = _mm256_load_pd(y+2*4);
+      yvec3 = _mm256_load_pd(y+3*4);
+
+      AVX_MUL_ADD(xvec0, yvec0, cvec);
+      AVX_MUL_ADD(xvec1, yvec1, cvec);
+      AVX_MUL_ADD(xvec2, yvec2, cvec);
+      AVX_MUL_ADD(xvec3, yvec3, cvec);
+
+      _mm256_store_pd(x + 0*4, xvec0);
+      _mm256_store_pd(x + 1*4, xvec1);
+      _mm256_store_pd(x + 2*4, xvec2);
+      _mm256_store_pd(x + 3*4, xvec3);
+   }
+}
+
+FMA_FUNC(void,muladd_interval)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+   __m256d xvec0, xvec1, xvec2, xvec3;
+   __m256d yvec0, yvec1, yvec2, yvec3;
 
+   __m256d cvec = _mm256_broadcast_sd(&c);
+
+   for (long i = 0; i < n; i += 16, x += 16, y += 16) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      xvec1 = _mm256_load_pd(x+1*4);
+      xvec2 = _mm256_load_pd(x+2*4);
+      xvec3 = _mm256_load_pd(x+3*4);
+
+      yvec0 = _mm256_load_pd(y+0*4);
+      yvec1 = _mm256_load_pd(y+1*4);
+      yvec2 = _mm256_load_pd(y+2*4);
+      yvec3 = _mm256_load_pd(y+3*4);
+
+      FMA_MUL_ADD(xvec0, yvec0, cvec);
+      FMA_MUL_ADD(xvec1, yvec1, cvec);
+      FMA_MUL_ADD(xvec2, yvec2, cvec);
+      FMA_MUL_ADD(xvec3, yvec3, cvec);
+
+      _mm256_store_pd(x + 0*4, xvec0);
+      _mm256_store_pd(x + 1*4, xvec1);
+      _mm256_store_pd(x + 2*4, xvec2);
+      _mm256_store_pd(x + 3*4, xvec3);
+   }
+}
+
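+// FMA_RESOLVER (defined earlier in this patch) selects between the _avx
+// and _fma bodies above at load time, based on the detected CPU features.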
+FMA_RESOLVER(static void,muladd_interval,
+   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
+
+#else
 
 // this assumes n is a multiple of 16
 static inline
@@ -1079,6 +1911,107 @@ void muladd_interval(double * NTL_RESTRI
    }
 }
 
+#endif
+
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval1)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+
+   __m256d xvec0, xvec1, xvec2, xvec3;
+   __m256d yvec0, yvec1, yvec2, yvec3;
+   __m256d cvec;
+
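+   // cvec is needed only if at least one full 4-lane iteration runs;
+   // the scalar tail below uses c directly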
+   if (n >= 4)
+      cvec = _mm256_broadcast_sd(&c);
+
+   long i=0;
+   for (; i <= n-16; i += 16, x += 16, y += 16) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      xvec1 = _mm256_load_pd(x+1*4);
+      xvec2 = _mm256_load_pd(x+2*4);
+      xvec3 = _mm256_load_pd(x+3*4);
+
+      yvec0 = _mm256_load_pd(y+0*4);
+      yvec1 = _mm256_load_pd(y+1*4);
+      yvec2 = _mm256_load_pd(y+2*4);
+      yvec3 = _mm256_load_pd(y+3*4);
+
+      AVX_MUL_ADD(xvec0, yvec0, cvec);
+      AVX_MUL_ADD(xvec1, yvec1, cvec);
+      AVX_MUL_ADD(xvec2, yvec2, cvec);
+      AVX_MUL_ADD(xvec3, yvec3, cvec);
+
+      _mm256_store_pd(x + 0*4, xvec0);
+      _mm256_store_pd(x + 1*4, xvec1);
+      _mm256_store_pd(x + 2*4, xvec2);
+      _mm256_store_pd(x + 3*4, xvec3);
+   }
+
+   for (; i <= n-4; i += 4, x += 4, y += 4) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      yvec0 = _mm256_load_pd(y+0*4);
+      AVX_MUL_ADD(xvec0, yvec0, cvec);
+      _mm256_store_pd(x + 0*4, xvec0);
+   }
+
+   for (; i < n; i++, x++, y++) {
+      *x += (*y)*c;
+   }
+}
+
+FMA_FUNC(void,muladd_interval1)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+
+   __m256d xvec0, xvec1, xvec2, xvec3;
+   __m256d yvec0, yvec1, yvec2, yvec3;
+   __m256d cvec;
+
+   if (n >= 4)
+      cvec = _mm256_broadcast_sd(&c);
+
+   long i=0;
+   for (; i <= n-16; i += 16, x += 16, y += 16) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      xvec1 = _mm256_load_pd(x+1*4);
+      xvec2 = _mm256_load_pd(x+2*4);
+      xvec3 = _mm256_load_pd(x+3*4);
+
+      yvec0 = _mm256_load_pd(y+0*4);
+      yvec1 = _mm256_load_pd(y+1*4);
+      yvec2 = _mm256_load_pd(y+2*4);
+      yvec3 = _mm256_load_pd(y+3*4);
+
+      FMA_MUL_ADD(xvec0, yvec0, cvec);
+      FMA_MUL_ADD(xvec1, yvec1, cvec);
+      FMA_MUL_ADD(xvec2, yvec2, cvec);
+      FMA_MUL_ADD(xvec3, yvec3, cvec);
+
+      _mm256_store_pd(x + 0*4, xvec0);
+      _mm256_store_pd(x + 1*4, xvec1);
+      _mm256_store_pd(x + 2*4, xvec2);
+      _mm256_store_pd(x + 3*4, xvec3);
+   }
+
+   for (; i <= n-4; i += 4, x += 4, y += 4) {
+      xvec0 = _mm256_load_pd(x+0*4);
+      yvec0 = _mm256_load_pd(y+0*4);
+      FMA_MUL_ADD(xvec0, yvec0, cvec);
+      _mm256_store_pd(x + 0*4, xvec0);
+   }
+
+   for (; i < n; i++, x++, y++) {
+      *x += (*y)*c;
+   }
+}
+
+FMA_RESOLVER(static void,muladd_interval1,
+   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
+
+#else
+
 // this one is more general: does not assume that n is a
 // multiple of 16
 static inline
@@ -1127,8 +2060,73 @@ void muladd_interval1(double * NTL_RESTR
    }
 }
 
+#endif
+
 #define AVX_PD_SZ (4)
 
+#ifdef NTL_LOADTIME_CPU
+
+AVX_FUNC(void,muladd_interval2)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+   n /= 4;
+   if (n <= 0 || n > 8) return;
+
+   x += n*4;
+   y += n*4;
+
+   // n in [1..8]
+
+   __m256d xvec, yvec, cvec;
+
+   cvec = _mm256_broadcast_sd(&c);
+
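+   // intentional fall-through: entering at case n processes all n
+   // remaining 4-double blocks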
+   switch (n) {
+   case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
+   case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
+   case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
+   case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
+   case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
+   case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
+   case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
+   case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
+   }
+
+}
+
+FMA_FUNC(void,muladd_interval2)
+(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)
+{
+   n /= 4;
+   if (n <= 0 || n > 8) return;
+
+   x += n*4;
+   y += n*4;
+
+   // n in [1..8]
+
+   __m256d xvec, yvec, cvec;
+
+   cvec = _mm256_broadcast_sd(&c);
+
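+   // intentional fall-through, as in the _avx body above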
+   switch (n) {
+   case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec);
+   case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec);
+   case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec);
+   case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec);
+   case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec);
+   case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec);
+   case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec);
+   case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec);
+   }
+
+}
+
+FMA_RESOLVER(static void,muladd_interval2,
+   (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n));
+
+#else
+
 // experimental: assumes n is a multiple of 4 in the range [0..32]
 #if 1
 static inline
@@ -1169,6 +2167,8 @@ void muladd_interval2(double * NTL_RESTR
 
 #endif
 
+#endif
+
 
 #define DO_MUL(a, b) ((unsigned long) (long(a)*long(b)))
 
@@ -2743,10 +3743,10 @@ void alt_mul_LL(const mat_window_zz_p& X
 }  
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
-static
-void blk_mul_DD(const mat_window_zz_p& X, 
+static void __attribute__((target ("avx,pclmul")))
+blk_mul_DD(const mat_window_zz_p& X, 
                 const const_mat_window_zz_p& A, const const_mat_window_zz_p& B)  
 {  
    long n = A.NumRows();  
@@ -3085,12 +4085,13 @@ void mul_base (const mat_window_zz_p& X,
    long p = zz_p::modulus();
    long V = MAT_BLK_SZ*4;
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
    // experimentally, blk_mul_DD beats all the alternatives
    // if each dimension is at least 16
 
-   if (n >= 16 && l >= 16 && m >= 16 &&
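+   // AVX_ACTIVE: only take the blk_mul_DD path when AVX is actually
+   // available on the executing CPU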
+   if (AVX_ACTIVE &&
+       n >= 16 && l >= 16 && m >= 16 &&
        p-1 <= MAX_DBL_INT &&
        V <= (MAX_DBL_INT-(p-1))/(p-1) &&
        V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) 
@@ -3185,7 +4186,8 @@ void mul_strassen(const mat_window_zz_p&
     // this code determines if mul_base triggers blk_mul_DD,
     // in which case a higher crossover is used
 
-#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX))
+#if (defined(NTL_HAVE_LL_TYPE) && (defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)))
+    if (AVX_ACTIVE)
     {
        long V = MAT_BLK_SZ*4;
        long p = zz_p::modulus();
@@ -3685,10 +4687,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con
 
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
-static
-void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
+static void __attribute__((target ("avx,pclmul")))
+alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
 {
    long n = A.NumRows();
 
@@ -3854,10 +4856,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co
 
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
-static
-void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
+static void __attribute__((target ("avx,pclmul")))
+blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax)
 {
    long n = A.NumRows();
 
@@ -4615,8 +5617,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
    else if (n/MAT_BLK_SZ < 4) {
       long V = 64;
 
-#ifdef NTL_HAVE_AVX
-      if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+      if (AVX_ACTIVE &&
+          p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
 
@@ -4641,8 +5644,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c
    else {
       long V = 4*MAT_BLK_SZ;
 
-#ifdef NTL_HAVE_AVX
-      if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+      if (AVX_ACTIVE &&
+          p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
 
@@ -5048,10 +6052,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p&
 
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
-static
-void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, 
+static void __attribute__((target ("avx,pclmul")))
+alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, 
                vec_zz_p *xp, bool trans, bool relax)
 {
    long n = A.NumRows();
@@ -5238,10 +6242,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p&
 
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
-static
-void blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, 
+static void __attribute__((target ("avx,pclmul")))
+blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, 
                vec_zz_p *xp, bool trans, bool relax)
 {
    long n = A.NumRows();
@@ -6052,8 +7056,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
    else if (n/MAT_BLK_SZ < 4) {
       long V = 64;
 
-#ifdef NTL_HAVE_AVX
-      if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+      if (AVX_ACTIVE &&
+          p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
 
@@ -6078,8 +7083,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
    else {
       long V = 4*MAT_BLK_SZ;
 
-#ifdef NTL_HAVE_AVX
-      if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+      if (AVX_ACTIVE &&
+          p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
 
@@ -6325,7 +7331,7 @@ long elim_basic(const mat_zz_p& A, mat_z
 #ifdef NTL_HAVE_LL_TYPE
 
 
-#ifdef NTL_HAVE_AVX
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
 
 
 static inline
@@ -7778,8 +8784,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
    else {
       long V = 4*MAT_BLK_SZ;
 
-#ifdef NTL_HAVE_AVX
-      if (p-1 <= MAX_DBL_INT &&
+#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+      if (AVX_ACTIVE &&
+          p-1 <= MAX_DBL_INT &&
           V <= (MAX_DBL_INT-(p-1))/(p-1) &&
           V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
 
--- src/QuickTest.cpp.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/QuickTest.cpp	2017-01-12 15:07:36.883376955 -0700
@@ -316,6 +316,9 @@ cerr << "Performance Options:\n";
    cerr << "NTL_GF2X_NOINLINE\n";
 #endif
 
+#ifdef NTL_LOADTIME_CPU
+   cerr << "NTL_LOADTIME_CPU\n";
+#endif
 
    cerr << "\n\n";
 
--- src/WizardAux.orig	2016-11-18 11:39:15.000000000 -0700
+++ src/WizardAux	2017-01-12 15:07:36.883376955 -0700
@@ -88,6 +88,7 @@ system("$ARGV[0] InitSettings");
 'NTL_GF2X_NOINLINE'    => 0,
 'NTL_FFT_BIGTAB'       => 0,
 'NTL_FFT_LAZYMUL'      => 0,
+'NTL_LOADTIME_CPU'     => 0,
 
 'WIZARD_HACK'          => '#define NTL_WIZARD_HACK',