diff --git a/.gitignore b/.gitignore index 202c75c..779591a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /ntl-9.6.2.tar.gz /ntl-9.6.4.tar.gz /ntl-9.7.0.tar.gz +/ntl-9.8.0.tar.gz diff --git a/ntl-loadtime-cpu.patch b/ntl-loadtime-cpu.patch new file mode 100644 index 0000000..117711f --- /dev/null +++ b/ntl-loadtime-cpu.patch @@ -0,0 +1,2286 @@ +--- doc/config.txt.orig 2016-04-26 05:40:15.000000000 -0600 ++++ doc/config.txt 2016-04-26 18:22:38.925719916 -0600 +@@ -291,6 +291,7 @@ NTL_GF2X_NOINLINE=off + NTL_GF2X_ALTCODE=off + NTL_GF2X_ALTCODE1=off + NTL_PCLMUL=off ++NTL_LOADTIME_CPU=off + + GMP_INCDIR=$(GMP_PREFIX)/include + GMP_LIBDIR=$(GMP_PREFIX)/lib +@@ -638,6 +639,10 @@ NTL_PCLMUL=off + # switch to enable the PCLMUL instruction on x86 machines for faster arithmetic + # over GF(2)[X] (without relying on the gf2x package) + ++NTL_LOADTIME_CPU=off ++ ++# switch to check CPU characteristics at load time and use routines ++# optimized for the executing CPU. + + + ########## More GMP Options: +--- include/NTL/config.h.orig 2016-04-26 05:40:16.000000000 -0600 ++++ include/NTL/config.h 2016-04-26 18:22:38.929719590 -0600 +@@ -616,6 +616,23 @@ using the configure script. + #endif + + ++#if 0 ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. ++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif ++ + + + +--- include/NTL/ctools.h.orig 2016-04-26 05:40:15.000000000 -0600 ++++ include/NTL/ctools.h 2016-04-26 18:22:38.930719509 -0600 +@@ -422,6 +422,137 @@ void _ntl_swap(T*& a, T*& b) + // this should be big enough to satisfy any SIMD instructions, + // and it should also be as big as a cache line + ++/* Determine CPU characteristics at runtime */ ++#ifdef NTL_LOADTIME_CPU ++#if !defined(__x86_64__) ++#error Runtime CPU support is only available on x86_64. ++#endif ++#ifndef __GNUC__ ++#error Runtime CPU support is only available with GCC. ++#endif ++#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6) ++#error Runtime CPU support is only available with GCC 4.6 or later. 
++#endif ++ ++#include ++#ifndef bit_PCLMUL ++#define bit_PCLMUL (1 << 1) ++#endif ++#ifndef bit_AVX ++#define bit_AVX (1 << 28) ++#endif ++#ifndef bit_FMA ++#define bit_FMA (1 << 12) ++#endif ++#ifndef bit_AVX2 ++#define bit_AVX2 (1 << 5) ++#endif ++ ++#define BASE_FUNC(type,name) static type name##_base ++#define TARGET_FUNC(arch,suffix,type,name) \ ++ static type __attribute__((target (arch))) name##_##suffix ++#define PCLMUL_FUNC(type,name) TARGET_FUNC("pclmul",pclmul,type,name) ++#define AVX_FUNC(type,name) TARGET_FUNC("avx,pclmul",avx,type,name) ++#define FMA_FUNC(type,name) TARGET_FUNC("fma,avx,pclmul",fma,type,name) ++#define AVX2_FUNC(type,name) TARGET_FUNC("avx2,fma,avx,pclmul",avx2,type,name) ++#define PCLMUL_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ if (have_avx) return (void (*)(void))&name##_avx; \ ++ if (have_pclmul) return (void (*)(void))&name##_pclmul; \ ++ return (void (*)(void))&name##_base; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ return have_avx \ ++ ? (void (*)(void))&name##_avx \ ++ : (void (*)(void))&name##_base; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define FMA_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ return have_fma \ ++ ? 
(void (*)(void))&name##_fma \ ++ : (void (*)(void))&name##_avx; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#define AVX2_RESOLVER(type,name,params) \ ++ extern "C" { \ ++ static void __attribute__((optimize ("O0"))) \ ++ (*resolve_##name (void))(void) { \ ++ if (__builtin_expect(have_avx2, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(7, &eax, &ebx, &ecx, &edx)) { \ ++ have_avx2 = ((ebx & bit_AVX2) != 0); \ ++ } else { \ ++ have_avx2 = 0; \ ++ } \ ++ } \ ++ if (__builtin_expect(have_pclmul, 0) < 0) { \ ++ unsigned int eax, ebx, ecx, edx; \ ++ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { \ ++ have_pclmul = ((ecx & bit_PCLMUL) != 0); \ ++ have_avx = ((ecx & bit_AVX) != 0); \ ++ have_fma = ((ecx & bit_FMA) != 0); \ ++ } else { \ ++ have_pclmul = 0; \ ++ have_avx = 0; \ ++ have_fma = 0; \ ++ } \ ++ } \ ++ unsigned int eax, ebx, ecx, edx; \ ++ return have_avx2 \ ++ ? (void (*)(void))&name##_avx2 \ ++ : (void (*)(void))&name##_fma; \ ++ } \ ++ } \ ++ type __attribute__((ifunc ("resolve_" #name))) name params ++#endif + + + +--- include/NTL/def_config.h.orig 2016-04-26 05:40:15.000000000 -0600 ++++ include/NTL/def_config.h 2016-04-26 18:22:38.930719509 -0600 +@@ -616,6 +616,22 @@ using the configure script. + #endif + + ++#if 0 ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. ++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif + + + +--- src/cfile.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/cfile 2016-04-26 18:22:38.931719428 -0600 +@@ -616,6 +616,23 @@ using the configure script. + #endif + + ++#if @{NTL_LOADTIME_CPU} ++#define NTL_LOADTIME_CPU ++ ++/* ++ * With this flag enabled, detect advanced CPU features at load time instead ++ * of at compile time. This flag is intended for distributions, so that they ++ * can compile for the lowest common denominator CPU, but still support newer ++ * CPUs. ++ * ++ * This flag is useful only on x86_64 platforms with gcc 4.8 or later. 
++ * ++ * To re-build after changing this flag: ++ * rm GF2X.o GF2X1.o lzz_pX1.o mat_lzz_p.o; make ntl.a ++ */ ++ ++#endif ++ + + @{WIZARD_HACK} + +--- src/DispSettings.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/DispSettings.c 2016-04-26 18:22:38.931719428 -0600 +@@ -186,6 +186,9 @@ cout << "Performance Options:\n"; + cout << "NTL_PCLMUL\n"; + #endif + ++#ifdef NTL_LOADTIME_CPU ++ cout << "NTL_LOADTIME_CPU\n"; ++#endif + + cout << "\n\n"; + +--- src/DoConfig.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/DoConfig 2016-04-26 18:24:47.237292382 -0600 +@@ -1,7 +1,7 @@ + # This is a perl script, invoked from a shell + + # use warnings; # this doesn't work on older versions of perl +- ++use Config; + + + %MakeFlag = ( +@@ -82,6 +82,7 @@ + 'NTL_RANGE_CHECK' => 'off', + 'NTL_FFT_BIGTAB' => 'off', + 'NTL_FFT_LAZYMUL' => 'off', ++'NTL_LOADTIME_CPU' => 'off', + + ); + +@@ -148,6 +149,15 @@ if ($ConfigFlag{'NTL_THREADS'} eq 'on' & + } + + ++# special processing: NTL_LOADTIME_CPU on x86/x86_64 only and => NTL_GF2X_NOINLINE ++ ++if ($ConfigFlag{'NTL_LOADTIME_CPU'} eq 'on') { ++ if (!$Config{archname} =~ /x86_64/) { ++ die "Error: NTL_LOADTIME_CPU currently only available with x86_64...sorry\n"; ++ } ++ $ConfigFlag{'NTL_GF2X_NOINLINE'} = 'on'; ++} ++ + + # some special MakeVal values that are determined by SHARED + +--- src/GF2X1.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/GF2X1.c 2016-04-26 18:22:38.933719265 -0600 +@@ -19,7 +19,7 @@ + // simple scaling factor for some crossover points: + // we use a lower crossover of the underlying multiplication + // is faster +-#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL)) ++#if (defined(NTL_GF2X_LIB) || defined(NTL_PCLMUL) || defined (NTL_LOADTIME_CPU)) + #define XOVER_SCALE (1L) + #else + #define XOVER_SCALE (2L) +--- src/GF2X.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/GF2X.c 2016-04-26 18:22:38.933719265 -0600 +@@ -31,6 +31,22 @@ pclmul_mul1 (unsigned long *c, unsigned + __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); + _mm_storeu_si128((__m128i*)c, _mm_clmulepi64_si128(aa, bb, 0)); + } ++#elif defined (NTL_LOADTIME_CPU) ++ ++#include ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; ++ ++#define NTL_INLINE inline ++ ++#define pclmul_mul1(c,a,b) do { \ ++ __m128i aa = _mm_setr_epi64( _mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0)); \ ++ __m128i bb = _mm_setr_epi64( _mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0)); \ ++ _mm_storeu_si128((__m128i*)(c), _mm_clmulepi64_si128(aa, bb, 0)); \ ++} while (0) ++ + #else + + +@@ -579,6 +595,27 @@ void add(GF2X& x, const GF2X& a, const G + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ NTL_EFF_BB_MUL_CODE0 ++} ++ ++PCLMUL_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} ++ ++AVX_FUNC(void,mul1)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} ++ ++PCLMUL_RESOLVER(static void,mul1,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); ++ ++#else ++ + static + void mul1(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) + { +@@ -592,6 +629,7 @@ NTL_EFF_BB_MUL_CODE0 + + } + ++#endif + + #ifdef NTL_GF2X_NOINLINE + +@@ -616,6 +654,51 @@ NTL_EFF_BB_MUL_CODE0 + #endif + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_BB_MUL_CODE1 ++} ++ ++PCLMUL_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long 
carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++AVX_FUNC(void,Mul1) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++PCLMUL_RESOLVER(static void,Mul1, ++ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); ++ ++#else ++ + static + void Mul1(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) + { +@@ -643,6 +726,53 @@ NTL_EFF_BB_MUL_CODE1 + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_BB_MUL_CODE2 ++} ++ ++PCLMUL_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] ^= carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] ^= carry; ++} ++ ++AVX_FUNC(void,AddMul1) ++(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] ^= carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] ^= carry; ++} ++ ++PCLMUL_RESOLVER(static void,AddMul1, ++ (_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a)); ++ ++#else ++ + static + void AddMul1(_ntl_ulong *cp, const _ntl_ulong* bp, long sb, _ntl_ulong a) + { +@@ -671,6 +801,52 @@ NTL_EFF_BB_MUL_CODE2 + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ NTL_EFF_SHORT_BB_MUL_CODE1 ++} ++ ++PCLMUL_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++AVX_FUNC(void,Mul1_short) ++(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) ++{ ++ long i; ++ unsigned long carry, prod[2]; ++ ++ carry = 0; ++ for (i = 0; i < sb; i++) { ++ pclmul_mul1(prod, bp[i], a); ++ cp[i] = carry ^ prod[0]; ++ carry = prod[1]; ++ } ++ ++ cp[sb] = carry; ++} ++ ++PCLMUL_RESOLVER(static void,Mul1_short, ++ (_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a)); ++ ++#else + + static + void Mul1_short(_ntl_ulong *cp, const _ntl_ulong *bp, long sb, _ntl_ulong a) +@@ -699,9 +875,29 @@ NTL_EFF_SHORT_BB_MUL_CODE1 + + } + ++#endif ++ ++ ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ NTL_EFF_HALF_BB_MUL_CODE0 ++} ++ ++PCLMUL_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} + ++AVX_FUNC(void,mul_half)(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) ++{ ++ pclmul_mul1(c, a, b); ++} + ++PCLMUL_RESOLVER(static void,mul_half,(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b)); + ++#else + + static + void mul_half(_ntl_ulong *c, _ntl_ulong a, _ntl_ulong b) +@@ -716,6 +912,8 @@ NTL_EFF_HALF_BB_MUL_CODE0 + + } + ++#endif ++ + + // mul2...mul8 hard-code 2x2...8x8 word multiplies. + // I adapted these routines from LiDIA (except mul3, see below). 
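For orientation, the following is a minimal, self-contained sketch -- with illustrative names, not code taken from the patch -- of the dispatch pattern that the BASE_FUNC/PCLMUL_FUNC/PCLMUL_RESOLVER macros expand to: a portable baseline clone, a target("pclmul") clone, and a GCC ifunc resolver that queries CPUID once and binds whichever clone the running CPU supports.

// Hypothetical standalone example (not part of the patch).
#include <cpuid.h>        // __get_cpuid
#include <wmmintrin.h>    // _mm_clmulepi64_si128 (PCLMUL)

#ifndef bit_PCLMUL
#define bit_PCLMUL (1 << 1)
#endif

// Portable fallback: shift-and-xor carry-less multiply of two 64-bit words
// (x86_64 assumed, so unsigned long is 64 bits).
static void mul1_base(unsigned long *c, unsigned long a, unsigned long b)
{
   unsigned long lo = 0, hi = 0;
   for (int i = 0; i < 64; i++)
      if ((b >> i) & 1) {
         lo ^= a << i;
         if (i) hi ^= a >> (64 - i);
      }
   c[0] = lo; c[1] = hi;
}

// PCLMUL clone: the target attribute makes the intrinsic usable even though
// the file as a whole is compiled for a baseline x86_64 CPU.
static void __attribute__((target("pclmul")))
mul1_pclmul(unsigned long *c, unsigned long a, unsigned long b)
{
   __m128i aa = _mm_set_epi64x(0, (long long) a);
   __m128i bb = _mm_set_epi64x(0, (long long) b);
   _mm_storeu_si128((__m128i *) c, _mm_clmulepi64_si128(aa, bb, 0));
}

// The resolver runs once, when the dynamic linker binds mul1 (at load time
// with BIND_NOW, otherwise on the first call); no per-call branch remains.
extern "C" {
static void (*resolve_mul1(void))(unsigned long *, unsigned long, unsigned long)
{
   unsigned int eax, ebx, ecx, edx;
   if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_PCLMUL))
      return mul1_pclmul;
   return mul1_base;
}
}

void mul1(unsigned long *c, unsigned long a, unsigned long b)
   __attribute__((ifunc("resolve_mul1")));

The ifunc mechanism is a GNU extension for ELF targets, which is consistent with the DoConfig change above restricting NTL_LOADTIME_CPU to x86_64 and the ctools.h check requiring a sufficiently new GCC.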
+@@ -1627,6 +1825,77 @@ static const _ntl_ulong sqrtab[256] = { + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) { ++ _ntl_ulong *c = cp + (i << 1); ++ _ntl_ulong a = ap[i]; ++ _ntl_ulong hi, lo; ++ ++ NTL_BB_SQR_CODE ++ ++ c[0] = lo; ++ c[1] = hi; ++ } ++ ++ c.normalize(); ++ return; ++} ++ ++PCLMUL_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) ++ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); ++ ++ c.normalize(); ++ return; ++} ++ ++AVX_FUNC(void,sqr)(GF2X& c, const GF2X& a) ++{ ++ long sa = a.xrep.length(); ++ if (sa <= 0) { ++ clear(c); ++ return; ++ } ++ ++ c.xrep.SetLength(sa << 1); ++ _ntl_ulong *cp = c.xrep.elts(); ++ const _ntl_ulong *ap = a.xrep.elts(); ++ ++ for (long i = sa-1; i >= 0; i--) ++ pclmul_mul1 (cp + (i << 1), ap[i], ap[i]); ++ ++ c.normalize(); ++ return; ++} ++ ++PCLMUL_RESOLVER(void,sqr,(GF2X& c, const GF2X& a)); ++ ++#else ++ + static inline + void sqr1(_ntl_ulong *c, _ntl_ulong a) + { +@@ -1667,6 +1936,7 @@ void sqr(GF2X& c, const GF2X& a) + return; + } + ++#endif + + + void LeftShift(GF2X& c, const GF2X& a, long n) +--- src/InitSettings.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/InitSettings.c 2016-04-26 20:06:27.078557786 -0600 +@@ -150,6 +150,11 @@ int main() + cout << "NTL_RANGE_CHECK=0\n"; + #endif + ++#ifdef NTL_LOADTIME_CPU ++ cout << "NTL_LOADTIME_CPU=1\n"; ++#else ++ cout << "NTL_LOADTIME_CPU=0\n"; ++#endif + + // the following is synthetically defined + #ifdef NTL_LONGLONG_SP_MULMOD +--- src/lzz_pX1.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/lzz_pX1.c 2016-04-26 18:22:38.934719184 -0600 +@@ -4,6 +4,12 @@ + + #ifdef NTL_HAVE_AVX + #include ++#elif defined(NTL_LOADTIME_CPU) ++#include ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; + #endif + + +@@ -1076,6 +1082,175 @@ void Comp3Mod(zz_pX& x1, zz_pX& x2, zz_p + + + ++#ifdef NTL_LOADTIME_CPU ++ ++BASE_FUNC(void,build) ++(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) ++{ ++ altH.orig = &H; ++ altH.mem.kill(); ++ altH.row.kill(); ++ ++ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } ++ ++ altH.n = F.n; ++ altH.m = H.H.length()-1; ++ ++ long p = zz_p::modulus(); ++ long n = altH.n; ++ long m = altH.m; ++ ++ ++ if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) && ++ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { ++ altH.strategy = 1; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else { ++ altH.strategy = 2; ++ altH.pinv_LL = make_sp_ll_reduce_struct(p); ++ } ++ ++ ++ if (altH.strategy == 1 || altH.strategy == 2) { ++ ++ altH.row.SetLength(n); ++ long **row = altH.row.elts(); ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + m - 1)/m; ++ long NumBlocks = (n + BlockSize - 1)/BlockSize; ++ ++ altH.mem.SetLength(NumBlocks); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(n, first + BlockSize); ++ altH.mem[i].SetLength((last-first)*m); ++ for (long j = first; j < last; j++) { ++ row[j] = altH.mem[i].elts() + (j-first)*m; ++ } ++ } ++ ++ for (long i = 0; i < 
m; i++) { ++ const zz_p* ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ row[j][i] = rep(ptr[j]); ++ for (long j = len; j < n; j++) ++ row[j][i] = 0; ++ } ++ } ++} ++ ++AVX_FUNC(void,build) ++(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) ++{ ++ altH.orig = &H; ++ altH.mem.kill(); ++ altH.row.kill(); ++ altH.dmem.kill(); ++ altH.drow.kill(); ++ ++ if (H.H.length() < 10 || F.n < 50) { altH.strategy = 0; return; } ++ ++ altH.n = F.n; ++ altH.m = H.H.length()-1; ++ ++ long p = zz_p::modulus(); ++ long n = altH.n; ++ long m = altH.m; ++ ++ if (n >= 128 && m <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1) && ++ m*(p-1) <= ((1L << NTL_DOUBLE_PRECISION)-1)/(p-1)) { ++ altH.strategy = 3; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else if (cast_unsigned(m) <= (~(0UL))/cast_unsigned(p-1) && ++ cast_unsigned(m)*cast_unsigned(p-1) <= (~(0UL))/cast_unsigned(p-1)) { ++ altH.strategy = 1; ++ altH.pinv_L = sp_PrepRem(p); ++ } ++ else { ++ altH.strategy = 2; ++ altH.pinv_LL = make_sp_ll_reduce_struct(p); ++ } ++ ++ ++ if (altH.strategy == 1 || altH.strategy == 2) { ++ ++ altH.row.SetLength(n); ++ long **row = altH.row.elts(); ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + m - 1)/m; ++ long NumBlocks = (n + BlockSize - 1)/BlockSize; ++ ++ altH.mem.SetLength(NumBlocks); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(n, first + BlockSize); ++ altH.mem[i].SetLength((last-first)*m); ++ for (long j = first; j < last; j++) { ++ row[j] = altH.mem[i].elts() + (j-first)*m; ++ } ++ } ++ ++ for (long i = 0; i < m; i++) { ++ const zz_p* ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ row[j][i] = rep(ptr[j]); ++ for (long j = len; j < n; j++) ++ row[j][i] = 0; ++ } ++ } else { ++ ++ // sanity check ++ if (m >= (1L << (NTL_BITS_PER_LONG-8))) ResourceError("zz_pXAltArgument: overflow"); ++ ++ long npanels = (n+15)/16; ++ long panel_size = 16*m; ++ ++ const long AllocAmt = 1L << 18; ++ ++ long BlockSize = (AllocAmt + panel_size - 1)/panel_size; ++ long NumBlocks = (npanels + BlockSize - 1)/BlockSize; ++ ++ altH.dmem.SetLength(NumBlocks); ++ altH.drow.SetLength(npanels); ++ double **drow = altH.drow.elts(); ++ ++ for (long i = 0; i < NumBlocks; i++) { ++ long first = i*BlockSize; ++ long last = min(npanels, first + BlockSize); ++ altH.dmem[i].SetLength((last-first)*panel_size); ++ ++ double *ptr = altH.dmem[i].get(); ++ ++ for (long j = first; j < last; j++) ++ drow[j] = ptr + (j-first)*panel_size; ++ } ++ ++ for (long i = 0; i < m; i++) { ++ const zz_p *ptr = H.H[i].rep.elts(); ++ long len = H.H[i].rep.length(); ++ for (long j = 0; j < len; j++) ++ drow[j/16][(i*16) + (j%16)] = rep(ptr[j]); ++ for (long j = len; j < npanels*16; j++) ++ drow[j/16][(i*16) + (j%16)] = 0; ++ } ++ } ++} ++ ++AVX_RESOLVER(void,build, ++ (zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F)); ++ ++#else ++ + void build(zz_pXAltArgument& altH, const zz_pXArgument& H, const zz_pXModulus& F) + { + altH.orig = &H; +@@ -1194,11 +1369,82 @@ void build(zz_pXAltArgument& altH, const + #endif + } + ++#endif ++ + + #ifdef NTL_HAVE_LL_TYPE + ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,mul16rowsD) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ 
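++   // mul16rowsD computes x[0..15] = A*b for a 16 x n block A stored as
++   // consecutive 16-double columns: each iteration broadcasts b[i], loads
++   // the i-th column of A as four 4-double vectors, and accumulates into
++   // acc0..acc3. This is the plain-AVX clone; the FMA clone below is
++   // identical except that it fuses the multiply-add via _mm256_fmadd_pd.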
__m256d bvec; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); ++ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); ++ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); ++ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++} ++ ++FMA_FUNC(void,mul16rowsD) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d bvec; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); ++ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); ++ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); ++ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++} ++ ++FMA_RESOLVER(static void,mul16rowsD, ++ (double *x, const double *a, const double *b, long n)); ++ ++#elif defined(NTL_HAVE_AVX) + +-#ifdef NTL_HAVE_AVX + static + void mul16rowsD(double *x, const double *a, const double *b, long n) + { +@@ -1243,6 +1489,114 @@ void mul16rowsD(double *x, const double + _mm256_store_pd(x + 3*4, acc3); + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,mul16rows2D) ++(double *x, double *x_, const double *a, const double *b, const double *b_, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d acc0_ = _mm256_setzero_pd(); ++ __m256d acc1_ = _mm256_setzero_pd(); ++ __m256d acc2_ = _mm256_setzero_pd(); ++ __m256d acc3_ = _mm256_setzero_pd(); ++ ++ ++ __m256d bvec; ++ __m256d bvec_; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ bvec_ = _mm256_broadcast_sd(&b_[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_add_pd(_mm256_mul_pd(avec0, bvec), acc0); ++ acc1 = _mm256_add_pd(_mm256_mul_pd(avec1, bvec), acc1); ++ acc2 = _mm256_add_pd(_mm256_mul_pd(avec2, bvec), acc2); ++ acc3 = _mm256_add_pd(_mm256_mul_pd(avec3, bvec), acc3); ++ ++ acc0_ = _mm256_add_pd(_mm256_mul_pd(avec0, bvec_), acc0_); ++ acc1_ = _mm256_add_pd(_mm256_mul_pd(avec1, bvec_), acc1_); ++ acc2_ = _mm256_add_pd(_mm256_mul_pd(avec2, bvec_), acc2_); ++ acc3_ = _mm256_add_pd(_mm256_mul_pd(avec3, bvec_), acc3_); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ ++ _mm256_store_pd(x_ + 0*4, acc0_); ++ _mm256_store_pd(x_ + 1*4, acc1_); ++ _mm256_store_pd(x_ + 2*4, acc2_); ++ _mm256_store_pd(x_ + 3*4, acc3_); ++} ++ ++FMA_FUNC(void,mul16rows2D) ++(double *x, double *x_, const 
double *a, const double *b, const double *b_, long n) ++{ ++ __m256d avec0, avec1, avec2, avec3; ++ ++ __m256d acc0 = _mm256_setzero_pd(); ++ __m256d acc1 = _mm256_setzero_pd(); ++ __m256d acc2 = _mm256_setzero_pd(); ++ __m256d acc3 = _mm256_setzero_pd(); ++ ++ __m256d acc0_ = _mm256_setzero_pd(); ++ __m256d acc1_ = _mm256_setzero_pd(); ++ __m256d acc2_ = _mm256_setzero_pd(); ++ __m256d acc3_ = _mm256_setzero_pd(); ++ ++ ++ __m256d bvec; ++ __m256d bvec_; ++ ++ for (long i = 0; i < n; i++) { ++ bvec = _mm256_broadcast_sd(&b[i]); ++ bvec_ = _mm256_broadcast_sd(&b_[i]); ++ ++ avec0 = _mm256_load_pd(a); a += 4; ++ avec1 = _mm256_load_pd(a); a += 4; ++ avec2 = _mm256_load_pd(a); a += 4; ++ avec3 = _mm256_load_pd(a); a += 4; ++ ++ acc0 = _mm256_fmadd_pd(avec0, bvec, acc0); ++ acc1 = _mm256_fmadd_pd(avec1, bvec, acc1); ++ acc2 = _mm256_fmadd_pd(avec2, bvec, acc2); ++ acc3 = _mm256_fmadd_pd(avec3, bvec, acc3); ++ ++ acc0_ = _mm256_fmadd_pd(avec0, bvec_, acc0_); ++ acc1_ = _mm256_fmadd_pd(avec1, bvec_, acc1_); ++ acc2_ = _mm256_fmadd_pd(avec2, bvec_, acc2_); ++ acc3_ = _mm256_fmadd_pd(avec3, bvec_, acc3_); ++ } ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ ++ _mm256_store_pd(x_ + 0*4, acc0_); ++ _mm256_store_pd(x_ + 1*4, acc1_); ++ _mm256_store_pd(x_ + 2*4, acc2_); ++ _mm256_store_pd(x_ + 3*4, acc3_); ++} ++ ++FMA_RESOLVER(static void,mul16rows2D, ++ (double *x, double *x_, const double *a, const double *b, const double *b_, long n)); ++ ++#elif defined(NTL_HAVE_AVX) + static + void mul16rows2D(double *x, double *x_, const double *a, const double *b, const double *b_, long n) + { +@@ -1309,6 +1663,7 @@ void mul16rows2D(double *x, double *x_, + _mm256_store_pd(x_ + 3*4, acc3_); + } + ++#endif + + #endif + +@@ -1422,7 +1777,7 @@ void CompMod_L(zz_pX& x, const zz_pX& g, + } + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + + static + void InnerProduct_AVX(zz_pX& x, const Vec& v, long low, long high, +@@ -1534,7 +1889,6 @@ void CompMod_AVX(zz_pX& x, const zz_pX& + + x = t; + } +-#endif + + + +@@ -1570,6 +1924,14 @@ void CompMod(zz_pX& x, const zz_pX& g, c + break; + + #endif ++#ifdef NTL_LOADTIME_CPU ++ case 3: ++ if (have_avx) { ++ CompMod_AVX(x, g, A, F); ++ break; ++ } ++ /* FALLTHRU */ ++#endif + + default: + LogicError("CompMod: bad strategy"); +--- src/mat_lzz_p.c.orig 2016-04-26 05:40:15.000000000 -0600 ++++ src/mat_lzz_p.c 2016-04-26 18:22:38.937718940 -0600 +@@ -10,6 +10,15 @@ + + #ifdef NTL_HAVE_AVX + #include ++#define AVX_ACTIVE 1 ++#elif defined(NTL_LOADTIME_CPU) ++#include ++#define AVX_ACTIVE have_avx ++ ++static int have_pclmul = -1; ++static int have_avx = -1; ++static int have_fma = -1; ++static int have_avx2 = -1; + #endif + + NTL_START_IMPL +@@ -632,7 +641,7 @@ void mul(mat_zz_p& X, const mat_zz_p& A, + + #ifdef NTL_HAVE_LL_TYPE + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + + #define MAX_DBL_INT ((1L << NTL_DOUBLE_PRECISION)-1) + // max int representable exactly as a double +@@ -646,19 +655,120 @@ void mul(mat_zz_p& X, const mat_zz_p& A, + + + // MUL_ADD(a, b, c): a += b*c ++#define FMA_MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) ++#define AVX_MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) + #ifdef NTL_HAVE_FMA +-#define MUL_ADD(a, b, c) a = _mm256_fmadd_pd(b, c, a) ++#define MUL_ADD(a, b, c) FMA_MUL_ADD(a, b, c) + #else +-#define MUL_ADD(a, b, c) a = _mm256_add_pd(a, _mm256_mul_pd(b, c)) ++#define 
MUL_ADD(a, b, c) AVX_MUL_ADD(a, b, c) + #endif + +-#if 0 +-static +-void muladd1_by_32(double *x, const double *a, const double *b, long n) ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd1_by_32) ++(double *x, const double *a, const double *b, long n) + { +- __m256d avec, bvec; ++ __m256d acc0=_mm256_load_pd(x + 0*4); ++ __m256d acc1=_mm256_load_pd(x + 1*4); ++ __m256d acc2=_mm256_load_pd(x + 2*4); ++ __m256d acc3=_mm256_load_pd(x + 3*4); ++ __m256d acc4=_mm256_load_pd(x + 4*4); ++ __m256d acc5=_mm256_load_pd(x + 5*4); ++ __m256d acc6=_mm256_load_pd(x + 6*4); ++ __m256d acc7=_mm256_load_pd(x + 7*4); ++ ++ long i = 0; ++ for (; i <= n-4; i +=4) { ++ ++ // the following code sequences are a bit faster than ++ // just doing 4 _mm256_broadcast_sd's ++ // it requires a to point to aligned storage, however ++ ++#if 1 ++ // this one seems slightly faster ++ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); ++ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); ++#else ++ __m256d avec = _mm256_load_pd(a); ++ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); ++ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); ++ ++#endif ++ ++ __m256d avec0 = _mm256_permute_pd(a0101, 0); ++ __m256d avec1 = _mm256_permute_pd(a0101, 0xf); ++ __m256d avec2 = _mm256_permute_pd(a2323, 0); ++ __m256d avec3 = _mm256_permute_pd(a2323, 0xf); ++ ++ a += 4; ++ ++ __m256d bvec; + ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec0, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec1, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec2, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec3, bvec); ++ 
bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec3, bvec); ++ } ++ ++ for (; i < n; i++) { ++ __m256d avec = _mm256_broadcast_sd(a); a++; ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc0, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc1, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc2, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc3, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc4, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc5, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc6, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; AVX_MUL_ADD(acc7, avec, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4, acc0); ++ _mm256_store_pd(x + 1*4, acc1); ++ _mm256_store_pd(x + 2*4, acc2); ++ _mm256_store_pd(x + 3*4, acc3); ++ _mm256_store_pd(x + 4*4, acc4); ++ _mm256_store_pd(x + 5*4, acc5); ++ _mm256_store_pd(x + 6*4, acc6); ++ _mm256_store_pd(x + 7*4, acc7); ++} + ++FMA_FUNC(void,muladd1_by_32) ++(double *x, const double *a, const double *b, long n) ++{ + __m256d acc0=_mm256_load_pd(x + 0*4); + __m256d acc1=_mm256_load_pd(x + 1*4); + __m256d acc2=_mm256_load_pd(x + 2*4); +@@ -668,19 +778,82 @@ void muladd1_by_32(double *x, const doub + __m256d acc6=_mm256_load_pd(x + 6*4); + __m256d acc7=_mm256_load_pd(x + 7*4); + ++ long i = 0; ++ for (; i <= n-4; i +=4) { + +- for (long i = 0; i < n; i++) { +- avec = _mm256_broadcast_sd(a); a++; ++ // the following code sequences are a bit faster than ++ // just doing 4 _mm256_broadcast_sd's ++ // it requires a to point to aligned storage, however ++ ++#if 1 ++ // this one seems slightly faster ++ __m256d a0101 = _mm256_broadcast_pd((const __m128d*)(a+0)); ++ __m256d a2323 = _mm256_broadcast_pd((const __m128d*)(a+2)); ++#else ++ __m256d avec = _mm256_load_pd(a); ++ __m256d a0101 = _mm256_permute2f128_pd(avec, avec, 0); ++ __m256d a2323 = _mm256_permute2f128_pd(avec, avec, 0x11); + ++#endif + +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc0, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc1, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc2, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc3, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc4, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc5, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc6, avec, bvec); +- bvec = _mm256_load_pd(b); b += 4; MUL_ADD(acc7, avec, bvec); ++ __m256d avec0 = _mm256_permute_pd(a0101, 0); ++ __m256d avec1 = _mm256_permute_pd(a0101, 0xf); ++ __m256d avec2 = _mm256_permute_pd(a2323, 0); ++ __m256d avec3 = _mm256_permute_pd(a2323, 0xf); ++ ++ a += 4; ++ ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec0, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec0, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, 
avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec1, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec1, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec2, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec2, bvec); ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec3, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec3, bvec); ++ } ++ ++ for (; i < n; i++) { ++ __m256d avec = _mm256_broadcast_sd(a); a++; ++ __m256d bvec; ++ ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc0, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc1, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc2, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc3, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc4, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc5, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc6, avec, bvec); ++ bvec = _mm256_load_pd(b); b += 4; FMA_MUL_ADD(acc7, avec, bvec); + } + + +@@ -694,6 +867,9 @@ void muladd1_by_32(double *x, const doub + _mm256_store_pd(x + 7*4, acc7); + } + ++FMA_RESOLVER(static void,muladd1_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ + #else + + static +@@ -800,7 +976,164 @@ void muladd1_by_32(double *x, const doub + #endif + + // experiment: process two rows at a time +-#if 1 ++#ifdef NTL_LOADTIME_CPU ++AVX_FUNC(void,muladd2_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ ++ ++ // round 0 ++ ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); ++ bvec = 
_mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc00, avec0, bvec); AVX_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc01, avec0, bvec); AVX_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc02, avec0, bvec); AVX_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); AVX_MUL_ADD(acc03, avec0, bvec); AVX_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++} ++ ++FMA_FUNC(void,muladd2_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ ++ ++ // round 0 ++ ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x 
+ 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++} ++ ++FMA_RESOLVER(static void,muladd2_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ ++#else ++ + static + void muladd2_by_32(double *x, const double *a, const double *b, long n) + { +@@ -876,96 +1209,217 @@ void muladd2_by_32(double *x, const doub + _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); + + } ++#endif + +-#else + +-static +-void muladd2_by_32(double *x, const double *a, const double *b, long n) ++ ++// experiment: process three rows at a time ++// NOTE: this makes things slower on an AVX1 platform --- not enough registers ++// it could be faster on AVX2/FMA, where there should be enough registers ++ ++#ifdef NTL_LOADTIME_CPU ++FMA_FUNC(void,muladd3_by_32) ++(double *x, const double *a, const double *b, long n) + { +- long i, j; +- __m256d bvec; ++ __m256d avec0, avec1, avec2, bvec; + __m256d acc00, acc01, acc02, acc03; + __m256d acc10, acc11, acc12, acc13; ++ __m256d acc20, acc21, acc22, acc23; ++ + ++ // round 0 + +- for (j = 0; j < 2; j++) { ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + +- acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); + +- acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); +- acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2)); ++ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); ++ 
acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ); + +- for (i = 0; i <= n-4; i+=4) { +- __m256d a0_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0)); +- __m256d a0_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2)); +- __m256d avec00 = _mm256_permute_pd(a0_0101, 0); +- __m256d avec01 = _mm256_permute_pd(a0_0101, 0xf); +- __m256d avec02 = _mm256_permute_pd(a0_2323, 0); +- __m256d avec03 = _mm256_permute_pd(a0_2323, 0xf); ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + +- __m256d a1_0101 = _mm256_broadcast_pd((const __m128d*)(a+i+0+MAT_BLK_SZ)); +- __m256d a1_2323 = _mm256_broadcast_pd((const __m128d*)(a+i+2+MAT_BLK_SZ)); +- __m256d avec10 = _mm256_permute_pd(a1_0101, 0); +- __m256d avec11 = _mm256_permute_pd(a1_0101, 0xf); +- __m256d avec12 = _mm256_permute_pd(a1_2323, 0); +- __m256d avec13 = _mm256_permute_pd(a1_2323, 0xf); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } + +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec00, bvec); MUL_ADD(acc10, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec00, bvec); MUL_ADD(acc11, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec00, bvec); MUL_ADD(acc12, avec10, bvec); +- bvec = _mm256_load_pd(&b[(i+0)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec00, bvec); MUL_ADD(acc13, avec10, bvec); + +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec01, bvec); MUL_ADD(acc10, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec01, bvec); MUL_ADD(acc11, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec01, bvec); MUL_ADD(acc12, avec11, bvec); +- bvec = _mm256_load_pd(&b[(i+1)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec01, bvec); MUL_ADD(acc13, avec11, bvec); ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); + +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec02, bvec); MUL_ADD(acc10, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec02, bvec); MUL_ADD(acc11, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec02, bvec); MUL_ADD(acc12, avec12, bvec); +- bvec = _mm256_load_pd(&b[(i+2)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec02, bvec); MUL_ADD(acc13, avec12, bvec); ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 
1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); + +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec03, bvec); MUL_ADD(acc10, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec03, bvec); MUL_ADD(acc11, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec03, bvec); MUL_ADD(acc12, avec13, bvec); +- bvec = _mm256_load_pd(&b[(i+3)*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec03, bvec); MUL_ADD(acc13, avec13, bvec); +- } ++ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); + +- for (; i < n; i++) { +- __m256d avec0 = _mm256_broadcast_sd(&a[i]); +- __m256d avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ // round 1 + +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc00, avec0, bvec); MUL_ADD(acc10, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc01, avec0, bvec); MUL_ADD(acc11, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc02, avec0, bvec); MUL_ADD(acc12, avec1, bvec); +- bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+j*(MAT_BLK_SZ/2)]); MUL_ADD(acc03, avec0, bvec); MUL_ADD(acc13, avec1, bvec); +- } ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); + ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); + +- _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc00); +- _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc01); +- _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc02); +- _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc03); ++ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); + +- _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc10); +- _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc11); +- _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc12); +- _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ + j*(MAT_BLK_SZ/2), acc13); ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); + ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); + } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 
+ 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); ++ + } +-#endif + ++AVX2_FUNC(void,muladd3_by_32) ++(double *x, const double *a, const double *b, long n) ++{ ++ __m256d avec0, avec1, avec2, bvec; ++ __m256d acc00, acc01, acc02, acc03; ++ __m256d acc10, acc11, acc12, acc13; ++ __m256d acc20, acc21, acc22, acc23; ++ ++ ++ // round 0 + ++ acc00=_mm256_load_pd(x + 0*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 1*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 2*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 3*4 + 0*MAT_BLK_SZ); + +-// experiment: process three rows at a time +-// NOTE: this makes things slower on an AVX1 platform --- not enough registers +-// it could be faster on AVX2/FMA, where there should be enough registers ++ acc10=_mm256_load_pd(x + 0*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 1*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 2*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 3*4 + 1*MAT_BLK_SZ); ++ ++ acc20=_mm256_load_pd(x + 0*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 1*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 2*4 + 2*MAT_BLK_SZ); ++ acc23=_mm256_load_pd(x + 3*4 + 2*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 0*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 1*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 2*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 3*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 0*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 1*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 2*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 3*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 0*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 1*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 2*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 3*4 + 2*MAT_BLK_SZ, acc23); ++ ++ // round 1 ++ ++ acc00=_mm256_load_pd(x + 4*4 + 0*MAT_BLK_SZ); ++ acc01=_mm256_load_pd(x + 5*4 + 0*MAT_BLK_SZ); ++ acc02=_mm256_load_pd(x + 6*4 + 0*MAT_BLK_SZ); ++ acc03=_mm256_load_pd(x + 7*4 + 0*MAT_BLK_SZ); ++ ++ acc10=_mm256_load_pd(x + 4*4 + 1*MAT_BLK_SZ); ++ acc11=_mm256_load_pd(x + 5*4 + 1*MAT_BLK_SZ); ++ acc12=_mm256_load_pd(x + 6*4 + 1*MAT_BLK_SZ); ++ acc13=_mm256_load_pd(x + 7*4 + 1*MAT_BLK_SZ); ++ ++ acc20=_mm256_load_pd(x + 4*4 + 2*MAT_BLK_SZ); ++ acc21=_mm256_load_pd(x + 5*4 + 2*MAT_BLK_SZ); ++ acc22=_mm256_load_pd(x + 6*4 + 2*MAT_BLK_SZ); ++ 
acc23=_mm256_load_pd(x + 7*4 + 2*MAT_BLK_SZ); ++ ++ for (long i = 0; i < n; i++) { ++ avec0 = _mm256_broadcast_sd(&a[i]); ++ avec1 = _mm256_broadcast_sd(&a[i+MAT_BLK_SZ]); ++ avec2 = _mm256_broadcast_sd(&a[i+2*MAT_BLK_SZ]); ++ ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+0*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc00, avec0, bvec); FMA_MUL_ADD(acc10, avec1, bvec); FMA_MUL_ADD(acc20, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+1*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc01, avec0, bvec); FMA_MUL_ADD(acc11, avec1, bvec); FMA_MUL_ADD(acc21, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+2*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc02, avec0, bvec); FMA_MUL_ADD(acc12, avec1, bvec); FMA_MUL_ADD(acc22, avec2, bvec); ++ bvec = _mm256_load_pd(&b[i*MAT_BLK_SZ+3*4+MAT_BLK_SZ/2]); FMA_MUL_ADD(acc03, avec0, bvec); FMA_MUL_ADD(acc13, avec1, bvec); FMA_MUL_ADD(acc23, avec2, bvec); ++ } ++ ++ ++ _mm256_store_pd(x + 4*4 + 0*MAT_BLK_SZ, acc00); ++ _mm256_store_pd(x + 5*4 + 0*MAT_BLK_SZ, acc01); ++ _mm256_store_pd(x + 6*4 + 0*MAT_BLK_SZ, acc02); ++ _mm256_store_pd(x + 7*4 + 0*MAT_BLK_SZ, acc03); ++ ++ _mm256_store_pd(x + 4*4 + 1*MAT_BLK_SZ, acc10); ++ _mm256_store_pd(x + 5*4 + 1*MAT_BLK_SZ, acc11); ++ _mm256_store_pd(x + 6*4 + 1*MAT_BLK_SZ, acc12); ++ _mm256_store_pd(x + 7*4 + 1*MAT_BLK_SZ, acc13); ++ ++ _mm256_store_pd(x + 4*4 + 2*MAT_BLK_SZ, acc20); ++ _mm256_store_pd(x + 5*4 + 2*MAT_BLK_SZ, acc21); ++ _mm256_store_pd(x + 6*4 + 2*MAT_BLK_SZ, acc22); ++ _mm256_store_pd(x + 7*4 + 2*MAT_BLK_SZ, acc23); ++ ++} ++ ++AVX2_RESOLVER(static void,muladd3_by_32, ++ (double *x, const double *a, const double *b, long n)); ++ ++#else + + static + void muladd3_by_32(double *x, const double *a, const double *b, long n) +@@ -1066,6 +1520,32 @@ void muladd3_by_32(double *x, const doub + + } + ++#endif ++ ++#ifdef NTL_LOADTIME_CPU ++ ++static inline ++void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) ++{ ++ long i = first; ++ ++ if (have_fma) { ++ // processing three rows at a time is faster ++ for (; i <= last-3; i+=3) ++ muladd3_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ for (; i < last; i++) ++ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ } else { ++ // process only two rows at a time: not enough registers :-( ++ for (; i <= last-2; i+=2) ++ muladd2_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ for (; i < last; i++) ++ muladd1_by_32(x + i*MAT_BLK_SZ, a + i*MAT_BLK_SZ, b, n); ++ } ++} ++ ++#else ++ + static inline + void muladd_all_by_32(long first, long last, double *x, const double *a, const double *b, long n) + { +@@ -1085,8 +1565,79 @@ void muladd_all_by_32(long first, long l + #endif + } + ++#endif ++ + + // this assumes n is a multiple of 16 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ ++ __m256d cvec = _mm256_broadcast_sd(&c); ++ ++ for (long i = 0; i < n; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ AVX_MUL_ADD(xvec1, yvec1, cvec); ++ AVX_MUL_ADD(xvec2, yvec2, cvec); ++ AVX_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ 
_mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++} ++ ++FMA_FUNC(void,muladd_interval) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ ++ __m256d cvec = _mm256_broadcast_sd(&c); ++ ++ for (long i = 0; i < n; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ FMA_MUL_ADD(xvec1, yvec1, cvec); ++ FMA_MUL_ADD(xvec2, yvec2, cvec); ++ FMA_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++} ++ ++FMA_RESOLVER(static void,muladd_interval, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + void muladd_interval(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1117,9 +1668,109 @@ void muladd_interval(double * NTL_RESTRI + _mm256_store_pd(x + 3*4, xvec3); + } + } ++#endif + + // this one is more general: does not assume that n is a + // multiple of 16 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval1) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ __m256d cvec; ++ ++ if (n >= 4) ++ cvec = _mm256_broadcast_sd(&c); ++ ++ long i=0; ++ for (; i <= n-16; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ AVX_MUL_ADD(xvec1, yvec1, cvec); ++ AVX_MUL_ADD(xvec2, yvec2, cvec); ++ AVX_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++ ++ for (; i <= n-4; i += 4, x += 4, y += 4) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ yvec0 = _mm256_load_pd(y+0*4); ++ AVX_MUL_ADD(xvec0, yvec0, cvec); ++ _mm256_store_pd(x + 0*4, xvec0); ++ } ++ ++ for (; i < n; i++, x++, y++) { ++ *x += (*y)*c; ++ } ++} ++ ++FMA_FUNC(void,muladd_interval1) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ ++ __m256d xvec0, xvec1, xvec2, xvec3; ++ __m256d yvec0, yvec1, yvec2, yvec3; ++ __m256d cvec; ++ ++ if (n >= 4) ++ cvec = _mm256_broadcast_sd(&c); ++ ++ long i=0; ++ for (; i <= n-16; i += 16, x += 16, y += 16) { ++ xvec0 = _mm256_load_pd(x+0*4); ++ xvec1 = _mm256_load_pd(x+1*4); ++ xvec2 = _mm256_load_pd(x+2*4); ++ xvec3 = _mm256_load_pd(x+3*4); ++ ++ yvec0 = _mm256_load_pd(y+0*4); ++ yvec1 = _mm256_load_pd(y+1*4); ++ yvec2 = _mm256_load_pd(y+2*4); ++ yvec3 = _mm256_load_pd(y+3*4); ++ ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ FMA_MUL_ADD(xvec1, yvec1, cvec); ++ FMA_MUL_ADD(xvec2, yvec2, cvec); ++ FMA_MUL_ADD(xvec3, yvec3, cvec); ++ ++ _mm256_store_pd(x + 0*4, xvec0); ++ _mm256_store_pd(x + 1*4, xvec1); ++ _mm256_store_pd(x + 2*4, xvec2); ++ _mm256_store_pd(x + 3*4, xvec3); ++ } ++ ++ for (; i <= n-4; i += 4, x += 4, y += 4) { ++ xvec0 = 
_mm256_load_pd(x+0*4); ++ yvec0 = _mm256_load_pd(y+0*4); ++ FMA_MUL_ADD(xvec0, yvec0, cvec); ++ _mm256_store_pd(x + 0*4, xvec0); ++ } ++ ++ for (; i < n; i++, x++, y++) { ++ *x += (*y)*c; ++ } ++} ++ ++FMA_RESOLVER(static void,muladd_interval1, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + void muladd_interval1(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1165,11 +1816,74 @@ void muladd_interval1(double * NTL_RESTR + *x += (*y)*c; + } + } ++#endif + + #define AVX_PD_SZ (4) + + // experimental: assumes n is a multiple of 4 in the range [0..32] +-#if 1 ++#ifdef NTL_LOADTIME_CPU ++ ++AVX_FUNC(void,muladd_interval2) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ n /= 4; ++ if (n <= 0 || n > 8) return; ++ ++ x += n*4; ++ y += n*4; ++ ++ // n in [1..8] ++ ++ __m256d xvec, yvec, cvec; ++ ++ cvec = _mm256_broadcast_sd(&c); ++ ++ switch (n) { ++ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); ++ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); ++ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); ++ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); ++ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); ++ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); ++ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); ++ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); AVX_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); ++ } ++ ++} ++ ++FMA_FUNC(void,muladd_interval2) ++(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) ++{ ++ n /= 4; ++ if (n <= 0 || n > 8) return; ++ ++ x += n*4; ++ y += n*4; ++ ++ // n in [1..8] ++ ++ __m256d xvec, yvec, cvec; ++ ++ cvec = _mm256_broadcast_sd(&c); ++ ++ switch (n) { ++ case 8: xvec = _mm256_load_pd(x-8*4); yvec = _mm256_load_pd(y-8*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-8*4, xvec); ++ case 7: xvec = _mm256_load_pd(x-7*4); yvec = _mm256_load_pd(y-7*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-7*4, xvec); ++ case 6: xvec = _mm256_load_pd(x-6*4); yvec = _mm256_load_pd(y-6*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-6*4, xvec); ++ case 5: xvec = _mm256_load_pd(x-5*4); yvec = _mm256_load_pd(y-5*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-5*4, xvec); ++ case 4: xvec = _mm256_load_pd(x-4*4); yvec = _mm256_load_pd(y-4*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-4*4, xvec); ++ case 3: xvec = _mm256_load_pd(x-3*4); yvec = _mm256_load_pd(y-3*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-3*4, xvec); ++ case 2: xvec = _mm256_load_pd(x-2*4); yvec = _mm256_load_pd(y-2*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-2*4, xvec); ++ case 1: xvec = _mm256_load_pd(x-1*4); yvec = _mm256_load_pd(y-1*4); FMA_MUL_ADD(xvec, yvec, cvec); _mm256_store_pd(x-1*4, xvec); ++ } ++ ++} ++ ++FMA_RESOLVER(static void,muladd_interval2, ++ (double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n)); ++ ++#else ++ + static inline + 
void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) + { +@@ -1197,13 +1911,6 @@ void muladd_interval2(double * NTL_RESTR + } + + } +-#else +-static inline +-void muladd_interval2(double * NTL_RESTRICT x, double * NTL_RESTRICT y, double c, long n) +-{ +- for (long i = 0; i < n; i++) +- x[i] += y[i]*c; +-} + #endif + + #endif +@@ -2245,10 +2952,10 @@ void alt_mul_LL(const mat_window_zz_p& X + } + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_mul_DD(const mat_window_zz_p& X, ++static void __attribute__((target ("avx,pclmul"))) ++blk_mul_DD(const mat_window_zz_p& X, + const const_mat_window_zz_p& A, const const_mat_window_zz_p& B) + { + long n = A.NumRows(); +@@ -2615,8 +3322,9 @@ void mul_base (const mat_window_zz_p& X, + + long V = MAT_BLK_SZ*4; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined (NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -2696,7 +3404,8 @@ void mul_strassen(const mat_window_zz_p& + // this code determines if mul_base triggers blk_mul_DD, + // in which case a higher crossover is used + +-#if (defined(NTL_HAVE_LL_TYPE) && defined(NTL_HAVE_AVX)) ++#if (defined(NTL_HAVE_LL_TYPE) && (defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU))) ++ if (AVX_ACTIVE) + { + long V = MAT_BLK_SZ*4; + long p = zz_p::modulus(); +@@ -3196,10 +3905,10 @@ void alt_inv_L(zz_p& d, mat_zz_p& X, con + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) ++static void __attribute__((target ("avx,pclmul"))) ++alt_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) + { + long n = A.NumRows(); + +@@ -3365,10 +4074,10 @@ void alt_inv_DD(zz_p& d, mat_zz_p& X, co + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) ++static void __attribute__((target ("avx,pclmul"))) ++blk_inv_DD(zz_p& d, mat_zz_p& X, const mat_zz_p& A, bool relax) + { + long n = A.NumRows(); + +@@ -4126,8 +4835,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c + else if (n/MAT_BLK_SZ < 4) { + long V = 64; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -4152,8 +4862,9 @@ void relaxed_inv(zz_p& d, mat_zz_p& X, c + else { + long V = 4*MAT_BLK_SZ; + +-#ifdef NTL_HAVE_AVX +- if (p-1 <= MAX_DBL_INT && ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) ++ if (AVX_ACTIVE && ++ p-1 <= MAX_DBL_INT && + V <= (MAX_DBL_INT-(p-1))/(p-1) && + V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) { + +@@ -4559,10 +5270,10 @@ void alt_tri_L(zz_p& d, const mat_zz_p& + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, ++static void __attribute__((target ("avx,pclmul"))) ++alt_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp, + vec_zz_p *xp, bool trans, bool relax) + { + long n = A.NumRows(); +@@ -4749,10 +5460,10 @@ void alt_tri_DD(zz_p& d, const mat_zz_p& + + + +-#ifdef NTL_HAVE_AVX ++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU) + +-static +-void blk_tri_DD(zz_p& d, const mat_zz_p& A, const 
vec_zz_p *bp,
++static void __attribute__((target ("avx,pclmul")))
++blk_tri_DD(zz_p& d, const mat_zz_p& A, const vec_zz_p *bp,
+ vec_zz_p *xp, bool trans, bool relax)
+ {
+ long n = A.NumRows();
+@@ -5563,8 +6274,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+ else if (n/MAT_BLK_SZ < 4) {
+ long V = 64;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+@@ -5589,8 +6301,9 @@ void tri(zz_p& d, const mat_zz_p& A, con
+ else {
+ long V = 4*MAT_BLK_SZ;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+@@ -5836,7 +6549,7 @@ long elim_basic(const mat_zz_p& A, mat_z
+ #ifdef NTL_HAVE_LL_TYPE
+
+
+-#ifdef NTL_HAVE_AVX
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
+
+
+ static inline
+@@ -7289,8 +8002,9 @@ long elim(const mat_zz_p& A, mat_zz_p *i
+ else {
+ long V = 4*MAT_BLK_SZ;
+
+-#ifdef NTL_HAVE_AVX
+- if (p-1 <= MAX_DBL_INT &&
++#if defined(NTL_HAVE_AVX) || defined(NTL_LOADTIME_CPU)
++ if (AVX_ACTIVE &&
++ p-1 <= MAX_DBL_INT &&
+ V <= (MAX_DBL_INT-(p-1))/(p-1) &&
+ V*(p-1) <= (MAX_DBL_INT-(p-1))/(p-1)) {
+
+--- src/QuickTest.c.orig	2016-04-26 05:40:15.000000000 -0600
++++ src/QuickTest.c	2016-04-26 18:22:38.938718859 -0600
+@@ -335,6 +335,9 @@ cerr << "Performance Options:\n";
+ cerr << "NTL_PCLMUL\n";
+ #endif
+
++#ifdef NTL_LOADTIME_CPU
++ cerr << "NTL_LOADTIME_CPU\n";
++#endif
+
+ cerr << "\n\n";
+
+--- src/WizardAux.orig	2016-04-26 05:40:15.000000000 -0600
++++ src/WizardAux	2016-04-26 18:22:38.938718859 -0600
+@@ -94,6 +94,7 @@ system("make InitSettings");
+ 'NTL_PCLMUL' => 0,
+ 'NTL_FFT_BIGTAB' => 0,
+ 'NTL_FFT_LAZYMUL' => 0,
++'NTL_LOADTIME_CPU' => 0,
+
+ 'WIZARD_HACK' => '#define NTL_WIZARD_HACK',
+
diff --git a/ntl.spec b/ntl.spec
index 2a8f5fc..89f1902 100644
--- a/ntl.spec
+++ b/ntl.spec
@@ -10,15 +10,18 @@ Summary: High-performance algorithms for vectors, matrices, and polynomials
 Name: ntl
-Version: 9.7.0
+Version: 9.8.0
 Release: 1%{?dist}
 License: GPLv2+
 URL: http://shoup.net/ntl/
-Group: System Environment/Libraries
 Source0: http://shoup.net/ntl/%{name}-%{version}.tar.gz
 Source1: multilib_template.h
+# Detect CPU at load time, optionally use PCLMUL, AVX, FMA, and AVX2 features.
+# This patch was sent upstream, but upstream prefers that the entire library
+# be built for a specific CPU, which we cannot do in Fedora.
+Patch0: %{name}-loadtime-cpu.patch
 BuildRequires: gcc-c++
 %if 0%{?gf2x}
@@ -47,14 +50,12 @@ NTL provides high quality implementations of state-of-the-art algorithms for:
 %package devel
 Summary: Development files for %{name}
-Group: Development/Libraries
 Requires: %{name}%{?_isa} = %{version}-%{release}
 %description devel
 %{summary}.
 %package static
 Summary: Static libraries for %{name}
-Group: Development/Libraries
 Requires: %{name}-devel%{?_isa} = %{version}-%{release}
 #Requires: gmp-devel
 %description static
@@ -62,7 +63,8 @@ Requires: %{name}-devel%{?_isa} = %{version}-%{release}
 %prep
-%setup -q
+%setup -q
+%patch0
 %build
@@ -78,6 +80,11 @@ pushd src
 NATIVE=off \
 %{?gf2x:NTL_GF2X_LIB=on} \
 NTL_PCLMUL=off \
+ NTL_DISABLE_TLS_HACK=on \
+%ifarch x86_64
+ NTL_LOADTIME_CPU=on \
+ WIZARD=off \
+%endif
 SHARED=on
 popd
@@ -86,8 +93,10 @@ make -C src V=1
 %check
-# skip by default, takes a *long, long, long* (days?) time -- Rex
-%{?_with_check:make -C src check}
+# skip on non-x86_64, takes a *long, long, long* (days?) time -- Rex
+%ifarch x86_64
+make -C src check
+%endif
 %install
@@ -127,7 +136,7 @@ done
 %files
 %doc README
 %license doc/copying.txt
-%{_libdir}/libntl.so.22*
+%{_libdir}/libntl.so.24*
 %files devel
 %doc doc/*
@@ -141,6 +150,11 @@ done
 %changelog
+* Fri Apr 29 2016 Jerry James - 9.8.0-1
+- ntl-9.8.0
+- Add -loadtime-cpu patch
+- Enable the check script on x86_64
+
 * Sat Mar 19 2016 Jerry James - 9.7.0-1
 - ntl-9.7.0
diff --git a/sources b/sources
index c422b75..d04a3a8 100644
--- a/sources
+++ b/sources
@@ -1 +1 @@
-02f6076903f482d4bf15f62fb7ee260f ntl-9.7.0.tar.gz
+a7e87d859511c15023169fa0fcf9903b ntl-9.8.0.tar.gz
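
The AVX2_RESOLVER and FMA_RESOLVER declarations above all expand to the same GCC ifunc idiom, which is easy to lose inside the macros. The stand-alone sketch below shows that idiom in miniature. It is illustrative only and not NTL code: the names hello, hello_avx, and resolve_hello are hypothetical, and it assumes x86_64 with GCC 4.8 or later and an ifunc-capable toolchain. The CPUID test is the same feature-bit check the patch's resolvers perform.

// Illustrative sketch only; hypothetical names, not part of the patch.
#include <cpuid.h>
#include <cstdio>

#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif

static int have_avx = -1;   // -1 = not probed yet

static void hello_base(void) { std::puts("generic implementation"); }
static void __attribute__((target ("avx"))) hello_avx(void)
{ std::puts("AVX implementation"); }

extern "C" {
   // The resolver runs once, when the dynamic linker binds the ifunc symbol.
   static void (*resolve_hello(void))(void)
   {
      if (have_avx < 0) {
         unsigned int eax, ebx, ecx, edx;
         have_avx = __get_cpuid(1, &eax, &ebx, &ecx, &edx) && (ecx & bit_AVX);
      }
      return have_avx ? &hello_avx : &hello_base;
   }
}

// Callers see a single symbol; the CPU-specific body is chosen at load time.
void __attribute__((ifunc ("resolve_hello"))) hello(void);

int main() { hello(); return 0; }

The real resolvers extend this two-way choice to the PCLMUL, AVX, FMA, and AVX2 ladder and cache the CPUID bits in have_fma and its sibling flags, so the probe is performed only once even though many symbols are resolved this way.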