From: Michael R. Crusoe <michael.crusoe@gmail.com>
Subject: Use the SIMD Everywhere (SIMDe) header-only library

Always use the "AVX2" code path, as SIMDe will automatically provide the
non-AVX2 equivalents.

--- a/src/mcf_simd.hh
+++ b/src/mcf_simd.hh
@@ -4,18 +4,13 @@
 #ifndef MCF_SIMD_HH
 #define MCF_SIMD_HH
 
-#if defined __SSE4_1__
-#include <immintrin.h>
-#elif defined __ARM_NEON
-#include <arm_neon.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include "simde/x86/avx2.h"
 
 #include <stddef.h>  // size_t
 
 namespace mcf {
 
-#if defined __AVX2__
-
 typedef __m256i SimdInt;
 typedef __m256i SimdUint1;
 typedef __m256d SimdDbl;
@@ -181,358 +176,6 @@
 static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
   return _mm256_shuffle_epi8(items, choices);
 }
-
-#elif defined __SSE4_1__
-
-typedef __m128i SimdInt;
-typedef __m128i SimdUint1;
-typedef __m128d SimdDbl;
-
-const int simdBytes = 16;
-
-static inline SimdInt simdZero() {
-  return _mm_setzero_si128();
-}
-
-static inline SimdInt simdZero1() {
-  return _mm_setzero_si128();
-}
-
-static inline SimdDbl simdZeroDbl() {
-  return _mm_setzero_pd();
-}
-
-static inline SimdInt simdOnes1() {
-  return _mm_set1_epi32(-1);
-}
-
-static inline SimdInt simdLoad(const void *p) {
-  return _mm_loadu_si128((const SimdInt *)p);
-}
-
-static inline SimdInt simdLoad1(const void *p) {
-  return _mm_loadu_si128((const SimdInt *)p);
-}
-
-static inline SimdDbl simdLoadDbl(const double *p) {
-  return _mm_loadu_pd(p);
-}
-
-static inline void simdStore(void *p, SimdInt x) {
-  _mm_storeu_si128((SimdInt *)p, x);
-}
-
-static inline void simdStore1(void *p, SimdInt x) {
-  _mm_storeu_si128((SimdInt *)p, x);
-}
-
-static inline void simdStoreDbl(double *p, SimdDbl x) {
-  _mm_storeu_pd(p, x);
-}
-
-static inline SimdInt simdOr1(SimdInt x, SimdInt y) {
-  return _mm_or_si128(x, y);
-}
-
-static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdInt mask) {
-  return _mm_blendv_epi8(x, y, mask);  // SSE4.1
-}
-
-const int simdLen = 4;
-const int simdDblLen = 2;
-
-static inline SimdInt simdSet(int i3, int i2, int i1, int i0) {
-  return _mm_set_epi32(i3, i2, i1, i0);
-}
-
-static inline SimdInt simdSet1(char iF, char iE, char iD, char iC,
-			       char iB, char iA, char i9, char i8,
-			       char i7, char i6, char i5, char i4,
-			       char i3, char i2, char i1, char i0) {
-  return _mm_set_epi8(iF, iE, iD, iC, iB, iA, i9, i8,
-		      i7, i6, i5, i4, i3, i2, i1, i0);
-}
-
-static inline SimdDbl simdSetDbl(double i1, double i0) {
-  return _mm_set_pd(i1, i0);
-}
-
-static inline SimdInt simdFill(int x) {
-  return _mm_set1_epi32(x);
-}
-
-static inline SimdInt simdFill1(char x) {
-  return _mm_set1_epi8(x);
-}
-
-static inline SimdDbl simdFillDbl(double x) {
-  return _mm_set1_pd(x);
-}
-
-static inline SimdInt simdGt(SimdInt x, SimdInt y) {
-  return _mm_cmpgt_epi32(x, y);
-}
-
-static inline SimdInt simdGe1(SimdInt x, SimdInt y) {
-  return _mm_cmpeq_epi8(_mm_min_epu8(x, y), y);
-}
-
-static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
-  return _mm_add_epi32(x, y);
-}
-
-static inline SimdInt simdAdd1(SimdInt x, SimdInt y) {
-  return _mm_add_epi8(x, y);
-}
-
-static inline SimdInt simdAdds1(SimdInt x, SimdInt y) {
-  return _mm_adds_epu8(x, y);
-}
-
-static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) {
-  return _mm_add_pd(x, y);
-}
-
-static inline SimdInt simdSub(SimdInt x, SimdInt y) {
-  return _mm_sub_epi32(x, y);
-}
-
-static inline SimdInt simdSub1(SimdInt x, SimdInt y) {
-  return _mm_sub_epi8(x, y);
-}
-
-static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) {
-  return _mm_mul_pd(x, y);
-}
-
-static inline SimdInt simdQuadruple1(SimdInt x) {
-  return _mm_slli_epi32(x, 2);
-}
-
-static inline SimdInt simdMax(SimdInt x, SimdInt y) {
-  return _mm_max_epi32(x, y);  // SSE4.1
-}
-
-static inline SimdInt simdMin1(SimdInt x, SimdInt y) {
-  return _mm_min_epu8(x, y);
-}
-
-static inline int simdHorizontalMax(SimdInt x) {
-  x = simdMax(x, _mm_shuffle_epi32(x, 0x4E));
-  x = simdMax(x, _mm_shuffle_epi32(x, 0xB1));
-  return _mm_cvtsi128_si32(x);
-}
-
-static inline int simdHorizontalMin1(SimdInt x) {
-  x = _mm_min_epu8(x, _mm_srli_epi16(x, 8));
-  x = _mm_minpos_epu16(x);  // SSE4.1
-  return _mm_extract_epi16(x, 0);
-}
-
-static inline double simdHorizontalAddDbl(SimdDbl x) {
-  return _mm_cvtsd_f64(_mm_hadd_pd(x, x));
-}
-
-static inline SimdInt simdChoose1(SimdInt items, SimdInt choices) {
-  return _mm_shuffle_epi8(items, choices);  // SSSE3
-}
-
-#elif defined __ARM_NEON
-
-typedef int32x4_t SimdInt;
-typedef uint32x4_t SimdUint;
-typedef uint8x16_t SimdUint1;
-typedef float64x2_t SimdDbl;
-
-const int simdBytes = 16;
-
-static inline SimdInt simdZero() {
-  return vdupq_n_s32(0);
-}
-
-static inline SimdUint1 simdZero1() {
-  return vdupq_n_u8(0);
-}
-
-static inline SimdDbl simdZeroDbl() {
-  return vdupq_n_f64(0);
-}
-
-static inline SimdUint1 simdOnes1() {
-  return vdupq_n_u8(-1);
-}
-
-static inline SimdInt simdLoad(const int *p) {
-  return vld1q_s32(p);
-}
-
-static inline SimdUint1 simdLoad1(const unsigned char *p) {
-  return vld1q_u8(p);
-}
-
-static inline SimdDbl simdLoadDbl(const double *p) {
-  return vld1q_f64(p);
-}
-
-static inline void simdStore(int *p, SimdInt x) {
-  vst1q_s32(p, x);
-}
-
-static inline void simdStore1(unsigned char *p, SimdUint1 x) {
-  vst1q_u8(p, x);
-}
-
-static inline void simdStoreDbl(double *p, SimdDbl x) {
-  vst1q_f64(p, x);
-}
-
-static inline SimdUint1 simdOr1(SimdUint1 x, SimdUint1 y) {
-  return vorrq_u8(x, y);
-}
-
-static inline SimdInt simdBlend(SimdInt x, SimdInt y, SimdUint mask) {
-  return vbslq_s32(mask, y, x);
-}
-
-const int simdLen = 4;
-const int simdDblLen = 2;
-
-static inline SimdInt simdSet(unsigned i3, unsigned i2,
-                              unsigned i1, unsigned i0) {
-  size_t lo = i1;
-  size_t hi = i3;
-  return
-    vcombine_s32(vcreate_s32((lo << 32) | i0), vcreate_s32((hi << 32) | i2));
-}
-
-static inline SimdUint1 simdSet1(unsigned char iF, unsigned char iE,
-				 unsigned char iD, unsigned char iC,
-				 unsigned char iB, unsigned char iA,
-				 unsigned char i9, unsigned char i8,
-				 unsigned char i7, unsigned char i6,
-				 unsigned char i5, unsigned char i4,
-				 unsigned char i3, unsigned char i2,
-				 unsigned char i1, unsigned char i0) {
-  size_t lo =
-    (size_t)i0       | (size_t)i1 <<  8 | (size_t)i2 << 16 | (size_t)i3 << 24 |
-    (size_t)i4 << 32 | (size_t)i5 << 40 | (size_t)i6 << 48 | (size_t)i7 << 56;
-
-  size_t hi =
-    (size_t)i8       | (size_t)i9 <<  8 | (size_t)iA << 16 | (size_t)iB << 24 |
-    (size_t)iC << 32 | (size_t)iD << 40 | (size_t)iE << 48 | (size_t)iF << 56;
-
-  return vcombine_u8(vcreate_u8(lo), vcreate_u8(hi));
-}
-
-static inline SimdDbl simdSetDbl(double i1, double i0) {
-  return vcombine_f64(vdup_n_f64(i0), vdup_n_f64(i1));
-}
-
-static inline SimdInt simdFill(int x) {
-  return vdupq_n_s32(x);
-}
-
-static inline SimdUint1 simdFill1(unsigned char x) {
-  return vdupq_n_u8(x);
-}
-
-static inline SimdDbl simdFillDbl(double x) {
-  return vdupq_n_f64(x);
-}
-
-static inline SimdUint simdGt(SimdInt x, SimdInt y) {
-  return vcgtq_s32(x, y);
-}
-
-static inline SimdUint1 simdGe1(SimdUint1 x, SimdUint1 y) {
-  return vcgeq_u8(x, y);
-}
-
-static inline SimdInt simdAdd(SimdInt x, SimdInt y) {
-  return vaddq_s32(x, y);
-}
-
-static inline SimdUint1 simdAdd1(SimdUint1 x, SimdUint1 y) {
-  return vaddq_u8(x, y);
-}
-
-static inline SimdUint1 simdAdds1(SimdUint1 x, SimdUint1 y) {
-  return vqaddq_u8(x, y);
-}
-
-static inline SimdDbl simdAddDbl(SimdDbl x, SimdDbl y) {
-  return vaddq_f64(x, y);
-}
-
-static inline SimdInt simdSub(SimdInt x, SimdInt y) {
-  return vsubq_s32(x, y);
-}
-
-static inline SimdUint1 simdSub1(SimdUint1 x, SimdUint1 y) {
-  return vsubq_u8(x, y);
-}
-
-static inline SimdDbl simdMulDbl(SimdDbl x, SimdDbl y) {
-  return vmulq_f64(x, y);
-}
-
-static inline SimdUint1 simdQuadruple1(SimdUint1 x) {
-  return vshlq_n_u8(x, 2);
-}
-
-static inline SimdInt simdMax(SimdInt x, SimdInt y) {
-  return vmaxq_s32(x, y);
-}
-
-static inline SimdUint1 simdMin1(SimdUint1 x, SimdUint1 y) {
-  return vminq_u8(x, y);
-}
-
-static inline int simdHorizontalMax(SimdInt x) {
-  return vmaxvq_s32(x);
-}
-
-static inline int simdHorizontalMin1(SimdUint1 x) {
-  return vminvq_u8(x);
-}
-
-static inline double simdHorizontalAddDbl(SimdDbl x) {
-  return vaddvq_f64(x);
-}
-
-static inline SimdUint1 simdChoose1(SimdUint1 items, SimdUint1 choices) {
-  return vqtbl1q_u8(items, choices);
-}
-
-#else
-
-typedef int SimdInt;
-typedef double SimdDbl;
-const int simdBytes = 1;
-const int simdLen = 1;
-const int simdDblLen = 1;
-static inline int simdZero() { return 0; }
-static inline double simdZeroDbl() { return 0; }
-static inline int simdSet(int x) { return x; }
-static inline double simdSetDbl(double x) { return x; }
-static inline int simdFill(int x) { return x; }
-static inline int simdLoad(const int *p) { return *p; }
-static inline double simdLoadDbl(const double *p) { return *p; }
-static inline void simdStore(int *p, int x) { *p = x; }
-static inline void simdStoreDbl(double *p, double x) { *p = x; }
-static inline double simdFillDbl(double x) { return x; }
-static inline int simdGt(int x, int y) { return x > y; }
-static inline int simdAdd(int x, int y) { return x + y; }
-static inline double simdAddDbl(double x, double y) { return x + y; }
-static inline int simdSub(int x, int y) { return x - y; }
-static inline double simdMulDbl(double x, double y) { return x * y; }
-static inline int simdMax(int x, int y) { return x > y ? x : y; }
-static inline int simdBlend(int x, int y, int mask) { return mask ? y : x; }
-static inline int simdHorizontalMax(int a) { return a; }
-static inline double simdHorizontalAddDbl(double x) { return x; }
-
-#endif
-
 }
 
 #endif
--- a/src/GappedXdropAligner.cc
+++ b/src/GappedXdropAligner.cc
@@ -140,17 +140,13 @@
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			    s1[7][s2[7]],
 			    s1[6][s2[6]],
 			    s1[5][s2[5]],
 			    s1[4][s2[4]],
-#endif
 			    s1[3][s2[3]],
 			    s1[2][s2[2]],
 			    s1[1][s2[1]],
-#endif
 			    s1[0][s2[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/src/GappedXdropAlignerPssm.cc
+++ b/src/GappedXdropAlignerPssm.cc
@@ -91,17 +91,13 @@
     if (isAffine) {
       for (int i = 0; i < numCells; i += simdLen) {
 	SimdInt s = simdSet(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			    s2[-7][s1[7]],
 			    s2[-6][s1[6]],
 			    s2[-5][s1[5]],
 			    s2[-4][s1[4]],
-#endif
 			    s2[-3][s1[3]],
 			    s2[-2][s1[2]],
 			    s2[-1][s1[1]],
-#endif
 			    s2[-0][s1[0]]);
 	SimdInt x = simdLoad(x2+i);
 	SimdInt y = simdSub(simdLoad(y1+i), mDelGrowCost);
--- a/makefile
+++ b/makefile
@@ -1,4 +1,4 @@
-CXXFLAGS = -msse4 -O3 -std=c++11 -pthread
+CXXFLAGS = -O3 -std=c++11 -pthread
 all:
 	@cd src && $(MAKE) CXXFLAGS="$(CXXFLAGS)"
 
--- a/src/makefile
+++ b/src/makefile
@@ -1,5 +1,4 @@
 CXXFLAGS = -O3 -Wall -Wextra -g
-CXXFLAGS += -msse4
 CXXFLAGS += -std=c++11
 CXXFLAGS += -pthread -DHAS_CXX_THREAD
 # -fomit-frame-pointer ?
@@ -52,25 +51,26 @@
 PPOBJ = last-pair-probs.o last-pair-probs-main.o
 
 MBOBJ = last-merge-batches.o
+SFX :=
 
-ALL = ../bin/lastdb ../bin/lastal ../bin/last-split	\
-../bin/last-merge-batches ../bin/last-pair-probs
+ALL = ../bin/lastdb$(SFX) ../bin/lastal$(SFX) ../bin/last-split$(SFX)	\
+../bin/last-merge-batches$(SFX) ../bin/last-pair-probs$(SFX)
 
 all: $(ALL)
 
-../bin/lastdb: $(indexObj)
+../bin/lastdb$(SFX): $(indexObj)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(indexObj) -lz
 
-../bin/lastal: $(alignObj)
+../bin/lastal$(SFX): $(alignObj)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(alignObj) -lz
 
-../bin/last-split: $(splitObj)
+../bin/last-split$(SFX): $(splitObj)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(splitObj)
 
-../bin/last-pair-probs: $(PPOBJ)
+../bin/last-pair-probs$(SFX): $(PPOBJ)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDFLAGS) -o $@ $(PPOBJ) -lz
 
-../bin/last-merge-batches: $(MBOBJ)
+../bin/last-merge-batches$(SFX): $(MBOBJ)
 	$(CC) $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) -o $@ $(MBOBJ)
 
 .SUFFIXES:
--- a/src/GappedXdropAlignerDna.cc
+++ b/src/GappedXdropAlignerDna.cc
@@ -4,8 +4,6 @@
 #include "GappedXdropAligner.hh"
 #include "GappedXdropAlignerInl.hh"
 
-#if defined __SSE4_1__ || defined __ARM_NEON
-
 //#include <iostream>  // for debugging
 
 namespace cbrc {
@@ -43,12 +41,10 @@
 
   const SimdUint1 scorer4x4 =
     simdSet1(
-#ifdef __AVX2__
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
 		 scorer[0][3], scorer[0][2], scorer[0][1], scorer[0][0],
-#endif
 		 scorer[3][3], scorer[3][2], scorer[3][1], scorer[3][0],
 		 scorer[2][3], scorer[2][2], scorer[2][1], scorer[2][0],
 		 scorer[1][3], scorer[1][2], scorer[1][1], scorer[1][0],
@@ -126,7 +122,6 @@
 
       for (int i = 0; i < numCells; i += simdBytes) {
 	SimdUint1 s = simdSet1(
-#ifdef __AVX2__
 			     scorer[s1[31]][s2[31]],
 			     scorer[s1[30]][s2[30]],
 			     scorer[s1[29]][s2[29]],
@@ -143,7 +138,6 @@
 			     scorer[s1[18]][s2[18]],
 			     scorer[s1[17]][s2[17]],
 			     scorer[s1[16]][s2[16]],
-#endif
 			     scorer[s1[15]][s2[15]],
 			     scorer[s1[14]][s2[14]],
 			     scorer[s1[13]][s2[13]],
@@ -275,5 +269,3 @@
 }
 
 }
-
-#endif
--- a/src/Alignment.cc
+++ b/src/Alignment.cc
@@ -358,12 +358,10 @@
 				  del.openCost, del.growCost,
 				  ins.openCost, ins.growCost,
 				  gap.pairCost, gap.isAffine, maxDrop, smMax)
-#if defined __SSE4_1__ || defined __ARM_NEON
     : isSimdMatrix ? aligner.alignDna(seq1 + start1, s2, isForward, sm,
 				      del.openCost, del.growCost,
 				      ins.openCost, ins.growCost,
 				      maxDrop, smMax, alph.numbersToUppercase)
-#endif
     :           aligner.align(seq1 + start1, s2, isForward, globality, sm,
 			      del.openCost, del.growCost,
 			      ins.openCost, ins.growCost,
@@ -380,14 +378,12 @@
       while( greedyAligner.getNextChunk( end1, end2, size ) )
 	blocks.push_back( SegmentPair( end1 - size, end2 - size, size ) );
     }
-#if defined __SSE4_1__ || defined __ARM_NEON
     else if (isSimdMatrix && !pssm2 && !sm2qual) {
       while (aligner.getNextChunkDna(end1, end2, size,
 				     del.openCost, del.growCost,
 				     ins.openCost, ins.growCost))
 	blocks.push_back(SegmentPair(end1 - size, end2 - size, size));
     }
-#endif
     else {
       while( aligner.getNextChunk( end1, end2, size,
 				   del.openCost, del.growCost,
--- a/src/GappedXdropAligner.hh
+++ b/src/GappedXdropAligner.hh
@@ -353,7 +353,6 @@
   void initFrame();
 
   // Everything below here is for alignDna & getNextChunkDna
-#if defined __SSE4_1__ || defined __ARM_NEON
   std::vector<TinyScore> xTinyScores;
   std::vector<TinyScore> yTinyScores;
   std::vector<TinyScore> zTinyScores;
@@ -403,7 +402,6 @@
     while (*x2 != target) ++x2;
     bestSeq1position = x2 - x2beg + seq1beg;
   }
-#endif
 };
 
 }
--- a/src/tantan.cc
+++ b/src/tantan.cc
@@ -325,13 +325,9 @@
     int i = 0;
     for (; i <= maxOffset - simdDblLen; i += simdDblLen) {
       SimdDbl rV = simdSetDbl(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			      lrRow[sp[-i-4]],
 			      lrRow[sp[-i-3]],
-#endif
 			      lrRow[sp[-i-2]],
-#endif
 			      lrRow[sp[-i-1]]);
       SimdDbl fV = simdLoadDbl(fp+i);
       sV = simdAddDbl(sV, fV);
@@ -369,13 +365,9 @@
     int i = 0;
     for (; i <= maxOffset - simdDblLen; i += simdDblLen) {
       SimdDbl rV = simdSetDbl(
-#if defined __SSE4_1__ || defined __ARM_NEON
-#ifdef __AVX2__
 			      lrRow[sp[-i-4]],
 			      lrRow[sp[-i-3]],
-#endif
 			      lrRow[sp[-i-2]],
-#endif
 			      lrRow[sp[-i-1]]);
       SimdDbl fV = simdMulDbl(simdLoadDbl(fp+i), rV);
       sV = simdAddDbl(sV, simdMulDbl(simdLoadDbl(b2f+i), fV));
