author     Tim Dettmers <tim.dettmers@gmail.com>    2021-10-05 19:16:20 -0700
committer  Tim Dettmers <tim.dettmers@gmail.com>    2021-10-05 19:16:20 -0700
commit     7439924891496025edf60c9da6a782f362a50c70
tree       90476984d2c267f89232577a2ea40eb172387475   /include
Initial commit
Diffstat (limited to 'include')
-rw-r--r--   include/AAlloc.h                86
-rw-r--r--   include/Algo-Direct-Common.h   341
-rw-r--r--   include/Algo-Direct2.h         305
-rw-r--r--   include/AlgoXCodes.h            23
-rw-r--r--   include/BinAlgo.h               77
-rw-r--r--   include/BinSearch.h             11
-rw-r--r--   include/Portable.h             151
-rw-r--r--   include/SIMD.h                 562
-rw-r--r--   include/Type.h                 221
9 files changed, 1777 insertions, 0 deletions
diff --git a/include/AAlloc.h b/include/AAlloc.h
new file mode 100644
index 0000000..6c2ae41
--- /dev/null
+++ b/include/AAlloc.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include "Portable.h"
+
+namespace BinSearch {
+namespace Details {
+
+template <typename T>
+bool isAligned(const T *p, size_t A)
+{
+ return (reinterpret_cast<size_t>(p) % A) == 0;
+}
+
+template <class T, size_t A=64>
+struct AlignedVec
+{
+ AlignedVec()
+ : m_storage(0)
+ , m_data(0)
+ , m_sz(0)
+ {
+ }
+
+ static size_t nBytes(size_t sz)
+ {
+ return sz * sizeof(T) + A;
+ }
+
+ static size_t shiftAmt(char *p)
+ {
+ return A>1? (A - (reinterpret_cast<size_t>(p) % A)) % A: 0;
+ }
+
+ void setPtr(char *p, size_t sz)
+ {
+ m_sz = sz;
+ m_data = reinterpret_cast<T *>(p + shiftAmt(p));
+ }
+
+ //void setPtr(T *p, size_t sz)
+ //{
+ // m_sz = sz;
+ // if (A>1)
+ // myassert(((reinterpret_cast<size_t>(p) % A) == 0), "bad alignment");
+ // m_data = p;
+ //}
+
+ // internal allocation
+ void resize(size_t sz)
+ {
+ m_storage = new char[nBytes(sz)];
+ setPtr(m_storage, sz);
+ }
+
+ // external allocation
+ void set(char *storage, size_t sz)
+ {
+ setPtr(storage, sz);
+ }
+
+ ~AlignedVec()
+ {
+ if (m_storage)
+ delete [] m_storage;
+ }
+
+ size_t size() const { return m_sz; }
+ T& operator[](size_t i) { return m_data[i]; }
+ const T& operator[](size_t i) const { return m_data[i]; }
+ T* begin() { return m_data; }
+ T* end() { return m_data+m_sz; }
+ const T* begin() const { return m_data; }
+ const T* end() const { return m_data+m_sz; }
+ T& front() { return m_data[0]; }
+ T& back() { return m_data[m_sz-1]; }
+ const T& front() const { return m_data[0]; }
+ const T& back() const { return m_data[m_sz - 1]; }
+
+private:
+ char *m_storage;
+ T *m_data;
+ size_t m_sz;
+};
+
+} // namespace Details
+} // namespace BinSearch
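
A minimal usage sketch of AlignedVec (not part of the commit; values are hypothetical): resize() allocates internally and shifts the data pointer to the requested alignment A, while set() wraps caller-provided storage.

#include "AAlloc.h"
#include <cstddef>

void aligned_vec_sketch()
{
    using namespace BinSearch::Details;

    AlignedVec<float> v;           // default alignment A = 64
    v.resize(1000);                // internal allocation; v.begin() is 64-byte aligned
    for (size_t i = 0; i < v.size(); ++i)
        v[i] = static_cast<float>(i);
    // isAligned(v.begin(), 64) is expected to hold here
}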
diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h
new file mode 100644
index 0000000..cf5f0c9
--- /dev/null
+++ b/include/Algo-Direct-Common.h
@@ -0,0 +1,341 @@
+#pragma once
+
+#include <algorithm>
+#include <limits>
+#include <type_traits>
+#include "AAlloc.h"
+
+namespace BinSearch {
+namespace Details {
+
+namespace DirectAux {
+
+#define SAFETY_MULTI_PASS true
+
+template <typename T>
+struct HResults
+{
+ HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {}
+ T H;
+ double hRatio;
+ size_t nInc;
+};
+
+
+#ifdef USE_FMA
+template <Algos A> struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); };
+template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); };
+template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); };
+#else
+template <Algos A> struct IsDirect { static const bool value = (A == Direct); };
+template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2); };
+template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache); };
+#endif
+
+// general definition
+template <Algos A, typename T, typename Enable = void>
+struct BucketElem
+{
+ FORCE_INLINE void set( uint32 b, const T *)
+ {
+ m_b = b;
+ }
+
+ FORCE_INLINE uint32 index() const { return m_b; }
+
+private:
+ uint32 m_b;
+};
+
+// specialization for DirectCache methods
+
+template <typename T> struct MatchingIntType;
+template <> struct MatchingIntType<double> { typedef uint64 type; };
+template <> struct MatchingIntType<float> { typedef uint32 type; };
+
+template <Algos A, typename T>
+struct BucketElem<A, T, typename std::enable_if< IsDirectCache<A>::value >::type >
+{
+ typedef typename MatchingIntType<T>::type I;
+
+ void set(uint32 b, const T *xi)
+ {
+ u.u.x = xi[b];
+ u.u.b = b;
+ }
+
+ FORCE_INLINE I index() const { return u.u.b; }
+ FORCE_INLINE T x() const { return u.u.x; }
+
+private:
+ union {
+ double dummy;
+ struct
+ {
+ T x;
+ I b;
+ } u;
+ } u;
+};
+
+
+template <bool UseFMA, unsigned char Gap, typename T>
+struct DirectTraits
+{
+ static void checkH(T scaler, T x0, T xN)
+ {
+ T Dn = xN - x0;
+ T ifmax = Dn * scaler;
+ myassert((ifmax < std::numeric_limits<uint32>::max() - (Gap - 1)),
+ "Problem unfeasible: index size exceeds uint32 capacity:"
+ << " D[N] =" << Dn
+ << ", H =" << scaler
+ << ", H D[n] =" << ifmax << "\n"
+ );
+ }
+
+ FORCE_INLINE static uint32 f(T scaler, T x0, T z)
+ {
+ T tmp = scaler * (z - x0);
+#ifdef USE_SSE2
+ return ftoi(FVec1<SSE,T>(tmp));
+#else
+ return static_cast<uint32>(tmp);
+#endif
+ }
+
+ template <InstrSet I>
+ FORCE_INLINE static typename FTOITraits<I, T>::vec_t f(const FVec<I, T>& scaler, const FVec<I, T>& x0, const FVec<I, T>& z)
+ {
+ return ftoi(scaler*(z-x0));
+ }
+
+ static T cst0(T scaler, T x0)
+ {
+ return x0;
+ }
+};
+
+#ifdef USE_FMA
+template <unsigned char Gap, typename T>
+struct DirectTraits<true,Gap,T>
+{
+ typedef FVec1<SSE, T> fVec1;
+
+ static void checkH(T scaler, T H_Times_x0, T xN)
+ {
+ union {
+ typename FVec1<SSE, T>::vec_t v;
+ T s;
+ } ifmax;
+ ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0));
+ myassert((ifmax.s < std::numeric_limits<uint32>::max() - (Gap - 1)),
+ "Problem unfeasible: index size exceeds uint32 capacity:"
+ << " H X[0] =" << H_Times_x0
+ << ", H =" << scaler
+ << ", X[N] =" << xN
+ << ", H X[N] - H X[0] =" << ifmax.s << "\n"
+ );
+ }
+
+ FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi)
+ {
+ return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0)));
+ }
+
+ template <InstrSet I>
+ FORCE_INLINE static typename FTOITraits<I,T>::vec_t f(const FVec<I,T>& scaler, const FVec<I, T>& H_Times_X0, const FVec<I, T>& z)
+ {
+ return ftoi(mulSub(scaler, z, H_Times_X0));
+ }
+
+ static T cst0(T scaler, T x0)
+ {
+ return scaler*x0;
+ }
+};
+#endif
+
+template <unsigned char Gap, typename T, Algos A>
+struct DirectInfo
+{
+ static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA);
+ typedef DirectTraits<UseFMA, Gap, T> fun_t;
+ typedef BucketElem<A,T> bucket_t;
+ typedef AlignedVec<bucket_t> bucketvec_t;
+
+ struct Data {
+ Data() : buckets(0), xi(0), scaler(0), cst0(0) {}
+ Data( const T *x // for Direct must persist if xws=NULL
+ , uint32 n
+ , T H
+ , bucket_t *bws // assumed to have size nb, as computed below
+ , T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap
+ )
+ : buckets(bws)
+ , scaler(H)
+ , cst0(fun_t::cst0(H, x[0]))
+ {
+ myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned");
+
+ uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]);
+
+ const uint32 npad = Gap-1;
+ const uint32 n_sz = n + npad; // size of padded vector
+
+ if (xws) {
+ myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned");
+ std::fill_n(xws, npad, x[0]); // pad in front with x[0]
+ std::copy(x, x+n, xws + npad);
+ xi = xws;
+ }
+ else {
+ myassert(Gap==1, "if Gap>1 then X workspace must be provided");
+ xi = x;
+ }
+
+ populateIndex(bws, nb, xi, n_sz, scaler, cst0);
+ }
+
+ const bucket_t *buckets;
+ const T *xi;
+ T scaler;
+ T cst0; // could be x0 or (scaler*x0), depending on whether we are using FMA or not
+ } data;
+
+ static T growStep(T H)
+ {
+ T step;
+ T P = next(H);
+ while ((step = P - H) == 0)
+ P = next(P);
+ return step;
+ }
+
+ static HResults<T> computeH(const T *px, uint32 nx)
+ {
+ myassert((nx > Gap), "Array X too small");
+ myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap");
+
+ const T x0 = px[0];
+ const T xN = px[nx-1];
+
+ const T range = xN - x0;
+ myassert((range < std::numeric_limits<T>::max()), "range too large");
+
+ // check that the D_i are strictly increasing and compute the minimum value D_{i+Gap} - D_i
+ T deltaDMin = range;
+ for (uint32 i = Gap; i < nx; ++i) {
+ T Dnew = px[i] - x0;
+ T Dold = px[i - Gap] - x0;
+ myassert((Dnew > Dold),
+ "Problem unfeasible: D_i sequence not strictly increasing"
+ << " X[" << 0 << "]=" << x0
+ << " X[" << i - Gap << "]=" << px[i - Gap]
+ << " X[" << i << "]=" << px[i]
+ << "\n"
+ );
+ T deltaD = Dnew - Dold;
+ if (deltaD < deltaDMin)
+ deltaDMin = deltaD;
+ }
+
+ // initial guess for H
+ const T H0 = T(1.0) / deltaDMin;
+ T H = H0;
+
+ T cst0 = fun_t::cst0(H, x0);
+ fun_t::checkH(H, cst0, xN);
+
+ // adjust H by trial and error until it succeeds
+ size_t nInc = 0;
+ bool modified = false;
+ size_t npasses = 0;
+ T step = growStep(H);
+ uint32 seg_already_checked_from = nx;
+ do {
+ myassert((npasses++ < 2), "verification failed\n");
+ // if there has been an increase, then check only up to that point
+ uint32 last_seg_to_be_checked = seg_already_checked_from - 1;
+ modified = false;
+ uint32 inew = 0;
+ for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) {
+ uint32 iold = fun_t::f(H, cst0, px[i-Gap]);
+ uint32 inew = fun_t::f(H, cst0, px[i]);
+ while (inew == iold) {
+ seg_already_checked_from = i;
+ last_seg_to_be_checked = nx-1; // everything needs to be checked
+ modified = true;
+ H = H + step;
+ step *= 2;
+ // recalculate all constants and indices
+ cst0 = fun_t::cst0(H, x0);
+ fun_t::checkH(H, cst0, xN);
+ iold = fun_t::f(H, cst0, px[i - Gap]);
+ inew = fun_t::f(H, cst0, px[i]);
+ }
+ }
+ } while (SAFETY_MULTI_PASS && modified);
+
+ return HResults<T>(H, (((double)H) / H0) - 1.0, nInc);
+ }
+
+ static void populateIndex(BucketElem<A, T> *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0)
+ {
+ for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) {
+ uint32 idx = fun_t::f(scaler, cst0, px[i]);
+ while (b > idx) { // on the 1st iteration j is still 0, but this condition is always false then
+ buckets[b].set( j, px );
+ --b;
+ }
+ if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant
+ j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check
+ buckets[b].set(j, px);
+ if (b-- == 0)
+ break;
+ }
+ }
+ }
+
+ DirectInfo(const Data& d)
+ : data(d)
+ {
+ }
+
+ DirectInfo(const T* px, const uint32 n)
+ {
+ HResults<T> res = computeH(px, n);
+
+#ifdef PAPER_TEST
+ nInc = res.nInc;
+ hRatio = res.hRatio;
+#endif
+ const uint32 npad = Gap-1;
+ const uint32 n_sz = n + npad; // size of padded vector
+
+ if (npad)
+ xi.resize(n_sz);
+
+ T H = res.H;
+ T cst0 = fun_t::cst0(H, px[0]);
+ const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]);
+ buckets.resize(maxIndex + 1);
+
+ data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL));
+ }
+
+private:
+ bucketvec_t buckets;
+ AlignedVec<T,8> xi;
+
+#ifdef PAPER_TEST
+public:
+ double hRatio;
+ size_t nInc;
+#endif
+};
+
+
+} // namespace DirectAux
+} // namespace Details
+} // namespace BinSearch
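
The essence of the Direct family, restated as scalar code (an illustrative sketch, not part of the commit; identifiers are hypothetical): computeH() grows the scaler H until at most Gap consecutive elements share a bucket, so the single multiply below lands on a candidate that at most Gap comparisons can correct. The sketch assumes Gap == 1 and no FMA; the Direct2 variant in Algo-Direct2.h uses Gap == 2 and two comparisons.

#include <cstdint>
#include <vector>

// Sketch of a Direct lookup with Gap == 1 (buckets[] as built by populateIndex()).
uint32_t direct_lookup_sketch(const std::vector<float>& x,
                              const std::vector<uint32_t>& buckets,
                              float H, float z)
{
    uint32_t b = static_cast<uint32_t>(H * (z - x[0])); // one multiply selects the bucket
    uint32_t j = buckets[b];                            // candidate index
    if (z < x[j])                                       // at most one correction when Gap == 1
        --j;
    return j;                                           // expected: x[j] <= z < x[j+1]
}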
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
new file mode 100644
index 0000000..d5fa58d
--- /dev/null
+++ b/include/Algo-Direct2.h
@@ -0,0 +1,305 @@
+#pragma once
+
+#include "Algo-Direct-Common.h"
+
+namespace BinSearch {
+namespace Details {
+
+template <typename T, Algos A>
+struct AlgoScalarBase<T, A, typename std::enable_if<DirectAux::IsDirect2<A>::value>::type> : DirectAux::DirectInfo<2, T, A>
+{
+private:
+ typedef DirectAux::DirectInfo<2, T, A> base_t;
+ static const size_t Offset=2;
+
+public:
+ AlgoScalarBase(const T* x, const uint32 n)
+ : base_t(x, n)
+ {
+ }
+
+ FORCE_INLINE uint32 scalar(T z) const
+ {
+ const T* px = base_t::data.xi;
+ const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
+ uint32 bidx = base_t::fun_t::f(base_t::data.scaler, base_t::data.cst0, z);
+ uint32 iidx = buckets[bidx];
+ px += iidx;
+ if (z < *px)
+ --iidx;
+ if (z < *(px+1))
+ --iidx;
+ return iidx;
+ }
+};
+
+
+template <InstrSet I, typename T, Algos A>
+struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::value>::type> : AlgoScalarBase<T, A>
+{
+ static const uint32 nElem = sizeof(typename InstrFloatTraits<I, T>::vec_t) / sizeof(T);
+
+ typedef FVec<I, T> fVec;
+ typedef IVec<SSE, T> i128;
+
+ struct Constants
+ {
+ fVec vscaler;
+ fVec vcst0;
+ IVec<I, T> one;
+ };
+
+private:
+ typedef AlgoScalarBase<T, A> base_t;
+
+ FORCE_INLINE
+ //NO_INLINE
+ void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
+ {
+ union U {
+ __m128i vec;
+ uint32 ui32[4];
+ } u;
+
+ const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
+ const float *xi = base_t::data.xi;
+
+ // read indices t
+ const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]);
+ const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]);
+ const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]);
+ const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]);
+
+#if 0
+ // read pairs ( X(t-1), X(t) )
+ __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3));
+ __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2));
+ __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1));
+ __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0));
+
+ // build:
+ // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) }
+ // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) }
+ __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6));
+ __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6));
+ __m128 u01 = _mm_unpacklo_ps(h02, h13);
+ __m128 u23 = _mm_unpackhi_ps(h02, h13);
+ __m128 vxm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6));
+ __m128 vxp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6));
+#else
+ __m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2));
+ __m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0));
+ __m128 vxm = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6));
+ __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6));
+#endif
+ IVec<SSE, float> i(u.vec);
+ IVec<SSE, float> vlem = vz < vxm;
+ IVec<SSE, float> vlep = vz < vxp;
+ i = i + vlem + vlep;
+ i.store(pr);
+ }
+
+ FORCE_INLINE
+ //NO_INLINE
+ void resolve(const FVec<SSE, double>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
+ {
+ const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
+ const double *xi = base_t::data.xi;
+
+ uint32 b1 = buckets[bidx.get1()];
+ uint32 b0 = buckets[bidx.get0()];
+
+ const double *p1 = &xi[b1];
+ const double *p0 = &xi[b0];
+
+ // read pairs ( X(t-1), X(t) )
+ __m128d vx1 = _mm_loadu_pd(p1);
+ __m128d vx0 = _mm_loadu_pd(p0);
+
+ // build:
+ // { X(t(0)-1), X(t(1)-1) }
+ // { X(t(0)), X(t(1)) }
+ __m128d vxm = _mm_shuffle_pd(vx0, vx1, 0);
+ __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3);
+
+ IVec<SSE, double> i(b1, b0);
+ IVec<SSE, double> vlem = (vz < vxm);
+ IVec<SSE, double> vlep = (vz < vxp);
+ i = i + vlem + vlep;
+
+ union {
+ __m128i vec;
+ uint32 ui32[4];
+ } u;
+ u.vec = i;
+ pr[0] = u.ui32[0];
+ pr[1] = u.ui32[2];
+ }
+
+#ifdef USE_AVX
+
+ FORCE_INLINE
+ //NO_INLINE
+ void resolve(const FVec<AVX, float>& vz, const IVec<AVX, float>& bidx, uint32 *pr) const
+ {
+ const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
+ const float *xi = base_t::data.xi;
+
+#if 0 // use gather instructions
+
+ IVec<AVX,float> idxm;
+ idxm.setidx(buckets, bidx);
+ __m256i z = _mm256_setzero_si256();
+ IVec<AVX,float> minusone = _mm256_cmpeq_epi32(z,z);
+ IVec<AVX,float> idxp = idxm - minusone;
+
+ FVec<AVX, float> vxm = _mm256_i32gather_ps(xi, idxm, sizeof(float));
+ FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
+ IVec<AVX, float> ip = idxm;
+
+#else // do not use gather instructions
+
+ union U {
+ __m256i vec;
+ uint32 ui32[8];
+ } u;
+
+ // read indices t
+
+ const double *p7 = reinterpret_cast<const double *>(&xi[(u.ui32[7] = buckets[bidx.get7()])]);
+ const double *p6 = reinterpret_cast<const double *>(&xi[(u.ui32[6] = buckets[bidx.get6()])]);
+ const double *p5 = reinterpret_cast<const double *>(&xi[(u.ui32[5] = buckets[bidx.get5()])]);
+ const double *p4 = reinterpret_cast<const double *>(&xi[(u.ui32[4] = buckets[bidx.get4()])]);
+ const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]);
+ const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]);
+ const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]);
+ const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]);
+
+#if 0 // perform 8 loads in double precision
+
+ // read pairs ( X(t-1), X(t) )
+ __m128 xp7 = _mm_castpd_ps(_mm_load_sd(p7));
+ __m128 xp6 = _mm_castpd_ps(_mm_load_sd(p6));
+ __m128 xp5 = _mm_castpd_ps(_mm_load_sd(p5));
+ __m128 xp4 = _mm_castpd_ps(_mm_load_sd(p4));
+ __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3));
+ __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2));
+ __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1));
+ __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0));
+
+ // build:
+ // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) }
+ // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) }
+ __m128 h57 = _mm_shuffle_ps(xp5, xp7, (1 << 2) + (1 << 6)); // F- F+ H- H+
+ __m128 h46 = _mm_shuffle_ps(xp4, xp6, (1 << 2) + (1 << 6)); // E- E+ G- G+
+ __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); // B- B+ D- D+
+ __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); // A- A+ C- C+
+
+ __m128 u01 = _mm_unpacklo_ps(h02, h13); // A- B- A+ B+
+ __m128 u23 = _mm_unpackhi_ps(h02, h13); // C- D- C+ D+
+ __m128 u45 = _mm_unpacklo_ps(h46, h57); // E- F- E+ F+
+ __m128 u67 = _mm_unpackhi_ps(h46, h57); // G- H- G+ H+
+
+ __m128 abcdm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // A- B- C- D-
+ __m128 abcdp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // A+ B+ C+ D+
+ __m128 efghm = _mm_shuffle_ps(u45, u67, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // E- F- G- H-
+ __m128 efghp = _mm_shuffle_ps(u45, u67, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // E+ F+ G+ H+
+
+ FVec<AVX, float> vxm = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdm), efghm, 1);
+ FVec<AVX, float> vxp = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdp), efghp, 1);
+
+ IVec<AVX, float> ip(u.vec);
+
+#else // use _mm256_set_pd
+
+ // read pairs ( X(t-1), X(t) )
+ __m256 x0145 = _mm256_castpd_ps(_mm256_set_pd(*p5, *p4, *p1, *p0)); // { x0(t-1), x0(t), x1(t-1), x1(t), x4(t-1), x4(t), x5(t-1), x5(t) }
+ __m256 x2367 = _mm256_castpd_ps(_mm256_set_pd(*p7, *p6, *p3, *p2)); // { x2(t-1), x2(t), x3(t-1), x3(t), x6(t-1), x6(t), x7(t-1), x7(t) }
+
+ // { x0(t-1), x1(t-1), x2(t-1), x3(t-1), x4(t-1), x5(t-1), x6(t-1), x7(t-1) }
+ FVec<AVX, float> vxm = _mm256_shuffle_ps(x0145, x2367, 0 + (2 << 2) + (0 << 4) + (2 << 6) );
+ // { x0(t), x1(t), x2(t), x3(t), x4(t), x5(t), x6(t), x7(t) }
+ FVec<AVX, float> vxp = _mm256_shuffle_ps(x0145, x2367, 1 + (3 << 2) + (1 << 4) + (3 << 6) );
+
+ IVec<AVX, float> ip(u.vec);
+
+#endif
+
+#endif
+
+ IVec<AVX, float> vlem = vz < vxm;
+ IVec<AVX, float> vlep = vz < vxp;
+ ip = ip + vlem + vlep;
+
+ ip.store(pr);
+ }
+
+
+
+ FORCE_INLINE
+ //NO_INLINE
+ void resolve(const FVec<AVX, double>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
+ {
+ union {
+ __m256i vec;
+ uint64 ui64[4];
+ } u;
+
+ const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets);
+ const double *xi = base_t::data.xi;
+
+ // read indices t
+ const double *p3 = &xi[(u.ui64[3] = buckets[bidx.get3()])];
+ const double *p2 = &xi[(u.ui64[2] = buckets[bidx.get2()])];
+ const double *p1 = &xi[(u.ui64[1] = buckets[bidx.get1()])];
+ const double *p0 = &xi[(u.ui64[0] = buckets[bidx.get0()])];
+
+ // read pairs ( X(t-1), X(t) )
+ __m128d xp3 = _mm_loadu_pd(p3);
+ __m128d xp2 = _mm_loadu_pd(p2);
+ __m128d xp1 = _mm_loadu_pd(p1);
+ __m128d xp0 = _mm_loadu_pd(p0);
+
+ // build:
+ // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) }
+ // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) }
+ __m256d x02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp0), xp2, 1);
+ __m256d x13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp1), xp3, 1);
+ FVec<AVX, double> vxm = _mm256_unpacklo_pd(x02,x13);
+ FVec<AVX, double> vxp = _mm256_unpackhi_pd(x02,x13);
+
+
+// __m128d h01m = _mm_shuffle_pd(xp0, xp1, 0);
+// __m128d h23m = _mm_shuffle_pd(xp2, xp3, 0);
+// __m128d h01p = _mm_shuffle_pd(xp0, xp1, 3);
+// __m128d h23p = _mm_shuffle_pd(xp2, xp3, 3);
+// FVec<AVX, double> vxm = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01m), h23m, 1);
+// FVec<AVX, double> vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1);
+
+ IVec<AVX, double> i(u.vec);
+ IVec<AVX, double> vlem = vz < vxm;
+ IVec<AVX, double> vlep = vz < vxp;
+ i = i + vlem + vlep;
+ i.extractLo32s().store(pr);
+ }
+#endif
+
+public:
+
+ AlgoVecBase(const T* x, const uint32 n) : base_t(x, n) {}
+
+ void initConstants(Constants& cst) const
+ {
+ cst.vscaler.setN(base_t::data.scaler);
+ cst.vcst0.setN(base_t::data.cst0);
+ cst.one.setN(uint32(1));
+ }
+
+ void vectorial(uint32 *pr, const T *pz, const Constants& cst) const
+ {
+ fVec vz(pz);
+ resolve(vz, base_t::fun_t::f(cst.vscaler, cst.vcst0, vz), pr);
+ }
+};
+} // namespace Details
+} // namespace BinSearch
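
For reference, each SIMD resolve() above is the vector analogue of AlgoScalarBase::scalar(); per lane it amounts to the scalar sketch below (illustrative only, not part of the commit). The comparison results vlem/vlep hold 0 or -1 in each lane, so adding them to the gathered index subtracts 0, 1 or 2.

#include <cstdint>

// Per-lane scalar equivalent of the Direct2 resolve() paths (sketch).
uint32_t resolve_one_lane_sketch(const float* xi, const uint32_t* buckets,
                                 uint32_t bidx, float z)
{
    uint32_t t = buckets[bidx];     // gathered candidate index
    uint32_t r = t;
    if (z < xi[t])     --r;         // lane where (vz < vxm) is all-ones
    if (z < xi[t + 1]) --r;         // lane where (vz < vxp) is all-ones
    return r;
}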
diff --git a/include/AlgoXCodes.h b/include/AlgoXCodes.h
new file mode 100644
index 0000000..bdc9b00
--- /dev/null
+++ b/include/AlgoXCodes.h
@@ -0,0 +1,23 @@
+ALGOENUM(DirectCacheFMA, 5)
+ALGOENUM(DirectFMA, 15)
+ALGOENUM(Direct2FMA, 25)
+ALGOENUM(DirectCache, 10)
+ALGOENUM(Direct, 20)
+ALGOENUM(Direct2, 30)
+ALGOENUM(Nonary, 40)
+ALGOENUM(Pentary, 50)
+ALGOENUM(Ternary, 60)
+ALGOENUM(Eytzinger, 70)
+ALGOENUM(BitSet, 80)
+ALGOENUM(ClassicOffset, 90)
+#ifdef PAPER_TEST
+ALGOENUM(MorinOffset, 100)
+ALGOENUM(BitSetNoPad, 110)
+ALGOENUM(ClassicMod, 120)
+ALGOENUM(MorinBranchy, 130)
+ALGOENUM(Classic, 140)
+ALGOENUM(LowerBound, 145)
+#ifdef USE_MKL
+ALGOENUM(MKL, 150)
+#endif
+#endif
diff --git a/include/BinAlgo.h b/include/BinAlgo.h
new file mode 100644
index 0000000..aac67a0
--- /dev/null
+++ b/include/BinAlgo.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "Type.h"
+#include <algorithm>
+
+namespace BinSearch {
+
+template <InstrSet I, typename T, Algos A, bool L=false, bool R=false>
+struct BinAlgo : Details::BinAlgoBase<I,T,A>
+{
+ typedef Details::BinAlgoBase<I,T,A> base_t;
+
+ BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {}
+ BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {}
+
+ FORCE_INLINE
+ uint32 scalar(T z) const
+ {
+ if (!L || z >= x0)
+ if (!R || z < xN)
+ return base_t::scalar(z);
+ else
+ return N;
+ else
+ return std::numeric_limits<uint32>::max();
+ }
+
+
+ FORCE_INLINE
+ void vectorial(uint32 *pr, const T *pz, uint32 n) const
+ {
+ if (!L && !R) {
+ Details::Loop<T,base_t>::loop(*this, pr, pz, n);
+ }
+ else {
+ const uint32 nElem = base_t::nElem;
+ const uint32 idealbufsize = 256;
+ const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0));
+ T databuf[bufsize];
+ uint32 resbuf[bufsize];
+ uint32 indexbuf[bufsize];
+
+ uint32 *prend = pr + n;
+ while(pr != prend) {
+ uint32 cnt = 0;
+ uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend));
+ for (uint32 j = 0; j < niter; ++j) {
+ T z = pz[j];
+ // FIXME: use SSE2?
+ if (!L || z >= x0)
+ if (!R || z < xN) {
+ databuf[cnt] = z;
+ indexbuf[cnt] = j;
+ ++cnt;
+ }
+ else
+ pr[j] = N;
+ else
+ pr[j] = std::numeric_limits<uint32>::max();
+ }
+ // FIXME: merge these two loops
+ Details::Loop<T,base_t>::loop(*this, resbuf, databuf, cnt);
+ for (uint32 j = 0; j < cnt; ++j)
+ pr[indexbuf[j]] = resbuf[j];
+ pr += niter;
+ pz += niter;
+ }
+ }
+ }
+
+ Details::CondData<T,L> x0;
+ Details::CondData<T,R> xN;
+ Details::CondData<uint32,R> N;
+};
+
+
+} // namespace BinSearch
diff --git a/include/BinSearch.h b/include/BinSearch.h
new file mode 100644
index 0000000..336f529
--- /dev/null
+++ b/include/BinSearch.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "AAlloc.h"
+#include "BinAlgo.h"
+#include "SIMD.h"
+
+#include <algorithm>
+#include <limits>
+
+
+#include "Algo-Direct2.h"
diff --git a/include/Portable.h b/include/Portable.h
new file mode 100644
index 0000000..1710b05
--- /dev/null
+++ b/include/Portable.h
@@ -0,0 +1,151 @@
+#pragma once
+#include <limits>
+#include <cmath>
+#include <stdexcept>
+#include <sstream>
+
+#ifdef __FMA__
+#define USE_FMA
+#endif
+
+#ifdef __AVX2__
+#define USE_AVX2
+#endif
+
+#ifdef __AVX__
+#define USE_AVX
+#endif
+
+
+#ifdef __SSE4_1__
+#define USE_SSE41
+#endif
+
+#ifdef __SSE4_2__
+#define USE_SSE42
+#endif
+
+
+#ifndef _MSC_VER
+#include <stdint.h>
+#endif
+
+namespace BinSearch {
+
+#ifndef _MSC_VER
+typedef int8_t int8;
+typedef uint8_t uint8;
+typedef int32_t int32;
+typedef uint32_t uint32;
+typedef int64_t int64;
+typedef uint64_t uint64;
+#else
+typedef __int8 int8;
+typedef unsigned __int8 uint8;
+typedef __int32 int32;
+typedef unsigned __int32 uint32;
+typedef __int64 int64;
+typedef unsigned __int64 uint64;
+#endif
+
+namespace Details {
+
+#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }
+
+// log2 is not defined in VS2008
+#if defined(_MSC_VER)
+inline uint32 log2 (uint32 val) {
+ if (val == 1) return 0;
+ uint32 ret = 0;
+ do {
+ ret++;
+ val >>= 1;
+ } while (val > 1);
+ return ret;
+}
+#endif
+
+#ifdef _DEBUG
+#define DEBUG
+#endif
+
+#ifdef _MSC_VER
+# define FORCE_INLINE __forceinline
+# define NO_INLINE __declspec(noinline)
+#else
+# define NO_INLINE __attribute__((noinline))
+# ifdef DEBUG
+# define FORCE_INLINE NO_INLINE
+# else
+# define FORCE_INLINE __attribute__((always_inline)) inline
+# endif
+#endif
+
+#ifdef USE_AVX
+#define COMISS "vcomiss"
+#define COMISD "vcomisd"
+#else
+#define COMISS "comiss"
+#define COMISD "comisd"
+#endif
+
+// nextafter is not defined in VS2008
+#if defined(_MSC_VER) && (_MSC_VER <= 1500)
+#include <float.h>
+inline float mynext(float x)
+{
+ return _nextafterf(x, std::numeric_limits<float>::max());
+}
+
+inline double mynext(double x)
+{
+ return _nextafter(x, std::numeric_limits<double>::max());
+}
+inline float myprev(float x)
+{
+ return _nextafterf(x, -std::numeric_limits<float>::max());
+}
+
+inline double myprev(double x)
+{
+ return _nextafter(x, -std::numeric_limits<double>::max());
+}
+#else
+inline float mynext(float x)
+{
+ return std::nextafterf(x, std::numeric_limits<float>::max());
+}
+
+inline double mynext(double x)
+{
+ return std::nextafter(x, std::numeric_limits<double>::max());
+}
+inline float myprev(float x)
+{
+ return std::nextafterf(x, -std::numeric_limits<float>::max());
+}
+
+inline double myprev(double x)
+{
+ return std::nextafter(x, -std::numeric_limits<double>::max());
+}
+#endif
+
+template <typename T>
+inline T next(T x)
+{
+ for (int i = 0; i < 4; ++i)
+ x = mynext(x);
+ return x;
+}
+
+template <typename T>
+inline T prev(T x)
+{
+ for (int i = 0; i < 4; ++i)
+ x = myprev(x);
+ return x;
+}
+
+} // namespace Details
+} // namespace BinSearch
diff --git a/include/SIMD.h b/include/SIMD.h
new file mode 100644
index 0000000..642b80a
--- /dev/null
+++ b/include/SIMD.h
@@ -0,0 +1,562 @@
+#pragma once
+
+#include "Portable.h"
+
+#ifdef USE_SSE42
+#ifndef _MSC_VER
+#include <popcntintrin.h>
+#define popcnt32 _mm_popcnt_u32
+#else
+#include <intrin.h>
+#define popcnt32 __popcnt
+#endif
+#else // USE_SSE42
+namespace BinSearch {
+FORCE_INLINE int popcnt32(int x32)
+{
+ // strictly speaking this is not correct, as it ignores higher order bits
+ // however, this is only used on the result of movemask on a 128-bit register, which sets at most 8 bits, so it is ok
+ // with 256-bit registers, SSE42 is defined, and we do not use this function
+ uint8 x = static_cast<uint8>(x32);
+ x = (x & 0x55) + (x >> 1 & 0x55);
+ x = (x & 0x33) + (x >> 2 & 0x33);
+ x = (x & 0x0f) + (x >> 4 & 0x0f);
+ return x;
+}
+} // namespace
+#endif
+
+#if defined(USE_AVX) || defined(USE_AVX2)
+#include <immintrin.h>
+#else
+#include <emmintrin.h>
+#ifdef USE_SSE41
+#include <smmintrin.h>
+#endif
+#endif
+
+#include "Type.h"
+
+namespace BinSearch {
+namespace Details {
+
+template <InstrSet I, class T>
+struct FVec;
+
+template <InstrSet I, class T>
+struct IVec;
+
+template <InstrSet I, class T>
+struct FVec1;
+
+template <> struct InstrIntTraits<SSE>
+{
+ typedef __m128i vec_t;
+};
+
+template <> struct InstrFloatTraits<SSE, float>
+{
+ typedef __m128 vec_t;
+};
+
+template <> struct InstrFloatTraits<SSE, double>
+{
+ typedef __m128d vec_t;
+};
+
+template <InstrSet I, typename T>
+struct FTOITraits
+{
+ typedef IVec<SSE, float> vec_t;
+};
+
+#ifdef USE_AVX
+
+template <>
+struct FTOITraits<AVX, float>
+{
+ typedef IVec<AVX, float> vec_t;
+};
+
+template <> struct InstrIntTraits<AVX>
+{
+ typedef __m256i vec_t;
+};
+
+template <> struct InstrFloatTraits<AVX, float>
+{
+ typedef __m256 vec_t;
+};
+
+template <> struct InstrFloatTraits<AVX, double>
+{
+ typedef __m256d vec_t;
+};
+
+#endif
+
+
+template <typename TR>
+struct VecStorage
+{
+ typedef typename TR::vec_t vec_t;
+
+ FORCE_INLINE operator vec_t&() { return vec; }
+ FORCE_INLINE operator const vec_t&() const { return vec; }
+
+protected:
+ FORCE_INLINE VecStorage() {}
+ FORCE_INLINE VecStorage(const vec_t& v) : vec( v ) {}
+
+ vec_t vec;
+};
+
+template <InstrSet>
+struct IVecBase;
+
+template <>
+struct IVecBase<SSE> : VecStorage<InstrIntTraits<SSE>>
+{
+protected:
+ FORCE_INLINE IVecBase() {}
+ FORCE_INLINE IVecBase( const vec_t& v) : VecStorage<InstrIntTraits<SSE>>( v ) {}
+public:
+ FORCE_INLINE static vec_t zero() { return _mm_setzero_si128(); }
+
+ FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32( vec ); }
+
+ FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask )
+ {
+#ifdef USE_SSE41
+ vec = _mm_blendv_epi8(vec, val, mask);
+#else
+ vec = _mm_or_si128(_mm_andnot_si128(mask,vec), _mm_and_si128(mask,val));
+#endif
+ }
+ FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask)
+ {
+ vec = _mm_or_si128(vec, _mm_and_si128(val,mask));
+ }
+};
+
+template <>
+struct IVec<SSE, float> : IVecBase<SSE>
+{
+ FORCE_INLINE IVec() {}
+ FORCE_INLINE IVec( int32 i ) : IVecBase<SSE>( _mm_set1_epi32( i ) ) {}
+ FORCE_INLINE IVec( const vec_t& v) : IVecBase<SSE>( v ) {}
+ FORCE_INLINE IVec( uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase<SSE>( _mm_set_epi32( u3, u2, u1, u0 ) ) {}
+
+ void setN( int32 i ) { vec = _mm_set1_epi32( i ); }
+
+#ifdef USE_SSE41
+ FORCE_INLINE int32 get1() const { return _mm_extract_epi32(vec, 1); }
+ FORCE_INLINE int32 get2() const { return _mm_extract_epi32(vec, 2); }
+ FORCE_INLINE int32 get3() const { return _mm_extract_epi32(vec, 3); }
+#else
+ FORCE_INLINE int32 get1() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 1 ) ); }
+ FORCE_INLINE int32 get2() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); }
+ FORCE_INLINE int32 get3() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 3 ) ); }
+#endif
+
+ FORCE_INLINE void store( uint32 *pi ) const { _mm_storeu_si128( reinterpret_cast<vec_t*>(pi), vec ); }
+
+ FORCE_INLINE int countbit()
+ {
+ return popcnt32(_mm_movemask_ps(_mm_castsi128_ps(vec)));
+ }
+};
+
+template <>
+struct IVec<SSE, double> : IVecBase<SSE>
+{
+ FORCE_INLINE IVec() {}
+ FORCE_INLINE IVec( int32 i ) : IVecBase<SSE>( _mm_set1_epi64x( i ) ) {}
+ FORCE_INLINE IVec( const vec_t& v) : IVecBase<SSE>( v ) {}
+ FORCE_INLINE IVec( uint64 u1, uint64 u0 ) : IVecBase<SSE>( _mm_set_epi64x(u1, u0) ) {}
+
+ void setN( int32 i ) { vec = _mm_set1_epi64x( i ); }
+
+ FORCE_INLINE int32 get1() const
+ {
+#ifdef USE_SSE41
+ return _mm_extract_epi32(vec, 2);
+#else
+ return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) );
+#endif
+ }
+
+ // extract the two 32-bit integers at positions 0 and 2 and store them in a __m128i
+ FORCE_INLINE IVec<SSE,float> extractLo32s() const
+ {
+ return _mm_shuffle_epi32(vec, ((2 << 2) | 0));
+ }
+
+ FORCE_INLINE void store( uint32 *pi ) const
+ {
+ pi[0] = get0();
+ pi[1] = get1();
+ }
+
+ FORCE_INLINE int countbit()
+ {
+#if 1
+ // takes 4 cycles
+ __m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle
+ __m128i s = _mm_add_epi32(vec, hi);
+ int32 x = _mm_cvtsi128_si32(s);
+ return -x;
+#else
+ // takes 6 cycles
+ return popcnt32(_mm_movemask_pd(_mm_castsi128_pd(vec)));
+#endif
+ }
+};
+
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator>> (const IVec<SSE,T>& a, unsigned n) { return _mm_srli_epi32(a, n); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator<< (const IVec<SSE,T>& a, unsigned n) { return _mm_slli_epi32(a, n); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator& (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_and_si128( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator| (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_or_si128( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator^ (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_xor_si128( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator+ (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_add_epi32( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<SSE,T> operator- (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_sub_epi32( a, b ); }
+#ifdef USE_SSE41
+template <typename T>
+FORCE_INLINE IVec<SSE,T> min (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_min_epi32( a, b ); }
+#endif
+
+typedef VecStorage<InstrFloatTraits<SSE,float>> FVec128Float;
+
+template <>
+struct FVec1<SSE, float> : FVec128Float
+{
+ FORCE_INLINE FVec1() {}
+ FORCE_INLINE FVec1( float f ) : FVec128Float( _mm_load_ss( &f ) ) {}
+ FORCE_INLINE FVec1( const vec_t& v ): FVec128Float( v ) {}
+
+ FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); }
+};
+
+template <>
+struct FVec<SSE, float> : FVec128Float
+{
+ FORCE_INLINE FVec() {}
+ FORCE_INLINE FVec( float f ) : FVec128Float( _mm_set1_ps( f ) ) {}
+ FORCE_INLINE FVec( const float *v ) : FVec128Float( _mm_loadu_ps( v ) ) {}
+ FORCE_INLINE FVec( const vec_t& v) : FVec128Float(v) {}
+ FORCE_INLINE FVec( float f3, float f2, float f1, float f0 ) : FVec128Float( _mm_set_ps(f3, f2, f1, f0) ) {}
+
+ void set0( float f ) { vec = _mm_load_ss( &f ); }
+ void setN( float f ) { vec = _mm_set1_ps( f ); }
+
+ FORCE_INLINE void setidx( const float *xi, const IVec<SSE,float>& idx )
+ {
+ uint32 i0 = idx.get0();
+ uint32 i1 = idx.get1();
+ uint32 i2 = idx.get2();
+ uint32 i3 = idx.get3();
+ vec = _mm_set_ps( xi[i3], xi[i2], xi[i1], xi[i0] );
+ }
+
+ FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); }
+ FORCE_INLINE float get1() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 1 ) ); }
+ FORCE_INLINE float get2() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 2 ) ); }
+ FORCE_INLINE float get3() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 3 ) ); }
+};
+
+FORCE_INLINE FVec1<SSE,float> operator+ (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_add_ss( a, b ); }
+FORCE_INLINE FVec1<SSE,float> operator- (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_sub_ss( a, b ); }
+FORCE_INLINE FVec1<SSE,float> operator* (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_mul_ss( a, b ); }
+FORCE_INLINE FVec1<SSE,float> operator/ (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_div_ss( a, b ); }
+FORCE_INLINE int ftoi (const FVec1<SSE,float>& a) { return _mm_cvttss_si32(a); }
+FORCE_INLINE IVec<SSE,float> operator> (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_castps_si128( _mm_cmpgt_ss( a, b ) ); }
+#ifdef USE_FMA
+FORCE_INLINE FVec1<SSE, float> mulSub(const FVec1<SSE, float>& a, const FVec1<SSE, float>& b, const FVec1<SSE, float>& c) { return _mm_fmsub_ss(a, b, c); }
+#endif
+
+FORCE_INLINE FVec<SSE,float> operator- (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_sub_ps( a, b ); }
+FORCE_INLINE FVec<SSE,float> operator* (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_mul_ps( a, b ); }
+FORCE_INLINE FVec<SSE,float> operator/ (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_div_ps( a, b ); }
+FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,float>& a) { return _mm_cvttps_epi32(a); }
+FORCE_INLINE IVec<SSE,float> operator<= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
+FORCE_INLINE IVec<SSE,float> operator>= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
+FORCE_INLINE IVec<SSE,float> operator< (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
+#ifdef USE_FMA
+FORCE_INLINE FVec<SSE, float> mulSub(const FVec<SSE, float>& a, const FVec<SSE, float>& b, const FVec<SSE, float>& c) { return _mm_fmsub_ps(a, b, c); }
+#endif
+
+typedef VecStorage<InstrFloatTraits<SSE,double>> FVec128Double;
+
+template <>
+struct FVec1<SSE, double> : FVec128Double
+{
+ FORCE_INLINE FVec1() {}
+ FORCE_INLINE FVec1( double f ) : FVec128Double( _mm_load_sd( &f ) ) {}
+ FORCE_INLINE FVec1( const vec_t& v ) : FVec128Double( v ) {}
+
+ FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); }
+};
+
+template <>
+struct FVec<SSE, double> : FVec128Double
+{
+ FORCE_INLINE FVec() {}
+ FORCE_INLINE FVec( double d ) : FVec128Double( _mm_set1_pd( d ) ) {}
+ FORCE_INLINE FVec( const double *v ) : FVec128Double( _mm_loadu_pd( v ) ) {}
+ FORCE_INLINE FVec( const vec_t& v) : FVec128Double( v ) {}
+ FORCE_INLINE FVec( double f1, double f0 ) : FVec128Double( _mm_set_pd(f1, f0) ) {}
+
+ void set0( double f ) { vec = _mm_load_sd( &f ); }
+ void setN( double f ) { vec = _mm_set1_pd( f ); }
+
+ FORCE_INLINE void setidx( const double *xi, const IVec<SSE,double>& idx )
+ {
+ vec = _mm_set_pd( xi[idx.get1()], xi[idx.get0()] );
+ }
+
+ FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); }
+ FORCE_INLINE double get1() const { return _mm_cvtsd_f64( _mm_shuffle_pd( vec, vec, 1 ) ); };
+};
+
+FORCE_INLINE FVec1<SSE,double> operator+ (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_add_sd( a, b ); }
+FORCE_INLINE FVec1<SSE,double> operator- (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_sub_sd( a, b ); }
+FORCE_INLINE FVec1<SSE,double> operator* (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_mul_sd( a, b ); }
+FORCE_INLINE FVec1<SSE,double> operator/ (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_div_sd( a, b ); }
+FORCE_INLINE int ftoi (const FVec1<SSE,double>& a) { return _mm_cvttsd_si32(a); }
+FORCE_INLINE IVec<SSE,double> operator> (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpgt_sd( a, b ) ); }
+#ifdef USE_FMA
+FORCE_INLINE FVec1<SSE, double> mulSub(const FVec1<SSE, double>& a, const FVec1<SSE, double>& b, const FVec1<SSE, double>& c) { return _mm_fmsub_sd(a, b, c); }
+#endif
+
+FORCE_INLINE FVec<SSE,double> operator- (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_sub_pd( a, b ); }
+FORCE_INLINE FVec<SSE,double> operator* (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_mul_pd( a, b ); }
+FORCE_INLINE FVec<SSE,double> operator/ (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_div_pd( a, b ); }
+FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,double>& a) { return _mm_cvttpd_epi32(a); }
+FORCE_INLINE IVec<SSE,double> operator<= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
+FORCE_INLINE IVec<SSE,double> operator< (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
+FORCE_INLINE IVec<SSE,double> operator>= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }
+#ifdef USE_FMA
+FORCE_INLINE FVec<SSE, double> mulSub(const FVec<SSE, double>& a, const FVec<SSE, double>& b, const FVec<SSE, double>& c ) { return _mm_fmsub_pd(a, b, c); }
+#endif
+
+#ifdef USE_AVX
+
+template <>
+struct IVecBase<AVX> : VecStorage<InstrIntTraits<AVX>>
+{
+protected:
+ FORCE_INLINE IVecBase() {}
+ FORCE_INLINE IVecBase( const vec_t& v) : VecStorage<InstrIntTraits<AVX>>( v ) {}
+public:
+ FORCE_INLINE static vec_t zero() { return _mm256_setzero_si256(); }
+
+ FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32(_mm256_castsi256_si128(vec)); }
+
+ FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) { vec = _mm256_blendv_epi8(vec, val, mask); }
+ FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask)
+ {
+ vec = _mm256_blendv_epi8(vec, val, mask);
+ //vec = _mm256_or_si256(vec, _mm256_and_si256(val,mask));
+ }
+
+ FORCE_INLINE __m128i lo128() const { return _mm256_castsi256_si128(vec); }
+ FORCE_INLINE __m128i hi128() const { return _mm256_extractf128_si256(vec, 1); }
+};
+
+template <>
+struct IVec<AVX, float> : IVecBase<AVX>
+{
+ FORCE_INLINE IVec() {}
+ FORCE_INLINE IVec( int32 i ) : IVecBase<AVX>( _mm256_set1_epi32( i ) ) {}
+ FORCE_INLINE IVec( const vec_t& v) : IVecBase<AVX>( v ) {}
+ FORCE_INLINE IVec(uint32 u7, uint32 u6, uint32 u5, uint32 u4, uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase<AVX>(_mm256_set_epi32(u7, u6, u5, u4, u3, u2, u1, u0)) {}
+
+ void setN( int32 i ) { vec = _mm256_set1_epi32( i ); }
+
+ FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 1); }
+ FORCE_INLINE int32 get2() const { return _mm256_extract_epi32(vec, 2); }
+ FORCE_INLINE int32 get3() const { return _mm256_extract_epi32(vec, 3); }
+ FORCE_INLINE int32 get4() const { return _mm256_extract_epi32(vec, 4); }
+ FORCE_INLINE int32 get5() const { return _mm256_extract_epi32(vec, 5); }
+ FORCE_INLINE int32 get6() const { return _mm256_extract_epi32(vec, 6); }
+ FORCE_INLINE int32 get7() const { return _mm256_extract_epi32(vec, 7); }
+
+ FORCE_INLINE void setidx( const uint32 *bi, const IVec<AVX,float>& idx )
+ {
+ vec = _mm256_i32gather_epi32(reinterpret_cast<const int32 *>(bi), idx, sizeof(uint32));
+ }
+
+ FORCE_INLINE void store( uint32 *pi ) const { _mm256_storeu_si256( reinterpret_cast<vec_t*>(pi), vec ); }
+
+ FORCE_INLINE int countbit()
+ {
+ return popcnt32(_mm256_movemask_ps(_mm256_castsi256_ps(vec)));
+ }
+};
+
+template <>
+struct IVec<AVX, double> : IVecBase<AVX>
+{
+ FORCE_INLINE IVec() {}
+ FORCE_INLINE IVec( int32 i ) : IVecBase<AVX>( _mm256_set1_epi64x( i ) ) {}
+ FORCE_INLINE IVec( const vec_t& v) : IVecBase<AVX>( v ) {}
+ FORCE_INLINE IVec(uint64 u3, uint64 u2, uint64 u1, uint64 u0) : IVecBase<AVX>(_mm256_set_epi64x(u3, u2, u1, u0)) {}
+
+ void setN( int32 i ) { vec = _mm256_set1_epi64x( i ); }
+
+ // extract the four 32-bit integers at positions 0, 2, 4 and 6 and store them in a __m128i
+ FORCE_INLINE IVec<SSE,float> extractLo32s() const
+ {
+ union {
+ uint32 u32[4];
+ __m128i u;
+ } mask = {0,2,4,6};
+ //__m256 ps256 = _mm256_castsi256_ps(vec);
+ //__m128 lo128 = _mm256_castps256_ps128(ps256);
+ //__m128 hi128 = _mm256_extractf128_ps(ps256, 1);
+ //__m128 blend = _mm_shuffle_ps(lo128, hi128, 0 + (2<<2) + (0<<4) + (2<<6));
+ __m256i blend = _mm256_permutevar8x32_epi32(vec, _mm256_castsi128_si256(mask.u));
+ return _mm256_castsi256_si128(blend);
+ }
+
+ //int32 get1() const { return _mm256_cvtsi256_si32( _mm256_shuffle_epi32( vec, 2 ) ); };
+ FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 2); }
+
+ FORCE_INLINE void store( uint32 *pi ) const
+ {
+ extractLo32s().store(pi);
+ }
+
+ FORCE_INLINE int countbit()
+ {
+ return popcnt32(_mm256_movemask_pd(_mm256_castsi256_pd(vec)));
+ }
+};
+
+template <typename T>
+FORCE_INLINE IVec<AVX,T> operator>> (const IVec<AVX,T>& a, unsigned n) { return _mm256_srli_epi32(a, n); }
+template <typename T>
+FORCE_INLINE IVec<AVX,T> operator<< (const IVec<AVX,T>& a, unsigned n) { return _mm256_slli_epi32(a, n); }
+template <typename T>
+FORCE_INLINE IVec<AVX,T> operator& (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_and_si256( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<AVX,T> operator| (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_or_si256( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<AVX,T> operator^ (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_xor_si256( a, b ); }
+template <typename T>
+FORCE_INLINE IVec<AVX,T> min (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_min_epi32( a, b ); }
+
+FORCE_INLINE IVec<AVX,float> operator+ (const IVec<AVX,float>& a, const IVec<AVX,float>& b ) { return _mm256_add_epi32( a, b ); }
+FORCE_INLINE IVec<AVX,float> operator- (const IVec<AVX,float>& a, const IVec<AVX,float>& b ) { return _mm256_sub_epi32( a, b ); }
+FORCE_INLINE IVec<AVX,double> operator+ (const IVec<AVX,double>& a, const IVec<AVX,double>& b ) { return _mm256_add_epi64( a, b ); }
+FORCE_INLINE IVec<AVX,double> operator- (const IVec<AVX,double>& a, const IVec<AVX,double>& b ) { return _mm256_sub_epi64( a, b ); }
+
+
+typedef VecStorage<InstrFloatTraits<AVX,float>> FVec256Float;
+
+template <>
+struct FVec<AVX, float> : FVec256Float
+{
+ FORCE_INLINE FVec() {}
+ FORCE_INLINE FVec( float f ) : FVec256Float( _mm256_set1_ps( f ) ) {}
+ FORCE_INLINE FVec( const float *v ) : FVec256Float( _mm256_loadu_ps( v ) ) {}
+ FORCE_INLINE FVec( const vec_t& v) : FVec256Float(v) {}
+ FORCE_INLINE FVec(float f7, float f6, float f5, float f4, float f3, float f2, float f1, float f0) : FVec256Float(_mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0)) {}
+
+ //void set0( float f ) { vec = _mm256_load_ss( &f ); }
+ void setN( float f ) { vec = _mm256_set1_ps( f ); }
+
+ FORCE_INLINE void setidx( const float *xi, const IVec<AVX,float>& idx )
+ {
+#if 1 // use gather primitives
+ vec = _mm256_i32gather_ps (xi, idx, 4);
+#elif 0
+ uint32 i0 = idx.get0();
+ uint32 i1 = idx.get1();
+ uint32 i2 = idx.get2();
+ uint32 i3 = idx.get3();
+ uint32 i4 = idx.get4();
+ uint32 i5 = idx.get5();
+ uint32 i6 = idx.get6();
+ uint32 i7 = idx.get7();
+ vec = _mm256_set_ps( xi[i7], xi[i6], xi[i5], xi[i4], xi[i3], xi[i2], xi[i1], xi[i0] );
+#else
+ union {
+ __m256i vec;
+ uint32 ui32[8];
+ } i;
+ i.vec = static_cast<const __m256i&>(idx);
+ vec = _mm256_set_ps(xi[i.ui32[7]], xi[i.ui32[6]], xi[i.ui32[5]], xi[i.ui32[4]], xi[i.ui32[3]], xi[i.ui32[2]], xi[i.ui32[1]], xi[i.ui32[0]]);
+#endif
+ }
+
+ FORCE_INLINE FVec<SSE, float> lo128() const { return _mm256_castps256_ps128(vec); }
+ FORCE_INLINE FVec<SSE, float> hi128() const { return _mm256_extractf128_ps(vec, 1); }
+
+ //FORCE_INLINE float get0() const { return _mm256_cvtss_f32( vec ); }
+ //FORCE_INLINE float get1() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 1 ) ); }
+ //FORCE_INLINE float get2() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 2 ) ); }
+ //FORCE_INLINE float get3() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 3 ) ); }
+};
+
+FORCE_INLINE FVec<AVX,float> operator- (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_sub_ps( a, b ); }
+FORCE_INLINE FVec<AVX,float> operator* (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_mul_ps( a, b ); }
+FORCE_INLINE FVec<AVX,float> operator/ (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_div_ps( a, b ); }
+FORCE_INLINE IVec<AVX,float> ftoi (const FVec<AVX,float>& a) { return _mm256_cvttps_epi32(a); }
+FORCE_INLINE IVec<AVX,float> operator<= (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_LE_OS) ); }
+FORCE_INLINE IVec<AVX,float> operator>= (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_GE_OS ) ); }
+FORCE_INLINE IVec<AVX,float> operator< (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS )); }
+#ifdef USE_FMA
+FORCE_INLINE FVec<AVX, float> mulSub(const FVec<AVX, float>& a, const FVec<AVX, float>& b, const FVec<AVX, float>& c) { return _mm256_fmsub_ps(a, b, c); }
+#endif
+
+typedef VecStorage<InstrFloatTraits<AVX,double>> FVec256Double;
+
+template <>
+struct FVec<AVX, double> : FVec256Double
+{
+ FORCE_INLINE FVec() {}
+ FORCE_INLINE FVec( double d ) : FVec256Double( _mm256_set1_pd( d ) ) {}
+ FORCE_INLINE FVec( const double *v ) : FVec256Double( _mm256_loadu_pd( v ) ) {}
+ FORCE_INLINE FVec( const vec_t& v) : FVec256Double( v ) {}
+ FORCE_INLINE FVec(double d3, double d2, double d1, double d0) : FVec256Double(_mm256_set_pd(d3, d2, d1, d0)) {}
+
+ //void set0( double f ) { vec = _mm256_load_sd( &f ); }
+ void setN( double f ) { vec = _mm256_set1_pd( f ); }
+
+ FORCE_INLINE void setidx( const double *xi, const IVec<SSE,float>& idx )
+ {
+ vec = _mm256_i32gather_pd(xi, idx, 8);
+ }
+
+ FORCE_INLINE void setidx( const double *xi, const IVec<AVX,double>& idx )
+ {
+ vec = _mm256_i64gather_pd(xi, idx, 8);
+ }
+
+// FORCE_INLINE double get0() const { return _mm256_cvtsd_f64( vec ); }
+// FORCE_INLINE double get1() const { return _mm256_cvtsd_f64( _mm256_shuffle_pd( vec, vec, 1 ) ); };
+};
+
+FORCE_INLINE FVec<AVX,double> operator- (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_sub_pd( a, b ); }
+FORCE_INLINE FVec<AVX,double> operator* (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_mul_pd( a, b ); }
+FORCE_INLINE FVec<AVX,double> operator/ (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_div_pd( a, b ); }
+FORCE_INLINE IVec<SSE,float> ftoi (const FVec<AVX,double>& a) { return _mm256_cvttpd_epi32(a); }
+FORCE_INLINE IVec<AVX,double> operator<= (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_LE_OS ) ); }
+FORCE_INLINE IVec<AVX,double> operator< (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS)); }
+FORCE_INLINE IVec<AVX,double> operator>= (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_GE_OS ) ); }
+#ifdef USE_FMA
+FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX, double>& b, const FVec<AVX, double>& c) { return _mm256_fmsub_pd(a, b, c); }
+#endif
+
+#endif
+
+} // namespace Details
+} // namespace BinSearch
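
A small sketch of how the FVec/IVec wrappers compose (illustrative only; it mirrors the pattern used in Algo-Direct2.h): arithmetic stays in FVec, while comparisons yield IVec masks whose lanes are 0 or -1 and can be added directly.

#include "SIMD.h"

void simd_sketch(const float* pz, BinSearch::uint32* pr, float scaler, float x0)
{
    using namespace BinSearch;
    using namespace BinSearch::Details;

    FVec<SSE, float> vz(pz);                             // unaligned load of 4 floats
    FVec<SSE, float> vscaler(scaler), vx0(x0);
    IVec<SSE, float> idx  = ftoi(vscaler * (vz - vx0));  // truncate each lane to int32
    IVec<SSE, float> mask = vz < vx0;                    // -1 in lanes where z < x0
    idx = idx + mask;                                    // subtract 1 in those lanes
    idx.store(pr);                                       // unaligned store of 4 indices
}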
diff --git a/include/Type.h b/include/Type.h
new file mode 100644
index 0000000..720bfb8
--- /dev/null
+++ b/include/Type.h
@@ -0,0 +1,221 @@
+#pragma once
+
+#include <stddef.h>
+#include <vector>
+#include <limits>
+
+#include "Portable.h"
+
+using std::size_t;
+
+namespace BinSearch {
+
+enum InstrSet { Scalar, SSE, AVX };
+
+#define ALGOENUM(x, b) x,
+enum Algos
+ {
+#include "AlgoXCodes.h"
+ };
+#undef ALGOENUM
+
+namespace Details {
+
+ template <InstrSet I>
+ struct InstrIntTraits;
+
+ template <InstrSet I, typename T>
+ struct InstrFloatTraits;
+
+ // base class for algorithm supporting the method:
+ // uint32 scalar(T z) const
+ template <typename T, Algos A, typename Enable=void>
+ struct AlgoScalarBase;
+
+ // base class for algorithm supporting the following methods, constants and definitions:
+ // static const uint32 nElem
+ // struct Constants;
+ // void initConstants(Constants& cst) const
+ // void vectorial(uint32 *pr, const T *pz, const Constants& cst) const
+ // The function vectorial processes nElem items
+ template <InstrSet I, typename T, Algos A, typename Enable=void>
+ struct AlgoVecBase;
+
+ template <typename T> struct IntTraits;
+
+ template <> struct IntTraits<float>
+ {
+ typedef uint32 itype;
+ };
+ template <> struct IntTraits<double>
+ {
+ typedef uint64 itype;
+ };
+
+ template <int N>
+ struct Body
+ {
+ template <uint32 D, typename T, typename Expr>
+ FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const typename Expr::Constants& cst)
+ {
+ e.vectorial(ri, zi, cst);
+ Body<N - 1>::template iteration<D>(e, ri + D, zi + D, cst);
+ }
+
+ };
+
+ template <>
+ struct Body<0>
+ {
+ template <uint32 D, typename T, typename Expr, typename H>
+ FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const H&)
+ {
+ }
+ };
+
+ template <typename T, typename Algo>
+ struct Loop
+ {
+ typedef Algo algo_type;
+ static const uint32 M = 4;
+ static const uint32 D = algo_type::nElem;
+
+ FORCE_INLINE static void loop(const algo_type& e, uint32 *ri, const T* zi, uint32 n)
+ {
+ typename algo_type::Constants cst;
+ e.initConstants(cst);
+
+ uint32 j = 0;
+ while (j + (D*M) <= n) {
+ Details::Body<M>::template iteration<D>(e, ri + j, zi + j, cst);
+ j += (D*M);
+ }
+ while (j + D <= n) {
+ e.vectorial(ri + j, zi + j, cst);
+ j += D;
+ }
+ while (D > 1 && j < n) {
+ ri[j] = e.scalar(zi[j]);
+ j += 1;
+ }
+ }
+ };
+
+ template <uint32 nIterTot, uint32 nIterLeft>
+ struct _Pipeliner
+ {
+ template <typename Expr, typename Data>
+ FORCE_INLINE static void go(const Expr& e, Data* d)
+ {
+ e.template run<nIterTot - nIterLeft>(d);
+ _Pipeliner<nIterTot, nIterLeft - 1>::go(e, d);
+ }
+ };
+
+ template <uint32 nIterTot>
+ struct _Pipeliner<nIterTot, 0>
+ {
+ template <typename Expr, typename Data>
+ FORCE_INLINE static void go(const Expr& e, Data* d)
+ {
+ }
+ };
+
+ template <uint32 nIter>
+ struct Pipeliner
+ {
+ template <typename Expr, typename Data>
+ FORCE_INLINE static void go(const Expr& e, Data* d)
+ {
+ _Pipeliner<nIter, nIter>::go(e, d);
+ }
+ };
+
+
+#if 1
+ template <class T>
+ char is_complete_impl(char (*)[sizeof(T)]);
+
+ template <class>
+ long is_complete_impl(...);
+
+ template <class T>
+ struct IsComplete
+ {
+ static const bool value = sizeof(is_complete_impl<T>(0)) == sizeof(char);
+ };
+#else
+ template <class T, std::size_t = sizeof(T)>
+ std::true_type is_complete_impl(T *);
+
+ std::false_type is_complete_impl(...);
+
+ template <class T>
+ struct IsComplete : decltype(is_complete_impl(std::declval<T*>())) {};
+#endif
+
+template <typename T, Algos A>
+struct AlgoScalarToVec : AlgoScalarBase<T,A>
+{
+ typedef AlgoScalarBase<T, A> base_t;
+
+ AlgoScalarToVec(const typename base_t::Data& d) : base_t(d) {}
+ AlgoScalarToVec(const T* px, const uint32 n) : base_t(px, n) {}
+
+ static const uint32 nElem = 1;
+
+ struct Constants
+ {
+ };
+
+ void initConstants(Constants& cst) const
+ {
+ }
+
+ FORCE_INLINE
+ void vectorial(uint32 *pr, const T *pz, const Constants& cst) const
+ {
+ *pr = base_t::scalar(*pz);
+ }
+};
+
+template<bool B, class T, class F>
+struct conditional { typedef T type; };
+
+template<class T, class F>
+struct conditional<false, T, F> { typedef F type; };
+
+template <typename T, bool C>
+struct CondData
+{
+ FORCE_INLINE CondData(T x) : v(x) {}
+ FORCE_INLINE operator const T&() const { return v;}
+private:
+ T v;
+};
+
+template <typename T>
+struct CondData<T,false>
+{
+ FORCE_INLINE CondData(T) {}
+ FORCE_INLINE operator const T() const { return 0;}
+};
+
+template <InstrSet I, typename T, Algos A, bool L=false>
+struct BinAlgoBase : Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value
+ , Details::AlgoVecBase<I, T, A>
+ , Details::AlgoScalarToVec<T,A>
+ >::type
+{
+ typedef typename Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value
+ , Details::AlgoVecBase<I, T, A>
+ , Details::AlgoScalarToVec<T,A>
+ >::type base_t;
+
+ BinAlgoBase(const T* px, const uint32 n) : base_t(px, n) {}
+ BinAlgoBase(const typename base_t::Data& d) : base_t(d) {}
+};
+
+} // namespace Details
+
+} // namespace BinSearch
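
Finally, the IsComplete detection in Type.h is what lets BinAlgoBase fall back to AlgoScalarToVec whenever no vectorized AlgoVecBase specialization exists for a given (InstrSet, T, Algo) triple. A standalone sketch of the same idiom (hypothetical types, not part of the commit):

// Standalone sketch of the IsComplete idiom used by BinAlgoBase.
struct Defined { int x; };   // complete type
struct Declared;             // forward-declared only: incomplete

template <class T> char probe(char (*)[sizeof(T)]);  // viable only if sizeof(T) is valid
template <class>   long probe(...);

template <class T>
struct IsCompleteSketch
{
    static const bool value = sizeof(probe<T>(0)) == sizeof(char);
};

// IsCompleteSketch<Defined>::value  == true   -> the vectorized base is selected
// IsCompleteSketch<Declared>::value == false  -> conditional<> falls back to AlgoScalarToVec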