Diffstat (limited to 'include')
 -rw-r--r--   include/AAlloc.h             |  86
 -rw-r--r--   include/Algo-Direct-Common.h | 341
 -rw-r--r--   include/Algo-Direct2.h       | 305
 -rw-r--r--   include/AlgoXCodes.h         |  23
 -rw-r--r--   include/BinAlgo.h            |  77
 -rw-r--r--   include/BinSearch.h          |  11
 -rw-r--r--   include/Portable.h           | 151
 -rw-r--r--   include/SIMD.h               | 562
 -rw-r--r--   include/Type.h               | 221
9 files changed, 1777 insertions, 0 deletions
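Taken together, these headers implement a vectorized "direct" binary search over a sorted array X: a precomputed bucket table turns each query z into a near-exact index guess, which at most two comparisons then correct. A minimal driver, as a sketch (the array values and the SSE/Direct2 choice are illustrative, not part of the commit):

    #include "BinSearch.h"
    #include <cstdio>

    int main()
    {
        using namespace BinSearch;

        const double x[] = { 0.0, 1.0, 2.5, 4.0, 8.0 };   // sorted knots
        const uint32 n = sizeof(x) / sizeof(x[0]);

        BinAlgo<SSE, double, Direct2> searcher(x, n);     // see BinAlgo.h below

        const double z[] = { 0.5, 2.5, 7.9 };             // queries in [x[0], x[n-1])
        uint32 r[3];
        searcher.vectorial(r, z, 3);

        for (uint32 k = 0; k < 3; ++k)                    // expect 0, 2, 3
            std::printf("x[%u] <= %g < x[%u]\n", r[k], z[k], r[k] + 1);
        return 0;
    }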
diff --git a/include/AAlloc.h b/include/AAlloc.h new file mode 100644 index 0000000..6c2ae41 --- /dev/null +++ b/include/AAlloc.h @@ -0,0 +1,86 @@ +#pragma once + +#include "Portable.h" + +namespace BinSearch { +namespace Details { + +template <typename T> +bool isAligned(const T *p, size_t A) +{ + return (reinterpret_cast<size_t>(p) % A) == 0; +} + +template <class T, size_t A=64> +struct AlignedVec +{ + AlignedVec() + : m_storage(0) + , m_data(0) + , m_sz(0) + { + } + + static size_t nBytes(size_t sz) + { + return sz * sizeof(T) + A; + } + + static size_t shiftAmt(char *p) + { + return A>1? (A - (reinterpret_cast<size_t>(p) % A)) % A: 0; + } + + void setPtr(char *p, size_t sz) + { + m_sz = sz; + m_data = reinterpret_cast<T *>(p + shiftAmt(p)); + } + + //void setPtr(T *p, size_t sz) + //{ + // m_sz = sz; + // if (A>1) + // myassert(((reinterpret_cast<size_t>(p) % A) == 0), "bad alignment"); + // m_data = p; + //} + + // internal allocation + void resize(size_t sz) + { + m_storage = new char[nBytes(sz)]; + setPtr(m_storage, sz); + } + + // external allocation + void set(char *storage, size_t sz) + { + setPtr(storage, sz); + } + + ~AlignedVec() + { + if (m_storage) + delete [] m_storage; + } + + size_t size() const { return m_sz; } + T& operator[](size_t i) { return m_data[i]; } + const T& operator[](size_t i) const { return m_data[i]; } + T* begin() { return m_data; } + T* end() { return m_data+m_sz; } + const T* begin() const { return m_data; } + const T* end() const { return m_data+m_sz; } + T& front() { return m_data[0]; } + T& back() { return m_data[m_sz-1]; } + const T& front() const { return m_data[0]; } + const T& back() const { return m_data[m_sz - 1]; } + +private: + char *m_storage; + T *m_data; + size_t m_sz; +}; + +} // namespace Details +} // namespace BinSearch diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h new file mode 100644 index 0000000..cf5f0c9 --- /dev/null +++ b/include/Algo-Direct-Common.h @@ -0,0 +1,341 @@ +#pragma once + +#include <algorithm> +#include <limits> +#include <type_traits> +#include "AAlloc.h" + +namespace BinSearch { +namespace Details { + +namespace DirectAux { + +#define SAFETY_MULTI_PASS true + +template <typename T> +struct HResults +{ + HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {} + T H; + double hRatio; + size_t nInc; +}; + + +#ifdef USE_FMA +template <Algos A> struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); }; +template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); }; +template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); }; +#else +template <Algos A> struct IsDirect { static const bool value = (A == Direct); }; +template <Algos A> struct IsDirect2 { static const bool value = (A == Direct2); }; +template <Algos A> struct IsDirectCache { static const bool value = (A == DirectCache); }; +#endif + +// general definition +template <Algos A, typename T, typename Enable = void> +struct BucketElem +{ + FORCE_INLINE void set( uint32 b, const T *) + { + m_b = b; + } + + FORCE_INLINE uint32 index() const { return m_b; } + +private: + uint32 m_b; +}; + +// specialization for DirectCache methods + +template <typename T> struct MatchingIntType; +template <> struct MatchingIntType<double> { typedef uint64 type; }; +template <> struct MatchingIntType<float> { typedef uint32 type; }; + +template <Algos A, typename T> +struct BucketElem<A, T, typename std::enable_if< 
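A brief illustration of the allocation scheme in AAlloc.h above: nBytes over-allocates by A bytes so that shiftAmt can slide the start of the data to the next A-byte boundary. The same arithmetic, standalone (function and variable names here are mine):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // mirrors AlignedVec<T,A>::shiftAmt: distance from p to the next A-byte boundary
    static std::size_t shift_amt(const char* p, std::size_t A)
    {
        return A > 1 ? (A - (reinterpret_cast<std::uintptr_t>(p) % A)) % A : 0;
    }

    int main()
    {
        char raw[100 * 8 + 64];                    // nBytes(100) for T=double, A=64
        char* aligned = raw + shift_amt(raw, 64);  // where m_data would point
        std::printf("%d\n", (int)(reinterpret_cast<std::uintptr_t>(aligned) % 64)); // 0
        return 0;
    }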
IsDirectCache<A>::value >::type > +{ + typedef typename MatchingIntType<T>::type I; + + void set(uint32 b, const T *xi) + { + u.u.x = xi[b]; + u.u.b = b; + } + + FORCE_INLINE I index() const { return u.u.b; } + FORCE_INLINE T x() const { return u.u.x; } + +private: + union { + double dummy; + struct + { + T x; + I b; + } u; + } u; +}; + + +template <bool UseFMA, unsigned char Gap, typename T> +struct DirectTraits +{ + static void checkH(T scaler, T x0, T xN) + { + T Dn = xN - x0; + T ifmax = Dn * scaler; + myassert((ifmax < std::numeric_limits<uint32>::max() - (Gap - 1)), + "Problem unfeasible: index size exceeds uint32 capacity:" + << " D[N] =" << Dn + << ", H =" << scaler + << ", H D[n] =" << ifmax << "\n" + ); + } + + FORCE_INLINE static uint32 f(T scaler, T x0, T z) + { + T tmp = scaler * (z - x0); +#ifdef USE_SSE2 + return ftoi(FVec1<SSE,T>(tmp)); +#else + return static_cast<uint32>(tmp); +#endif + } + + template <InstrSet I> + FORCE_INLINE static typename FTOITraits<I, T>::vec_t f(const FVec<I, T>& scaler, const FVec<I, T>& x0, const FVec<I, T>& z) + { + return ftoi(scaler*(z-x0)); + } + + static T cst0(T scaler, T x0) + { + return x0; + } +}; + +#ifdef USE_FMA +template <unsigned char Gap, typename T> +struct DirectTraits<true,Gap,T> +{ + typedef FVec1<SSE, T> fVec1; + + static void checkH(T scaler, T H_Times_x0, T xN) + { + union { + typename FVec1<SSE, T>::vec_t v; + T s; + } ifmax; + ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0)); + myassert((ifmax.s < std::numeric_limits<uint32>::max() - (Gap - 1)), + "Problem unfeasible: index size exceeds uint32 capacity:" + << " H X[0] =" << H_Times_x0 + << ", H =" << scaler + << ", X[N] =" << xN + << ", H X[N] - H X[0] =" << ifmax.s << "\n" + ); + } + + FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi) + { + return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0))); + } + + template <InstrSet I> + FORCE_INLINE static typename FTOITraits<I,T>::vec_t f(const FVec<I,T>& scaler, const FVec<I, T>& H_Times_X0, const FVec<I, T>& z) + { + return ftoi(mulSub(scaler, z, H_Times_X0)); + } + + static T cst0(T scaler, T x0) + { + return scaler*x0; + } +}; +#endif + +template <unsigned char Gap, typename T, Algos A> +struct DirectInfo +{ + static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA); + typedef DirectTraits<UseFMA, Gap, T> fun_t; + typedef BucketElem<A,T> bucket_t; + typedef AlignedVec<bucket_t> bucketvec_t; + + struct Data { + Data() : buckets(0), xi(0), scaler(0), cst0(0) {} + Data( const T *x // for Direct must persist if xws=NULL + , uint32 n + , T H + , bucket_t *bws // assumed to gave size nb, as computed below + , T *xws = NULL // assumed to have size (n+Gap-1). 
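The core of the direct method is the bucket map f defined above: a query becomes a table index with one multiply, one subtract, and a float-to-int truncation; the FMA specialization folds the two floating-point operations into one fused op by storing H*X[0] rather than X[0]. Scalar equivalents, as a sketch:

    #include <stdint.h>

    // non-FMA form: cst0 holds x0, index = trunc(H * (z - x0))
    inline uint32_t bucket(double H, double x0, double z)
    {
        return (uint32_t)(H * (z - x0));
    }

    // FMA form: cst0 holds H*x0, index = trunc(H*z - H*x0), one fused op
    inline uint32_t bucket_fma(double H, double H_times_x0, double z)
    {
        return (uint32_t)(H * z - H_times_x0);
    }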
Optional for Direct, unused for DirectCache, required for DirectGap + ) + : buckets(bws) + , scaler(H) + , cst0(fun_t::cst0(H, x[0])) + { + myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned"); + + uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]); + + const uint32 npad = Gap-1; + const uint32 n_sz = n + npad; // size of padded vector + + if (xws) { + myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned"); + std::fill_n(xws, npad, x[0]); // pad in front with x[0] + std::copy(x, x+n, xws + npad); + xi = xws; + } + else { + myassert(Gap==1, "if Gap>1 then X workspace must be provided"); + xi = x; + } + + populateIndex(bws, nb, xi, n_sz, scaler, cst0); + } + + const bucket_t *buckets; + const T *xi; + T scaler; + T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not + } data; + + static T growStep(T H) + { + T step; + T P = next(H); + while ((step = P - H) == 0) + P = next(P); + return step; + } + + static HResults<T> computeH(const T *px, uint32 nx) + { + myassert((nx > Gap), "Array X too small"); + myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap"); + + const T x0 = px[0]; + const T xN = px[nx-1]; + + const T range = xN - x0; + myassert((range < std::numeric_limits<T>::max()), "range too large"); + + // check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i + T deltaDMin = range; + for (uint32 i = Gap; i < nx; ++i) { + T Dnew = px[i] - x0; + T Dold = px[i - Gap] - x0; + myassert((Dnew > Dold), + "Problem unfeasible: D_i sequence not strictly increasing" + << " X[" << 0 << "]=" << x0 + << " X[" << i - Gap << "]=" << px[i - Gap] + << " X[" << i << "]=" << px[i] + << "\n" + ); + T deltaD = Dnew - Dold; + if (deltaD < deltaDMin) + deltaDMin = deltaD; + } + + // initial guess for H + const T H0 = T(1.0) / deltaDMin; + T H = H0; + + T cst0 = fun_t::cst0(H, x0); + fun_t::checkH(H, cst0, xN); + + // adjust H by trial and error until succeed + size_t nInc = 0; + bool modified = false; + size_t npasses = 0; + T step = growStep(H); + uint32 seg_already_checked_from = nx; + do { + myassert((npasses++ < 2), "verification failed\n"); + // if there has been an increase, then check only up to that point + uint32 last_seg_to_be_checked = seg_already_checked_from - 1; + modified = false; + uint32 inew = 0; + for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) { + uint32 iold = fun_t::f(H, cst0, px[i-Gap]); + uint32 inew = fun_t::f(H, cst0, px[i]); + while (inew == iold) { + seg_already_checked_from = i; + last_seg_to_be_checked = nx-1; // everything needs to be checked + modified = true; + H = H + step; + step *= 2; + // recalculate all constants and indices + cst0 = fun_t::cst0(H, x0); + fun_t::checkH(H, cst0, xN); + iold = fun_t::f(H, cst0, px[i - Gap]); + inew = fun_t::f(H, cst0, px[i]); + } + } + } while (SAFETY_MULTI_PASS && modified); + + return HResults<T>(H, (((double)H) / H0) - 1.0, nInc); + } + + static void populateIndex(BucketElem<A, T> *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0) + { + for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) { + uint32 idx = fun_t::f(scaler, cst0, px[i]); + while (b > idx) { // in the 1st iteration it is j=0 but this condition is always false + buckets[b].set( j, px ); + --b; + } + if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant + j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check + buckets[b].set(j, 
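computeH above starts from H0 = 1/min(D_i - D_{i-Gap}) and, because truncation can still collapse two knots into one bucket, grows H by representable steps (growStep) until the map separates every pair of knots Gap apart. The property the trial-and-error loop certifies, written out as a scalar check (a sketch):

    #include <stdint.h>
    #include <cassert>

    // what computeH guarantees on exit: knots Gap apart never share a bucket,
    // so a bucket covers at most Gap knots and Direct2 needs only two fix-ups
    void assert_feasible(double H, double x0, const double* x, uint32_t n, uint32_t Gap)
    {
        for (uint32_t i = Gap; i < n; ++i) {
            uint32_t lo = (uint32_t)(H * (x[i - Gap] - x0));
            uint32_t hi = (uint32_t)(H * (x[i]       - x0));
            assert(hi > lo);   // computeH bumps H and rescans until this holds
        }
    }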
px); + if (b-- == 0) + break; + } + } + } + + DirectInfo(const Data& d) + : data(d) + { + } + + DirectInfo(const T* px, const uint32 n) + { + HResults<T> res = computeH(px, n); + +#ifdef PAPER_TEST + nInc = res.nInc; + hRatio = res.hRatio; +#endif + const uint32 npad = Gap-1; + const uint32 n_sz = n + npad; // size of padded vector + + if (npad) + xi.resize(n_sz); + + T H = res.H; + T cst0 = fun_t::cst0(H, px[0]); + const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]); + buckets.resize(maxIndex + 1); + + data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL)); + } + +private: + bucketvec_t buckets; + AlignedVec<T,8> xi; + +#ifdef PAPER_TEST +public: + double hRatio; + size_t nInc; +#endif +}; + + +} // namespace DirectAux +} // namespace Details +} // namespace BinSearch diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h new file mode 100644 index 0000000..d5fa58d --- /dev/null +++ b/include/Algo-Direct2.h @@ -0,0 +1,305 @@ +#pragma once + +#include "Algo-Direct-Common.h" + +namespace BinSearch { +namespace Details { + +template <typename T, Algos A> +struct AlgoScalarBase<T, A, typename std::enable_if<DirectAux::IsDirect2<A>::value>::type> : DirectAux::DirectInfo<2, T, A> +{ +private: + typedef DirectAux::DirectInfo<2, T, A> base_t; + static const size_t Offset=2; + +public: + AlgoScalarBase(const T* x, const uint32 n) + : base_t(x, n) + { + } + + FORCE_INLINE uint32 scalar(T z) const + { + const T* px = base_t::data.xi; + const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets); + uint32 bidx = base_t::fun_t::f(base_t::data.scaler, base_t::data.cst0, z); + uint32 iidx = buckets[bidx]; + px += iidx; + if (z < *px) + --iidx; + if (z < *(px+1)) + --iidx; + return iidx; + } +}; + + +template <InstrSet I, typename T, Algos A> +struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::value>::type> : AlgoScalarBase<T, A> +{ + static const uint32 nElem = sizeof(typename InstrFloatTraits<I, T>::vec_t) / sizeof(T); + + typedef FVec<I, T> fVec; + typedef IVec<SSE, T> i128; + + struct Constants + { + fVec vscaler; + fVec vcst0; + IVec<I, T> one; + }; + +private: + typedef AlgoScalarBase<T, A> base_t; + + FORCE_INLINE + //NO_INLINE + void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const + { + union U { + __m128i vec; + uint32 ui32[4]; + } u; + + const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets); + const float *xi = base_t::data.xi; + + // read indices t + const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]); + const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]); + const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]); + const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]); + +#if 0 + // read pairs ( X(t-1), X(t) ) + __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); + __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); + __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); + __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); + + // build: + // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } + // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } + __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); + __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); + __m128 u01 = _mm_unpacklo_ps(h02, h13); + __m128 u23 = _mm_unpackhi_ps(h02, h13); + __m128 vxm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); + __m128 vxp = _mm_shuffle_ps(u01, 
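For reference, the whole Direct2 scalar probe above collapses to a few lines; xi is the front-padded knot array (data.xi) and buckets stores plain uint32 entries. A scalar rendition (a sketch using the non-FMA map, where cst0 is X[0]):

    #include <stdint.h>

    uint32_t probe(const double* xi, const uint32_t* buckets,
                   double scaler, double cst0, double z)
    {
        uint32_t b = (uint32_t)(scaler * (z - cst0)); // DirectTraits::f
        uint32_t t = buckets[b];                      // initial guess
        uint32_t i = t;
        if (z < xi[t])     --i;                       // z below X(t-1)
        if (z < xi[t + 1]) --i;                       // z below X(t)
        return i;                                     // guess off by at most 2, now exact
    }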
u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); +#else + __m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2)); + __m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0)); + __m128 vxm = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6)); + __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6)); +#endif + IVec<SSE, float> i(u.vec); + IVec<SSE, float> vlem = vz < vxm; + IVec<SSE, float> vlep = vz < vxp; + i = i + vlem + vlep; + i.store(pr); + } + + FORCE_INLINE + //NO_INLINE + void resolve(const FVec<SSE, double>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const + { + const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets); + const double *xi = base_t::data.xi; + + uint32 b1 = buckets[bidx.get1()]; + uint32 b0 = buckets[bidx.get0()]; + + const double *p1 = &xi[b1]; + const double *p0 = &xi[b0]; + + // read pairs ( X(t-1), X(t) ) + __m128d vx1 = _mm_loadu_pd(p1); + __m128d vx0 = _mm_loadu_pd(p0); + + // build: + // { X(t(0)-1), X(t(1)-1) } + // { X(t(0)), X(t(1)) } + __m128d vxm = _mm_shuffle_pd(vx0, vx1, 0); + __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3); + + IVec<SSE, double> i(b1, b0); + IVec<SSE, double> vlem = (vz < vxm); + IVec<SSE, double> vlep = (vz < vxp); + i = i + vlem + vlep; + + union { + __m128i vec; + uint32 ui32[4]; + } u; + u.vec = i; + pr[0] = u.ui32[0]; + pr[1] = u.ui32[2]; + } + +#ifdef USE_AVX + + FORCE_INLINE + //NO_INLINE + void resolve(const FVec<AVX, float>& vz, const IVec<AVX, float>& bidx, uint32 *pr) const + { + const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets); + const float *xi = base_t::data.xi; + +#if 0 // use gather instructions + + IVec<AVX,float> idxm; + idxm.setidx(buckets, bidx); + __m256i z = _mm256_setzero_si256(); + IVec<AVX,float> minusone = _mm256_cmpeq_epi32(z,z); + IVec<AVX,float> idxp = idxm - minusone; + + FVec<AVX, float> vxm = _mm256_i32gather_ps(xi, idxm, sizeof(float)); + FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float)); + IVec<AVX, float> ip = idxm; + +#else // do not use gather instrucions + + union U { + __m256i vec; + uint32 ui32[8]; + } u; + + // read indices t + + const double *p7 = reinterpret_cast<const double *>(&xi[(u.ui32[7] = buckets[bidx.get7()])]); + const double *p6 = reinterpret_cast<const double *>(&xi[(u.ui32[6] = buckets[bidx.get6()])]); + const double *p5 = reinterpret_cast<const double *>(&xi[(u.ui32[5] = buckets[bidx.get5()])]); + const double *p4 = reinterpret_cast<const double *>(&xi[(u.ui32[4] = buckets[bidx.get4()])]); + const double *p3 = reinterpret_cast<const double *>(&xi[(u.ui32[3] = buckets[bidx.get3()])]); + const double *p2 = reinterpret_cast<const double *>(&xi[(u.ui32[2] = buckets[bidx.get2()])]); + const double *p1 = reinterpret_cast<const double *>(&xi[(u.ui32[1] = buckets[bidx.get1()])]); + const double *p0 = reinterpret_cast<const double *>(&xi[(u.ui32[0] = buckets[bidx.get0()])]); + +#if 0 // perform 8 loads in double precision + + // read pairs ( X(t-1), X(t) ) + __m128 xp7 = _mm_castpd_ps(_mm_load_sd(p7)); + __m128 xp6 = _mm_castpd_ps(_mm_load_sd(p6)); + __m128 xp5 = _mm_castpd_ps(_mm_load_sd(p5)); + __m128 xp4 = _mm_castpd_ps(_mm_load_sd(p4)); + __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); + __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); + __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); + __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); + + // build: + // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } + // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } + __m128 h57 = _mm_shuffle_ps(xp5, xp7, (1 << 2) + (1 << 6)); // F- F+ H- 
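A note on the float path above: because the knots are contiguous, reinterpreting &xi[t] as const double* fetches the adjacent pair (X(t-1), X(t)) with a single 64-bit load, and two shuffles split four such pairs into a vector of lower knots and a vector of upper knots. The deinterleave in isolation (a compilable sketch; the values are illustrative):

    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        const float xi[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };   // four adjacent pairs
        const double* p0 = reinterpret_cast<const double*>(&xi[0]);
        const double* p1 = reinterpret_cast<const double*>(&xi[2]);
        const double* p2 = reinterpret_cast<const double*>(&xi[4]);
        const double* p3 = reinterpret_cast<const double*>(&xi[6]);

        __m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0));   // 1 2 3 4
        __m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2));   // 5 6 7 8
        __m128 vxm  = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6)); // 1 3 5 7
        __m128 vxp  = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6)); // 2 4 6 8

        float m[4], p[4];
        _mm_storeu_ps(m, vxm);
        _mm_storeu_ps(p, vxp);
        std::printf("lower: %g %g %g %g\n", m[0], m[1], m[2], m[3]);
        std::printf("upper: %g %g %g %g\n", p[0], p[1], p[2], p[3]);
        return 0;
    }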
H+ + __m128 h46 = _mm_shuffle_ps(xp4, xp6, (1 << 2) + (1 << 6)); // E- E+ G- G+ + __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); // B- B+ D- D+ + __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); // A- A+ C- C+ + + __m128 u01 = _mm_unpacklo_ps(h02, h13); // A- B- A+ B+ + __m128 u23 = _mm_unpackhi_ps(h02, h13); // C- D- C+ D+ + __m128 u45 = _mm_unpacklo_ps(h46, h57); // E- F- E+ F+ + __m128 u67 = _mm_unpackhi_ps(h46, h57); // G- H- G+ H+ + + __m128 abcdm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // A- B- C- D- + __m128 abcdp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // A+ B+ C+ D+ + __m128 efghm = _mm_shuffle_ps(u45, u67, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // E- F- G- H- + __m128 efghp = _mm_shuffle_ps(u45, u67, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // E+ F+ G+ H+ + + FVec<AVX, float> vxp = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdm), efghm, 1); + FVec<AVX, float> vxm = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdp), efghp, 1); + + IVec<AVX, float> ip(u.vec); + +#else // use __mm256_set_pd + + // read pairs ( X(t-1), X(t) ) + __m256 x0145 = _mm256_castpd_ps(_mm256_set_pd(*p5, *p4, *p1, *p0)); // { x0(t-1), x0(t), x1(t-1), x1(t), x4(t-1), x4(t), x5(t-1), x5(t) } + __m256 x2367 = _mm256_castpd_ps(_mm256_set_pd(*p7, *p6, *p3, *p2)); // { x2(t-1), x2(t), x3(t-1), x3(t), x6(t-1), x6(t), x7(t-1), x7(t) } + + // { x0(t-1), x1(t-1), x2(t-1), 3(t-1, x4(t-1), x5(t-1), x6(t-1), xt(t-1) } + FVec<AVX, float> vxm = _mm256_shuffle_ps(x0145, x2367, 0 + (2 << 2) + (0 << 4) + (2 << 6) ); + // { x0(t), x1(t), x2(t), 3(t, x4(t), x5(t), x6(t), xt(t) } + FVec<AVX, float> vxp = _mm256_shuffle_ps(x0145, x2367, 1 + (3 << 2) + (1 << 4) + (3 << 6) ); + + IVec<AVX, float> ip(u.vec); + +#endif + +#endif + + IVec<AVX, float> vlem = vz < vxm; + IVec<AVX, float> vlep = vz < vxp; + ip = ip + vlem + vlep; + + ip.store(pr); + } + + + + FORCE_INLINE + //NO_INLINE + void resolve(const FVec<AVX, double>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const + { + union { + __m256i vec; + uint64 ui64[4]; + } u; + + const uint32* buckets = reinterpret_cast<const uint32 *>(base_t::data.buckets); + const double *xi = base_t::data.xi; + + // read indices t + const double *p3 = &xi[(u.ui64[3] = buckets[bidx.get3()])]; + const double *p2 = &xi[(u.ui64[2] = buckets[bidx.get2()])]; + const double *p1 = &xi[(u.ui64[1] = buckets[bidx.get1()])]; + const double *p0 = &xi[(u.ui64[0] = buckets[bidx.get0()])]; + + // read pairs ( X(t-1), X(t) ) + __m128d xp3 = _mm_loadu_pd(p3); + __m128d xp2 = _mm_loadu_pd(p2); + __m128d xp1 = _mm_loadu_pd(p1); + __m128d xp0 = _mm_loadu_pd(p0); + + // build: + // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } + // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } + __m256d x02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp0), xp2, 1); + __m256d x13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp1), xp3, 1); + FVec<AVX, double> vxm = _mm256_unpacklo_pd(x02,x13); + FVec<AVX, double> vxp = _mm256_unpackhi_pd(x02,x13); + + +// __m128d h01m = _mm_shuffle_pd(xp0, xp1, 0); +// __m128d h23m = _mm_shuffle_pd(xp2, xp3, 0); +// __m128d h01p = _mm_shuffle_pd(xp0, xp1, 3); +// __m128d h23p = _mm_shuffle_pd(xp2, xp3, 3); +// FVec<AVX, double> vxm = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01m), h23m, 1); +// FVec<AVX, double> vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1); + + IVec<AVX, double> i(u.vec); + IVec<AVX, double> vlem = vz < vxm; + IVec<AVX, double> vlep = vz < vxp; + i = i + vlem + vlep; + 
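Worth spelling out, since every resolve() variant ends with the same three instructions: the SIMD compares return all-ones lane masks, which read as integer -1, so adding vlem and vlep walks the candidate index back by 0, 1 or 2 per lane. This is the branch-free form of the two --iidx corrections in the scalar path:

    // per lane: i + (z < X(t-1) ? -1 : 0) + (z < X(t) ? -1 : 0)
    // e.g. a guess of 5 with z below both knots resolves to 5 - 1 - 1 = 3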
i.extractLo32s().store(pr); + } +#endif + +public: + + AlgoVecBase(const T* x, const uint32 n) : base_t(x, n) {} + + void initConstants(Constants& cst) const + { + cst.vscaler.setN(base_t::data.scaler); + cst.vcst0.setN(base_t::data.cst0); + cst.one.setN(uint32(1)); + } + + void vectorial(uint32 *pr, const T *pz, const Constants& cst) const + { + fVec vz(pz); + resolve(vz, base_t::fun_t::f(cst.vscaler, cst.vcst0, vz), pr); + } +}; +} // namespace Details +} // namespace BinSearch diff --git a/include/AlgoXCodes.h b/include/AlgoXCodes.h new file mode 100644 index 0000000..bdc9b00 --- /dev/null +++ b/include/AlgoXCodes.h @@ -0,0 +1,23 @@ +ALGOENUM(DirectCacheFMA, 5) +ALGOENUM(DirectFMA, 15) +ALGOENUM(Direct2FMA, 25) +ALGOENUM(DirectCache, 10) +ALGOENUM(Direct, 20) +ALGOENUM(Direct2, 30) +ALGOENUM(Nonary, 40) +ALGOENUM(Pentary, 50) +ALGOENUM(Ternary, 60) +ALGOENUM(Eytzinger, 70) +ALGOENUM(BitSet, 80) +ALGOENUM(ClassicOffset, 90) +#ifdef PAPER_TEST +ALGOENUM(MorinOffset, 100) +ALGOENUM(BitSetNoPad, 110) +ALGOENUM(ClassicMod, 120) +ALGOENUM(MorinBranchy, 130) +ALGOENUM(Classic, 140) +ALGOENUM(LowerBound, 145) +#ifdef USE_MKL +ALGOENUM(MKL, 150) +#endif +#endif diff --git a/include/BinAlgo.h b/include/BinAlgo.h new file mode 100644 index 0000000..aac67a0 --- /dev/null +++ b/include/BinAlgo.h @@ -0,0 +1,77 @@ +#pragma once + +#include "Type.h" +#include <algorithm> + +namespace BinSearch { + +template <InstrSet I, typename T, Algos A, bool L=false, bool R=false> +struct BinAlgo : Details::BinAlgoBase<I,T,A> +{ + typedef Details::BinAlgoBase<I,T,A> base_t; + + BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {} + BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {} + + FORCE_INLINE + uint32 scalar(T z) const + { + if (!L || z >= x0) + if (!R || z < xN) + return base_t::scalar(z); + else + return N; + else + return std::numeric_limits<uint32>::max(); + } + + + FORCE_INLINE + void vectorial(uint32 *pr, const T *pz, uint32 n) const + { + if (!L && !R) { + Details::Loop<T,base_t>::loop(*this, pr, pz, n); + } + else { + const uint32 nElem = base_t::nElem; + const uint32 idealbufsize = 256; + const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0)); + T databuf[bufsize]; + uint32 resbuf[bufsize]; + uint32 indexbuf[bufsize]; + + uint32 *prend = pr + n; + while(pr != prend) { + uint32 cnt = 0; + uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend)); + for (uint32 j = 0; j < niter; ++j) { + T z = pz[j]; + // FIXME: use SSE2? 
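To make the screening explicit: when L or R is set, each query is checked against [x0, xN) before being batched, out-of-range values are resolved immediately to the same sentinels scalar() returns, and only in-range values enter databuf for the vector kernel:

    // BinAlgo<I,T,A,true,true>, schematically:
    //   z <  x0  -> std::numeric_limits<uint32>::max()
    //   z >= xN  -> N
    //   else     -> base_t lookup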
+ if (!L || z >= x0) + if (!R || z < xN) { + databuf[cnt] = z; + indexbuf[cnt] = j; + ++cnt; + } + else + pr[j] = N; + else + pr[j] = std::numeric_limits<uint32>::max(); + } + // FIXME: merge these two loops + Details::Loop<T,base_t>::loop(*this, resbuf, databuf, cnt); + for (uint32 j = 0; j < cnt; ++j) + pr[indexbuf[j]] = resbuf[j]; + pr += niter; + pz += niter; + } + } + } + + Details::CondData<T,L> x0; + Details::CondData<T,R> xN; + Details::CondData<uint32,R> N; +}; + + +} // namespace BinSearch diff --git a/include/BinSearch.h b/include/BinSearch.h new file mode 100644 index 0000000..336f529 --- /dev/null +++ b/include/BinSearch.h @@ -0,0 +1,11 @@ +#pragma once + +#include "AAlloc.h" +#include "BinAlgo.h" +#include "SIMD.h" + +#include <algorithm> +#include <limits> + + +#include "Algo-Direct2.h" diff --git a/include/Portable.h b/include/Portable.h new file mode 100644 index 0000000..1710b05 --- /dev/null +++ b/include/Portable.h @@ -0,0 +1,151 @@ +#pragma once +#include <limits> +#include <cmath> +#include <stdexcept> +#include <sstream> + +#ifdef __FMA__ +#define USE_FMA +#endif + +#ifdef __AVX2__ +#define USE_AVX2 +#endif + +#ifdef __AVX__ +#define USE_AVX +#endif + + +#ifdef __SSE4_1__ +#define USE_SSE41 +#endif + +#ifdef __SSE4_2__ +#define USE_SSE42 +#endif + + +#ifndef _MSC_VER +#include <stdint.h> +#endif + +namespace BinSearch { + +#ifndef _MSC_VER +typedef int8_t int8; +typedef uint8_t uint8; +typedef int32_t int32; +typedef uint32_t uint32; +typedef int64_t int64; +typedef uint64_t uint64; +#else +typedef __int8 int8; +typedef unsigned __int8 uint8; +typedef __int32 int32; +typedef unsigned __int32 uint32; +typedef __int64 int64; +typedef unsigned __int64 uint64; +#endif + +namespace Details { + +#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } + +// log2 is not defined in VS2008 +#if defined(_MSC_VER) +inline uint32 log2 (uint32 val) { + if (val == 1) return 0; + uint32 ret = 0; + do { + ret++; + val >>= 1; + } while (val > 1); + return ret; +} +#endif + +#ifdef _DEBUG +#define DEBUG +#endif + +#ifdef _MSC_VER +# define FORCE_INLINE __forceinline +# define NO_INLINE __declspec(noinline) +#else +# define NO_INLINE __attribute__((noinline)) +# ifdef DEBUG +# define FORCE_INLINE NO_INLINE +# else +# define FORCE_INLINE __attribute__((always_inline)) inline +# endif +#endif + +#ifdef USE_AVX +#define COMISS "vcomiss" +#define COMISD "vcomisd" +#else +#define COMISS "comiss" +#define COMISD "comisd" +#endif + +// nextafter is not defined in VS2008 +#if defined(_MSC_VER) && (_MSC_VER <= 1500) +#include <float.h> +inline float mynext(float x) +{ + return _nextafterf(x, std::numeric_limits<float>::max()); +} + +inline double mynext(double x) +{ + return _nextafter(x, std::numeric_limits<double>::max()); +} +inline float myprev(float x) +{ + return _nextafterf(x, -std::numeric_limits<float>::max()); +} + +inline double myprev(double x) +{ + return _nextafter(x, -std::numeric_limits<double>::max()); +} +#else +inline float mynext(float x) +{ + return std::nextafterf(x, std::numeric_limits<float>::max()); +} + +inline double mynext(double x) +{ + return std::nextafter(x, std::numeric_limits<double>::max()); +} +inline float myprev(float x) +{ + return std::nextafterf(x, -std::numeric_limits<float>::max()); +} + +inline double myprev(double x) +{ + return std::nextafter(x, -std::numeric_limits<double>::max()); +} +#endif + +template <typename T> +inline T next(T x) +{ + 
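A remark on the myassert macro defined in Portable.h above: it throws std::invalid_argument with a streamed message, and since the macro expands !cond verbatim, call sites parenthesize compound conditions themselves, as every use in Algo-Direct-Common.h does. A usage sketch with a hypothetical helper:

    #include "Portable.h"

    // hypothetical validation helper; the condition is pre-parenthesized and
    // the message part is a stream expression, per the macro's expansion
    void require_knots(BinSearch::uint32 n)
    {
        myassert((n > 2), "need at least 3 knots, got " << n);
    }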
for (int i = 0; i < 4; ++i) + x = mynext(x); + return x; +} + +template <typename T> +inline T prev(T x) +{ + for (int i = 0; i < 4; ++i) + x = myprev(x); + return x; +} + +} // namepsace Details +} // namespace BinSearch diff --git a/include/SIMD.h b/include/SIMD.h new file mode 100644 index 0000000..642b80a --- /dev/null +++ b/include/SIMD.h @@ -0,0 +1,562 @@ +#pragma once + +#include "Portable.h" + +#ifdef USE_SSE42 +#ifndef _MSC_VER +#include <popcntintrin.h> +#define popcnt32 _mm_popcnt_u32 +#else +#include <intrin.h> +#define popcnt32 __popcnt +#endif +#else // USE_SSE42 +namespace BinSearch { +FORCE_INLINE int popcnt32(int x32) +{ + // strictly speaking this is not correct, as it ignores higher order bits + // however, this is only used on the resuot of movemask on a 128-bit register, which is 8 at most, so it is ok + // with 256-bit registers, SSE42 is defined, and we do not use this function + uint8 x = static_cast<uint8>(x32); + x = (x & 0x55) + (x >> 1 & 0x55); + x = (x & 0x33) + (x >> 2 & 0x33); + x = (x & 0x0f) + (x >> 4 & 0x0f); + return x; +} +} // namespace +#endif + +#if defined(USE_AVX) || defined(USE_AVX2) +#include <immintrin.h> +#else +#include <emmintrin.h> +#ifdef USE_SSE41 +#include <smmintrin.h> +#endif +#endif + +#include "Type.h" + +namespace BinSearch { +namespace Details { + +template <InstrSet I, class T> +struct FVec; + +template <InstrSet I, class T> +struct IVec; + +template <InstrSet I, class T> +struct FVec1; + +template <> struct InstrIntTraits<SSE> +{ + typedef __m128i vec_t; +}; + +template <> struct InstrFloatTraits<SSE, float> +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits<SSE, double> +{ + typedef __m128d vec_t; +}; + +template <InstrSet I, typename T> +struct FTOITraits +{ + typedef IVec<SSE, float> vec_t; +}; + +#ifdef USE_AVX + +template <> +struct FTOITraits<AVX, float> +{ + typedef IVec<AVX, float> vec_t; +}; + +template <> struct InstrIntTraits<AVX> +{ + typedef __m256i vec_t; +}; + +template <> struct InstrFloatTraits<AVX, float> +{ + typedef __m256 vec_t; +}; + +template <> struct InstrFloatTraits<AVX, double> +{ + typedef __m256d vec_t; +}; + +#endif + + +template <typename TR> +struct VecStorage +{ + typedef typename TR::vec_t vec_t; + + FORCE_INLINE operator vec_t&() { return vec; } + FORCE_INLINE operator const vec_t&() const { return vec; } + +protected: + FORCE_INLINE VecStorage() {} + FORCE_INLINE VecStorage(const vec_t& v) : vec( v ) {} + + vec_t vec; +}; + +template <InstrSet> +struct IVecBase; + +template <> +struct IVecBase<SSE> : VecStorage<InstrIntTraits<SSE>> +{ +protected: + FORCE_INLINE IVecBase() {} + FORCE_INLINE IVecBase( const vec_t& v) : VecStorage<InstrIntTraits<SSE>>( v ) {} +public: + FORCE_INLINE static vec_t zero() { return _mm_setzero_si128(); } + + FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32( vec ); } + + FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) + { +#ifdef USE_SSE41 + vec = _mm_blendv_epi8(vec, val, mask); +#else + vec = _mm_or_si128(_mm_andnot_si128(mask,vec), _mm_and_si128(mask,val)); +#endif + } + FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) + { + vec = _mm_or_si128(vec, _mm_and_si128(val,mask)); + } +}; + +template <> +struct IVec<SSE, float> : IVecBase<SSE> +{ + FORCE_INLINE IVec() {} + FORCE_INLINE IVec( int32 i ) : IVecBase<SSE>( _mm_set1_epi32( i ) ) {} + FORCE_INLINE IVec( const vec_t& v) : IVecBase<SSE>( v ) {} + FORCE_INLINE IVec( uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase<SSE>( _mm_set_epi32( u3, u2, u1, u0 ) ) 
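An aside on the popcnt32 fallback at the top of SIMD.h above: it is an 8-bit SWAR reduction, which suffices because it is only applied to movemask results of 128-bit registers (at most 8 set bits). A standalone check of the bit arithmetic (a sketch):

    #include <cstdio>

    static int popcnt8(unsigned x32)
    {
        unsigned char x = static_cast<unsigned char>(x32);
        x = (x & 0x55) + (x >> 1 & 0x55);   // sums of adjacent bit pairs
        x = (x & 0x33) + (x >> 2 & 0x33);   // sums of adjacent nibbles
        x = (x & 0x0f) + (x >> 4 & 0x0f);   // final byte sum
        return x;
    }

    int main()
    {
        for (unsigned m = 0; m < 256; ++m) {
            int ref = 0;
            for (unsigned b = m; b; b >>= 1) ref += b & 1;
            if (popcnt8(m) != ref) { std::printf("mismatch at %u\n", m); return 1; }
        }
        std::printf("popcnt8 OK for all 8-bit masks\n");
        return 0;
    }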
{} + + void setN( int32 i ) { vec = _mm_set1_epi32( i ); } + +#ifdef USE_SSE41 + FORCE_INLINE int32 get1() const { return _mm_extract_epi32(vec, 1); } + FORCE_INLINE int32 get2() const { return _mm_extract_epi32(vec, 2); } + FORCE_INLINE int32 get3() const { return _mm_extract_epi32(vec, 3); } +#else + FORCE_INLINE int32 get1() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 1 ) ); } + FORCE_INLINE int32 get2() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); } + FORCE_INLINE int32 get3() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 3 ) ); } +#endif + + FORCE_INLINE void store( uint32 *pi ) const { _mm_storeu_si128( reinterpret_cast<vec_t*>(pi), vec ); } + + FORCE_INLINE int countbit() + { + return popcnt32(_mm_movemask_ps(_mm_castsi128_ps(vec))); + } +}; + +template <> +struct IVec<SSE, double> : IVecBase<SSE> +{ + FORCE_INLINE IVec() {} + FORCE_INLINE IVec( int32 i ) : IVecBase<SSE>( _mm_set1_epi64x( i ) ) {} + FORCE_INLINE IVec( const vec_t& v) : IVecBase<SSE>( v ) {} + FORCE_INLINE IVec( uint64 u1, uint64 u0 ) : IVecBase<SSE>( _mm_set_epi64x(u1, u0) ) {} + + void setN( int32 i ) { vec = _mm_set1_epi64x( i ); } + + FORCE_INLINE int32 get1() const + { +#ifdef USE_SSE41 + return _mm_extract_epi32(vec, 2); +#else + return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); +#endif + } + + // extract the 2 32 bits integers no. 0, 2 and store them in a __m128i + FORCE_INLINE IVec<SSE,float> extractLo32s() const + { + return _mm_shuffle_epi32(vec, ((2 << 2) | 0)); + } + + FORCE_INLINE void store( uint32 *pi ) const + { + pi[0] = get0(); + pi[1] = get1(); + } + + FORCE_INLINE int countbit() + { +#if 1 + // takes 4 cycles + __m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle + __m128i s = _mm_add_epi32(vec, hi); + int32 x = _mm_cvtsi128_si32(s); + return -x; +#else + // takes 6 cycles + return popcnt32(_mm_movemask_pd(_mm_castsi128_pd(vec))); +#endif + } +}; + +template <typename T> +FORCE_INLINE IVec<SSE,T> operator>> (const IVec<SSE,T>& a, unsigned n) { return _mm_srli_epi32(a, n); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator<< (const IVec<SSE,T>& a, unsigned n) { return _mm_slli_epi32(a, n); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator& (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_and_si128( a, b ); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator| (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_or_si128( a, b ); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator^ (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_xor_si128( a, b ); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator+ (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_add_epi32( a, b ); } +template <typename T> +FORCE_INLINE IVec<SSE,T> operator- (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_sub_epi32( a, b ); } +#ifdef USE_SSE41 +template <typename T> +FORCE_INLINE IVec<SSE,T> min (const IVec<SSE,T>& a, const IVec<SSE,T>& b ) { return _mm_min_epi32( a, b ); } +#endif + +typedef VecStorage<InstrFloatTraits<SSE,float>> FVec128Float; + +template <> +struct FVec1<SSE, float> : FVec128Float +{ + FORCE_INLINE FVec1() {} + FORCE_INLINE FVec1( float f ) : FVec128Float( _mm_load_ss( &f ) ) {} + FORCE_INLINE FVec1( const vec_t& v ): FVec128Float( v ) {} + + FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } +}; + +template <> +struct FVec<SSE, float> : FVec128Float +{ + FORCE_INLINE FVec() {} + FORCE_INLINE FVec( float f ) : FVec128Float( _mm_set1_ps( f ) ) {} + FORCE_INLINE 
FVec( const float *v ) : FVec128Float( _mm_loadu_ps( v ) ) {} + FORCE_INLINE FVec( const vec_t& v) : FVec128Float(v) {} + FORCE_INLINE FVec( float f3, float f2, float f1, float f0 ) : FVec128Float( _mm_set_ps(f3, f2, f1, f0) ) {} + + void set0( float f ) { vec = _mm_load_ss( &f ); } + void setN( float f ) { vec = _mm_set1_ps( f ); } + + FORCE_INLINE void setidx( const float *xi, const IVec<SSE,float>& idx ) + { + uint32 i0 = idx.get0(); + uint32 i1 = idx.get1(); + uint32 i2 = idx.get2(); + uint32 i3 = idx.get3(); + vec = _mm_set_ps( xi[i3], xi[i2], xi[i1], xi[i0] ); + } + + FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } + FORCE_INLINE float get1() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 1 ) ); } + FORCE_INLINE float get2() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 2 ) ); } + FORCE_INLINE float get3() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 3 ) ); } +}; + +FORCE_INLINE FVec1<SSE,float> operator+ (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_add_ss( a, b ); } +FORCE_INLINE FVec1<SSE,float> operator- (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_sub_ss( a, b ); } +FORCE_INLINE FVec1<SSE,float> operator* (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_mul_ss( a, b ); } +FORCE_INLINE FVec1<SSE,float> operator/ (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_div_ss( a, b ); } +FORCE_INLINE int ftoi (const FVec1<SSE,float>& a) { return _mm_cvttss_si32(a); } +FORCE_INLINE IVec<SSE,float> operator> (const FVec1<SSE,float>& a, const FVec1<SSE,float>& b) { return _mm_castps_si128( _mm_cmpgt_ss( a, b ) ); } +#ifdef USE_FMA +FORCE_INLINE FVec1<SSE, float> mulSub(const FVec1<SSE, float>& a, const FVec1<SSE, float>& b, const FVec1<SSE, float>& c) { return _mm_fmsub_ss(a, b, c); } +#endif + +FORCE_INLINE FVec<SSE,float> operator- (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_sub_ps( a, b ); } +FORCE_INLINE FVec<SSE,float> operator* (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_mul_ps( a, b ); } +FORCE_INLINE FVec<SSE,float> operator/ (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_div_ps( a, b ); } +FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,float>& a) { return _mm_cvttps_epi32(a); } +FORCE_INLINE IVec<SSE,float> operator<= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } +FORCE_INLINE IVec<SSE,float> operator>= (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } +FORCE_INLINE IVec<SSE,float> operator< (const FVec<SSE,float>& a, const FVec<SSE,float>& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#ifdef USE_FMA +FORCE_INLINE FVec<SSE, float> mulSub(const FVec<SSE, float>& a, const FVec<SSE, float>& b, const FVec<SSE, float>& c) { return _mm_fmsub_ps(a, b, c); } +#endif + +typedef VecStorage<InstrFloatTraits<SSE,double>> FVec128Double; + +template <> +struct FVec1<SSE, double> : FVec128Double +{ + FORCE_INLINE FVec1() {} + FORCE_INLINE FVec1( double f ) : FVec128Double( _mm_load_sd( &f ) ) {} + FORCE_INLINE FVec1( const vec_t& v ) : FVec128Double( v ) {} + + FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } +}; + +template <> +struct FVec<SSE, double> : FVec128Double +{ + FORCE_INLINE FVec() {} + FORCE_INLINE FVec( double d ) : FVec128Double( _mm_set1_pd( d ) ) {} + FORCE_INLINE FVec( const double *v ) : FVec128Double( _mm_loadu_pd( v ) ) {} + FORCE_INLINE FVec( 
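Note that ftoi above wraps the cvtt* family, so the conversion truncates toward zero, matching the static_cast<uint32> in the scalar fallback of DirectTraits::f. A quick demonstration (a sketch):

    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128 v = _mm_set_ps(3.9f, 2.5f, 1.1f, 0.7f);
        __m128i t = _mm_cvttps_epi32(v);              // truncation, not rounding
        int r[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(r), t);
        std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); // 0 1 2 3
        return 0;
    }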
const vec_t& v) : FVec128Double( v ) {} + FORCE_INLINE FVec( double f1, double f0 ) : FVec128Double( _mm_set_pd(f1, f0) ) {} + + void set0( double f ) { vec = _mm_load_sd( &f ); } + void setN( double f ) { vec = _mm_set1_pd( f ); } + + FORCE_INLINE void setidx( const double *xi, const IVec<SSE,double>& idx ) + { + vec = _mm_set_pd( xi[idx.get1()], xi[idx.get0()] ); + } + + FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } + FORCE_INLINE double get1() const { return _mm_cvtsd_f64( _mm_shuffle_pd( vec, vec, 1 ) ); }; +}; + +FORCE_INLINE FVec1<SSE,double> operator+ (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_add_sd( a, b ); } +FORCE_INLINE FVec1<SSE,double> operator- (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_sub_sd( a, b ); } +FORCE_INLINE FVec1<SSE,double> operator* (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_mul_sd( a, b ); } +FORCE_INLINE FVec1<SSE,double> operator/ (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_div_sd( a, b ); } +FORCE_INLINE int ftoi (const FVec1<SSE,double>& a) { return _mm_cvttsd_si32(a); } +FORCE_INLINE IVec<SSE,double> operator> (const FVec1<SSE,double>& a, const FVec1<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpgt_sd( a, b ) ); } +#ifdef USE_FMA +FORCE_INLINE FVec1<SSE, double> mulSub(const FVec1<SSE, double>& a, const FVec1<SSE, double>& b, const FVec1<SSE, double>& c) { return _mm_fmsub_sd(a, b, c); } +#endif + +FORCE_INLINE FVec<SSE,double> operator- (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_sub_pd( a, b ); } +FORCE_INLINE FVec<SSE,double> operator* (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_mul_pd( a, b ); } +FORCE_INLINE FVec<SSE,double> operator/ (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_div_pd( a, b ); } +FORCE_INLINE IVec<SSE,float> ftoi (const FVec<SSE,double>& a) { return _mm_cvttpd_epi32(a); } +FORCE_INLINE IVec<SSE,double> operator<= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } +FORCE_INLINE IVec<SSE,double> operator< (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } +FORCE_INLINE IVec<SSE,double> operator>= (const FVec<SSE,double>& a, const FVec<SSE,double>& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#ifdef USE_FMA +FORCE_INLINE FVec<SSE, double> mulSub(const FVec<SSE, double>& a, const FVec<SSE, double>& b, const FVec<SSE, double>& c ) { return _mm_fmsub_pd(a, b, c); } +#endif + +#ifdef USE_AVX + +template <> +struct IVecBase<AVX> : VecStorage<InstrIntTraits<AVX>> +{ +protected: + FORCE_INLINE IVecBase() {} + FORCE_INLINE IVecBase( const vec_t& v) : VecStorage<InstrIntTraits<AVX>>( v ) {} +public: + FORCE_INLINE static vec_t zero() { return _mm256_setzero_si256(); } + + FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32(_mm256_castsi256_si128(vec)); } + + FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) { vec = _mm256_blendv_epi8(vec, val, mask); } + FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) + { + vec = _mm256_blendv_epi8(vec, val, mask); + //vec = _mm256_or_si256(vec, _mm256_and_si256(val,mask)); + } + + FORCE_INLINE __m128i lo128() const { return _mm256_castsi256_si128(vec); } + FORCE_INLINE __m128i hi128() const { return _mm256_extractf128_si256(vec, 1); } +}; + +template <> +struct IVec<AVX, float> : IVecBase<AVX> +{ + FORCE_INLINE IVec() {} + FORCE_INLINE IVec( int32 i ) : 
IVecBase<AVX>( _mm256_set1_epi32( i ) ) {} + FORCE_INLINE IVec( const vec_t& v) : IVecBase<AVX>( v ) {} + FORCE_INLINE IVec(uint32 u7, uint32 u6, uint32 u5, uint32 u4, uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase<AVX>(_mm256_set_epi32(u7, u6, u5, u4, u3, u2, u1, u0)) {} + + void setN( int32 i ) { vec = _mm256_set1_epi32( i ); } + + FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 1); } + FORCE_INLINE int32 get2() const { return _mm256_extract_epi32(vec, 2); } + FORCE_INLINE int32 get3() const { return _mm256_extract_epi32(vec, 3); } + FORCE_INLINE int32 get4() const { return _mm256_extract_epi32(vec, 4); } + FORCE_INLINE int32 get5() const { return _mm256_extract_epi32(vec, 5); } + FORCE_INLINE int32 get6() const { return _mm256_extract_epi32(vec, 6); } + FORCE_INLINE int32 get7() const { return _mm256_extract_epi32(vec, 7); } + + FORCE_INLINE void setidx( const uint32 *bi, const IVec<AVX,float>& idx ) + { + vec = _mm256_i32gather_epi32(reinterpret_cast<const int32 *>(bi), idx, sizeof(uint32)); + } + + FORCE_INLINE void store( uint32 *pi ) const { _mm256_storeu_si256( reinterpret_cast<vec_t*>(pi), vec ); } + + FORCE_INLINE int countbit() + { + return popcnt32(_mm256_movemask_ps(_mm256_castsi256_ps(vec))); + } +}; + +template <> +struct IVec<AVX, double> : IVecBase<AVX> +{ + FORCE_INLINE IVec() {} + FORCE_INLINE IVec( int32 i ) : IVecBase<AVX>( _mm256_set1_epi64x( i ) ) {} + FORCE_INLINE IVec( const vec_t& v) : IVecBase<AVX>( v ) {} + FORCE_INLINE IVec(uint64 u3, uint64 u2, uint64 u1, uint64 u0) : IVecBase<AVX>(_mm256_set_epi64x(u3, u2, u1, u0)) {} + + void setN( int32 i ) { vec = _mm256_set1_epi64x( i ); } + + // extract the 4 32 bits integers no. 0, 2, 4, 6 and store them in a __m128i + FORCE_INLINE IVec<SSE,float> extractLo32s() const + { + union { + uint32 u32[4]; + __m128i u; + } mask = {0,2,4,6}; + //__m256 ps256 = _mm256_castsi256_ps(vec); + //__m128 lo128 = _mm256_castps256_ps128(ps256); + //__m128 hi128 = _mm256_extractf128_ps(ps256, 1); + //__m128 blend = _mm_shuffle_ps(lo128, hi128, 0 + (2<<2) + (0<<4) + (2<<6)); + __m256i blend = _mm256_permutevar8x32_epi32(vec, _mm256_castsi128_si256(mask.u)); + return _mm256_castsi256_si128(blend); + } + + //int32 get1() const { return _mm256_cvtsi256_si32( _mm256_shuffle_epi32( vec, 2 ) ); }; + FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 2); } + + FORCE_INLINE void store( uint32 *pi ) const + { + extractLo32s().store(pi); + } + + FORCE_INLINE int countbit() + { + return popcnt32(_mm256_movemask_pd(_mm256_castsi256_pd(vec))); + } +}; + +template <typename T> +FORCE_INLINE IVec<AVX,T> operator>> (const IVec<AVX,T>& a, unsigned n) { return _mm256_srli_epi32(a, n); } +template <typename T> +FORCE_INLINE IVec<AVX,T> operator<< (const IVec<AVX,T>& a, unsigned n) { return _mm256_slli_epi32(a, n); } +template <typename T> +FORCE_INLINE IVec<AVX,T> operator& (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_and_si256( a, b ); } +template <typename T> +FORCE_INLINE IVec<AVX,T> operator| (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_or_si256( a, b ); } +template <typename T> +FORCE_INLINE IVec<AVX,T> operator^ (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_xor_si256( a, b ); } +template <typename T> +FORCE_INLINE IVec<AVX,T> min (const IVec<AVX,T>& a, const IVec<AVX,T>& b ) { return _mm256_min_epi32( a, b ); } + +FORCE_INLINE IVec<AVX,float> operator+ (const IVec<AVX,float>& a, const IVec<AVX,float>& b ) { return _mm256_add_epi32( a, b ); } +FORCE_INLINE 
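The extractLo32s just above is the one cross-lane step in the double pipeline: it gathers 32-bit lanes 0, 2, 4, 6 of a __m256i (the low halves of the four 64-bit results) into a __m128i via _mm256_permutevar8x32_epi32, which is vpermd, an AVX2 instruction. In isolation (a sketch):

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        __m256i v    = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
        __m256i mask = _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0);  // lanes to keep
        __m128i lo   = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(v, mask));
        int r[4];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(r), lo);
        std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); // 0 2 4 6
        return 0;
    }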
IVec<AVX,float> operator- (const IVec<AVX,float>& a, const IVec<AVX,float>& b ) { return _mm256_sub_epi32( a, b ); } +FORCE_INLINE IVec<AVX,double> operator+ (const IVec<AVX,double>& a, const IVec<AVX,double>& b ) { return _mm256_add_epi64( a, b ); } +FORCE_INLINE IVec<AVX,double> operator- (const IVec<AVX,double>& a, const IVec<AVX,double>& b ) { return _mm256_sub_epi64( a, b ); } + + +typedef VecStorage<InstrFloatTraits<AVX,float>> FVec256Float; + +template <> +struct FVec<AVX, float> : FVec256Float +{ + FORCE_INLINE FVec() {} + FORCE_INLINE FVec( float f ) : FVec256Float( _mm256_set1_ps( f ) ) {} + FORCE_INLINE FVec( const float *v ) : FVec256Float( _mm256_loadu_ps( v ) ) {} + FORCE_INLINE FVec( const vec_t& v) : FVec256Float(v) {} + FORCE_INLINE FVec(float f7, float f6, float f5, float f4, float f3, float f2, float f1, float f0) : FVec256Float(_mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0)) {} + + //void set0( float f ) { vec = _mm256_load_ss( &f ); } + void setN( float f ) { vec = _mm256_set1_ps( f ); } + + FORCE_INLINE void setidx( const float *xi, const IVec<AVX,float>& idx ) + { +#if 1 // use gather primitives + vec = _mm256_i32gather_ps (xi, idx, 4); +#elif 0 + uint32 i0 = idx.get0(); + uint32 i1 = idx.get1(); + uint32 i2 = idx.get2(); + uint32 i3 = idx.get3(); + uint32 i4 = idx.get4(); + uint32 i5 = idx.get5(); + uint32 i6 = idx.get6(); + uint32 i7 = idx.get7(); + vec = _mm256_set_ps( xi[i7], xi[i6], xi[i5], xi[i4], xi[i3], xi[i2], xi[i1], xi[i0] ); +#else + union { + __m256i vec; + uint32 ui32[8]; + } i; + i.vec = static_cast<const __m256i&>(idx); + vec = _mm256_set_ps(xi[i.ui32[7]], xi[i.ui32[6]], xi[i.ui32[5]], xi[i.ui32[4]], xi[i.ui32[3]], xi[i.ui32[2]], xi[i.ui32[1]], xi[i.ui32[0]]); +#endif + } + + FORCE_INLINE FVec<SSE, float> lo128() const { return _mm256_castps256_ps128(vec); } + FORCE_INLINE FVec<SSE, float> hi128() const { return _mm256_extractf128_ps(vec, 1); } + + //FORCE_INLINE float get0() const { return _mm256_cvtss_f32( vec ); } + //FORCE_INLINE float get1() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 1 ) ); } + //FORCE_INLINE float get2() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 2 ) ); } + //FORCE_INLINE float get3() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 3 ) ); } +}; + +FORCE_INLINE FVec<AVX,float> operator- (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_sub_ps( a, b ); } +FORCE_INLINE FVec<AVX,float> operator* (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_mul_ps( a, b ); } +FORCE_INLINE FVec<AVX,float> operator/ (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_div_ps( a, b ); } +FORCE_INLINE IVec<AVX,float> ftoi (const FVec<AVX,float>& a) { return _mm256_cvttps_epi32(a); } +FORCE_INLINE IVec<AVX,float> operator<= (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_LE_OS) ); } +FORCE_INLINE IVec<AVX,float> operator>= (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_GE_OS ) ); } +FORCE_INLINE IVec<AVX,float> operator< (const FVec<AVX,float>& a, const FVec<AVX,float>& b) { return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS )); } +#ifdef USE_FMA +FORCE_INLINE FVec<AVX, float> mulSub(const FVec<AVX, float>& a, const FVec<AVX, float>& b, const FVec<AVX, float>& c) { return _mm256_fmsub_ps(a, b, c); } +#endif + +typedef VecStorage<InstrFloatTraits<AVX,double>> FVec256Double; + +template <> +struct FVec<AVX, double> : 
FVec256Double +{ + FORCE_INLINE FVec() {} + FORCE_INLINE FVec( double d ) : FVec256Double( _mm256_set1_pd( d ) ) {} + FORCE_INLINE FVec( const double *v ) : FVec256Double( _mm256_loadu_pd( v ) ) {} + FORCE_INLINE FVec( const vec_t& v) : FVec256Double( v ) {} + FORCE_INLINE FVec(double d3, double d2, double d1, double d0) : FVec256Double(_mm256_set_pd(d3, d2, d1, d0)) {} + + //void set0( double f ) { vec = _mm256_load_sd( &f ); } + void setN( double f ) { vec = _mm256_set1_pd( f ); } + + FORCE_INLINE void setidx( const double *xi, const IVec<SSE,float>& idx ) + { + vec = _mm256_i32gather_pd(xi, idx, 8); + } + + FORCE_INLINE void setidx( const double *xi, const IVec<AVX,double>& idx ) + { + vec = _mm256_i64gather_pd(xi, idx, 8); + } + +// FORCE_INLINE double get0() const { return _mm256_cvtsd_f64( vec ); } +// FORCE_INLINE double get1() const { return _mm256_cvtsd_f64( _mm256_shuffle_pd( vec, vec, 1 ) ); }; +}; + +FORCE_INLINE FVec<AVX,double> operator- (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_sub_pd( a, b ); } +FORCE_INLINE FVec<AVX,double> operator* (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_mul_pd( a, b ); } +FORCE_INLINE FVec<AVX,double> operator/ (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_div_pd( a, b ); } +FORCE_INLINE IVec<SSE,float> ftoi (const FVec<AVX,double>& a) { return _mm256_cvttpd_epi32(a); } +FORCE_INLINE IVec<AVX,double> operator<= (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_LE_OS ) ); } +FORCE_INLINE IVec<AVX,double> operator< (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS)); } +FORCE_INLINE IVec<AVX,double> operator>= (const FVec<AVX,double>& a, const FVec<AVX,double>& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_GE_OS ) ); } +#ifdef USE_FMA +FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX, double>& b, const FVec<AVX, double>& c) { return _mm256_fmsub_pd(a, b, c); } +#endif + +#endif + +} // namepsace Details +} // namespace BinSearch diff --git a/include/Type.h b/include/Type.h new file mode 100644 index 0000000..720bfb8 --- /dev/null +++ b/include/Type.h @@ -0,0 +1,221 @@ + #pragma once + +#include <stddef.h> +#include <vector> +#include <limits> + +#include "Portable.h" + +using std::size_t; + +namespace BinSearch { + +enum InstrSet { Scalar, SSE, AVX }; + +#define ALGOENUM(x, b) x, +enum Algos + { +#include "AlgoXCodes.h" + }; +#undef ALGOENUM + +namespace Details { + + template <InstrSet I> + struct InstrIntTraits; + + template <InstrSet I, typename T> + struct InstrFloatTraits; + + // base class for algorithm supporting the method: + // uint32 scalar(T z) const + template <typename T, Algos A, typename Enable=void> + struct AlgoScalarBase; + + // base class for algorithm supporting the following methods, constants and definitions: + // static const uint32 nElem + // struct Constants; + // void initConstants(Constants& cst) const + // void vectorial(uint32 *pr, const T *pz, const Constants& cst) const + // The function vectorial processes nElem items + template <InstrSet I, typename T, Algos A, typename Enable=void> + struct AlgoVecBase; + + template <typename T> struct IntTraits; + + template <> struct IntTraits<float> + { + typedef uint32 itype; + }; + template <> struct IntTraits<double> + { + typedef uint64 itype; + }; + + template <int N> + struct Body + { + template <uint32 D, typename T, typename 
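The Body/Loop machinery being introduced here amounts to a compile-time unrolled driver: M blocks of nElem queries per outer iteration, then single blocks, then a scalar tail. Its shape, written out by hand (a simplified sketch; the real Loop fixes M = 4 and threads a Constants argument through vectorial):

    #include <stdint.h>

    // sketch of Details::Loop<T,Algo>::loop; Algo here is assumed to expose
    // vectorial(uint32_t*, const double*) and scalar(double), without the
    // Constants parameter of the real interface
    template <unsigned D, unsigned M, class Algo>
    void loop_shape(const Algo& a, uint32_t* r, const double* z, uint32_t n)
    {
        uint32_t j = 0;
        for (; j + D * M <= n; j += D * M)      // unrolled main body (Body<M>)
            for (unsigned k = 0; k < M; ++k)
                a.vectorial(r + j + k * D, z + j + k * D);
        for (; j + D <= n; j += D)              // leftover full vectors
            a.vectorial(r + j, z + j);
        for (; j < n; ++j)                      // scalar tail
            r[j] = a.scalar(z[j]);
    }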
Expr> + FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const typename Expr::Constants& cst) + { + e.vectorial(ri, zi, cst); + Body<N - 1>::template iteration<D>(e, ri + D, zi + D, cst); + } + + }; + + template <> + struct Body<0> + { + template <uint32 D, typename T, typename Expr, typename H> + FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const H&) + { + } + }; + + template <typename T, typename Algo> + struct Loop + { + typedef Algo algo_type; + static const uint32 M = 4; + static const uint32 D = algo_type::nElem; + + FORCE_INLINE static void loop(const algo_type& e, uint32 *ri, const T* zi, uint32 n) + { + typename algo_type::Constants cst; + e.initConstants(cst); + + uint32 j = 0; + while (j + (D*M) <= n) { + Details::Body<M>::template iteration<D>(e, ri + j, zi + j, cst); + j += (D*M); + } + while (j + D <= n) { + e.vectorial(ri + j, zi + j, cst); + j += D; + } + while (D > 1 && j < n) { + ri[j] = e.scalar(zi[j]); + j += 1; + } + } + }; + + template <uint32 nIterTot, uint32 nIterLeft> + struct _Pipeliner + { + template <typename Expr, typename Data> + FORCE_INLINE static void go(const Expr& e, Data* d) + { + e.template run<nIterTot - nIterLeft>(d); + _Pipeliner<nIterTot, nIterLeft - 1>::go(e, d); + } + }; + + template <uint32 nIterTot> + struct _Pipeliner<nIterTot, 0> + { + template <typename Expr, typename Data> + FORCE_INLINE static void go(const Expr& e, Data* d) + { + } + }; + + template <uint32 nIter> + struct Pipeliner + { + template <typename Expr, typename Data> + FORCE_INLINE static void go(const Expr& e, Data* d) + { + _Pipeliner<nIter, nIter>::go(e, d); + } + }; + + +#if 1 + template <class T> + char is_complete_impl(char (*)[sizeof(T)]); + + template <class> + long is_complete_impl(...); + + template <class T> + struct IsComplete + { + static const bool value = sizeof(is_complete_impl<T>(0)) == sizeof(char); + }; +#else + template <class T, std::size_t = sizeof(T)> + std::true_type is_complete_impl(T *); + + std::false_type is_complete_impl(...); + + template <class T> + struct IsComplete : decltype(is_complete_impl(std::declval<T*>())) {}; +#endif + +template <typename T, Algos A> +struct AlgoScalarToVec : AlgoScalarBase<T,A> +{ + typedef AlgoScalarBase<T, A> base_t; + + AlgoScalarToVec(const typename base_t::Data& d) : base_t(d) {} + AlgoScalarToVec(const T* px, const uint32 n) : base_t(px, n) {} + + static const uint32 nElem = 1; + + struct Constants + { + }; + + void initConstants(Constants& cst) const + { + } + + FORCE_INLINE + void vectorial(uint32 *pr, const T *pz, const Constants& cst) const + { + *pr = base_t::scalar(*pz); + } +}; + +template<bool B, class T, class F> +struct conditional { typedef T type; }; + +template<class T, class F> +struct conditional<false, T, F> { typedef F type; }; + +template <typename T, bool C> +struct CondData +{ + FORCE_INLINE CondData(T x) : v(x) {} + FORCE_INLINE operator const T&() const { return v;} +private: + T v; +}; + +template <typename T> +struct CondData<T,false> +{ + FORCE_INLINE CondData(T) {} + FORCE_INLINE operator const T() const { return 0;} +}; + +template <InstrSet I, typename T, Algos A, bool L=false> +struct BinAlgoBase : Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value + , Details::AlgoVecBase<I, T, A> + , Details::AlgoScalarToVec<T,A> + >::type +{ + typedef typename Details::conditional< Details::IsComplete<Details::AlgoVecBase<I, T, A>>::value + , Details::AlgoVecBase<I, T, A> + , Details::AlgoScalarToVec<T,A> + >::type 
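The IsComplete trait used in BinAlgoBase here is the classic sizeof-SFINAE probe: sizeof(T) is only valid for complete types, so the char-array overload drops out exactly when AlgoVecBase<I,T,A> has no specialization for the requested algorithm, and the scalar-to-vector adapter is selected instead. The trick in isolation (a sketch; C++11 static_assert used for brevity):

    template <class T> char probe(char (*)[sizeof(T)]);   // viable iff T is complete
    template <class>   long probe(...);                   // fallback

    struct Defined {};
    struct Declared;                                      // never defined

    static_assert(sizeof(probe<Defined>(0))  == sizeof(char), "complete type");
    static_assert(sizeof(probe<Declared>(0)) == sizeof(long), "incomplete type");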
base_t;
+
+    BinAlgoBase(const T* px, const uint32 n) : base_t(px, n) {}
+    BinAlgoBase(const typename base_t::Data& d) : base_t(d) {}
+};
+
+} // namespace Details
+
+} // namespace BinSearch
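Lastly, CondData from Type.h deserves a one-line demonstration: with the flag false the wrapper stores nothing and reads back as zero, which is how BinAlgo avoids carrying x0, xN and N when the corresponding range checks are compiled out. A sketch:

    #include "Type.h"
    #include <cstdio>

    int main()
    {
        BinSearch::Details::CondData<double, true>  kept(3.25);     // holds a double
        BinSearch::Details::CondData<double, false> dropped(3.25);  // holds nothing
        double a = kept, b = dropped;                               // b reads as 0
        std::printf("kept=%g dropped=%g, sizeof %u vs %u\n",
                    a, b, (unsigned)sizeof(kept), (unsigned)sizeof(dropped)); // 8 vs 1
        return 0;
    }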