From 7439924891496025edf60c9da6a782f362a50c70 Mon Sep 17 00:00:00 2001 From: Tim Dettmers Date: Tue, 5 Oct 2021 19:16:20 -0700 Subject: Initial commit --- include/Algo-Direct-Common.h | 341 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 include/Algo-Direct-Common.h (limited to 'include/Algo-Direct-Common.h') diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h new file mode 100644 index 0000000..cf5f0c9 --- /dev/null +++ b/include/Algo-Direct-Common.h @@ -0,0 +1,341 @@ +#pragma once + +#include +#include +#include +#include "AAlloc.h" + +namespace BinSearch { +namespace Details { + +namespace DirectAux { + +#define SAFETY_MULTI_PASS true + +template +struct HResults +{ + HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {} + T H; + double hRatio; + size_t nInc; +}; + + +#ifdef USE_FMA +template struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); }; +template struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); }; +template struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); }; +#else +template struct IsDirect { static const bool value = (A == Direct); }; +template struct IsDirect2 { static const bool value = (A == Direct2); }; +template struct IsDirectCache { static const bool value = (A == DirectCache); }; +#endif + +// general definition +template +struct BucketElem +{ + FORCE_INLINE void set( uint32 b, const T *) + { + m_b = b; + } + + FORCE_INLINE uint32 index() const { return m_b; } + +private: + uint32 m_b; +}; + +// specialization for DirectCache methods + +template struct MatchingIntType; +template <> struct MatchingIntType { typedef uint64 type; }; +template <> struct MatchingIntType { typedef uint32 type; }; + +template +struct BucketElem::value >::type > +{ + typedef typename MatchingIntType::type I; + + void set(uint32 b, const T *xi) + { + u.u.x = xi[b]; + u.u.b = b; + } + + FORCE_INLINE I index() const { return u.u.b; } + FORCE_INLINE T x() const { return u.u.x; } + +private: + union { + double dummy; + struct + { + T x; + I b; + } u; + } u; +}; + + +template +struct DirectTraits +{ + static void checkH(T scaler, T x0, T xN) + { + T Dn = xN - x0; + T ifmax = Dn * scaler; + myassert((ifmax < std::numeric_limits::max() - (Gap - 1)), + "Problem unfeasible: index size exceeds uint32 capacity:" + << " D[N] =" << Dn + << ", H =" << scaler + << ", H D[n] =" << ifmax << "\n" + ); + } + + FORCE_INLINE static uint32 f(T scaler, T x0, T z) + { + T tmp = scaler * (z - x0); +#ifdef USE_SSE2 + return ftoi(FVec1(tmp)); +#else + return static_cast(tmp); +#endif + } + + template + FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& x0, const FVec& z) + { + return ftoi(scaler*(z-x0)); + } + + static T cst0(T scaler, T x0) + { + return x0; + } +}; + +#ifdef USE_FMA +template +struct DirectTraits +{ + typedef FVec1 fVec1; + + static void checkH(T scaler, T H_Times_x0, T xN) + { + union { + typename FVec1::vec_t v; + T s; + } ifmax; + ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0)); + myassert((ifmax.s < std::numeric_limits::max() - (Gap - 1)), + "Problem unfeasible: index size exceeds uint32 capacity:" + << " H X[0] =" << H_Times_x0 + << ", H =" << scaler + << ", X[N] =" << xN + << ", H X[N] - H X[0] =" << ifmax.s << "\n" + ); + } + + FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi) + { + return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0))); + } + + template + FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& H_Times_X0, const FVec& z) + { + return ftoi(mulSub(scaler, z, H_Times_X0)); + } + + static T cst0(T scaler, T x0) + { + return scaler*x0; + } +}; +#endif + +template +struct DirectInfo +{ + static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA); + typedef DirectTraits fun_t; + typedef BucketElem bucket_t; + typedef AlignedVec bucketvec_t; + + struct Data { + Data() : buckets(0), xi(0), scaler(0), cst0(0) {} + Data( const T *x // for Direct must persist if xws=NULL + , uint32 n + , T H + , bucket_t *bws // assumed to gave size nb, as computed below + , T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap + ) + : buckets(bws) + , scaler(H) + , cst0(fun_t::cst0(H, x[0])) + { + myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned"); + + uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]); + + const uint32 npad = Gap-1; + const uint32 n_sz = n + npad; // size of padded vector + + if (xws) { + myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned"); + std::fill_n(xws, npad, x[0]); // pad in front with x[0] + std::copy(x, x+n, xws + npad); + xi = xws; + } + else { + myassert(Gap==1, "if Gap>1 then X workspace must be provided"); + xi = x; + } + + populateIndex(bws, nb, xi, n_sz, scaler, cst0); + } + + const bucket_t *buckets; + const T *xi; + T scaler; + T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not + } data; + + static T growStep(T H) + { + T step; + T P = next(H); + while ((step = P - H) == 0) + P = next(P); + return step; + } + + static HResults computeH(const T *px, uint32 nx) + { + myassert((nx > Gap), "Array X too small"); + myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap"); + + const T x0 = px[0]; + const T xN = px[nx-1]; + + const T range = xN - x0; + myassert((range < std::numeric_limits::max()), "range too large"); + + // check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i + T deltaDMin = range; + for (uint32 i = Gap; i < nx; ++i) { + T Dnew = px[i] - x0; + T Dold = px[i - Gap] - x0; + myassert((Dnew > Dold), + "Problem unfeasible: D_i sequence not strictly increasing" + << " X[" << 0 << "]=" << x0 + << " X[" << i - Gap << "]=" << px[i - Gap] + << " X[" << i << "]=" << px[i] + << "\n" + ); + T deltaD = Dnew - Dold; + if (deltaD < deltaDMin) + deltaDMin = deltaD; + } + + // initial guess for H + const T H0 = T(1.0) / deltaDMin; + T H = H0; + + T cst0 = fun_t::cst0(H, x0); + fun_t::checkH(H, cst0, xN); + + // adjust H by trial and error until succeed + size_t nInc = 0; + bool modified = false; + size_t npasses = 0; + T step = growStep(H); + uint32 seg_already_checked_from = nx; + do { + myassert((npasses++ < 2), "verification failed\n"); + // if there has been an increase, then check only up to that point + uint32 last_seg_to_be_checked = seg_already_checked_from - 1; + modified = false; + uint32 inew = 0; + for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) { + uint32 iold = fun_t::f(H, cst0, px[i-Gap]); + uint32 inew = fun_t::f(H, cst0, px[i]); + while (inew == iold) { + seg_already_checked_from = i; + last_seg_to_be_checked = nx-1; // everything needs to be checked + modified = true; + H = H + step; + step *= 2; + // recalculate all constants and indices + cst0 = fun_t::cst0(H, x0); + fun_t::checkH(H, cst0, xN); + iold = fun_t::f(H, cst0, px[i - Gap]); + inew = fun_t::f(H, cst0, px[i]); + } + } + } while (SAFETY_MULTI_PASS && modified); + + return HResults(H, (((double)H) / H0) - 1.0, nInc); + } + + static void populateIndex(BucketElem *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0) + { + for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) { + uint32 idx = fun_t::f(scaler, cst0, px[i]); + while (b > idx) { // in the 1st iteration it is j=0 but this condition is always false + buckets[b].set( j, px ); + --b; + } + if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant + j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check + buckets[b].set(j, px); + if (b-- == 0) + break; + } + } + } + + DirectInfo(const Data& d) + : data(d) + { + } + + DirectInfo(const T* px, const uint32 n) + { + HResults res = computeH(px, n); + +#ifdef PAPER_TEST + nInc = res.nInc; + hRatio = res.hRatio; +#endif + const uint32 npad = Gap-1; + const uint32 n_sz = n + npad; // size of padded vector + + if (npad) + xi.resize(n_sz); + + T H = res.H; + T cst0 = fun_t::cst0(H, px[0]); + const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]); + buckets.resize(maxIndex + 1); + + data = Data(px, n, H, buckets.begin(), (npad? xi.begin(): NULL)); + } + +private: + bucketvec_t buckets; + AlignedVec xi; + +#ifdef PAPER_TEST +public: + double hRatio; + size_t nInc; +#endif +}; + + +} // namespace DirectAux +} // namespace Details +} // namespace BinSearch -- cgit v1.2.3