summary | refs | log | tree | commit | diff
path: root/csrc
diff options
context:
space:
mode:
author: Max Ryabinin <mryabinin0@gmail.com> 2022-07-01 17:41:48 +0300
committer: Max Ryabinin <mryabinin0@gmail.com> 2022-07-01 17:41:48 +0300
commit: 575aa698fa53df2f5c584413aed7bf7714f86039 (patch)
tree: ed2c6a2d787a5b934013a8105d581862edd4f619 /csrc
parent: 4d1d5b569f55dd613bea26714eb1ad931a10be35 (diff)
Reduce diff
Diffstat (limited to 'csrc')
-rw-r--r-- csrc/ops.cu 45
-rw-r--r-- csrc/pythonInterface.c 2
2 files changed, 21 insertions, 26 deletions
diff --git a/csrc/ops.cu b/csrc/ops.cu
index b2a1105..dbb50be 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -15,35 +15,30 @@ using namespace BinSearch;
using std::cout;
using std::endl;
-void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) {
- int threads = 512;
- int blocks = n / threads;
- blocks = n % threads == 0 ? blocks : blocks + 1;
- kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
- CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
-
-template<typename T>
-void estimateQuantiles(T *A, float *code, float offset, int n) {
- int blocks = n / 4096;
- blocks = n % 4096 == 0 ? blocks : blocks + 1;
- CUDA_CHECK_RETURN(cudaMemset(code, 0, 256 * sizeof(float)));
- kEstimateQuantiles < T ><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
- CUDA_CHECK_RETURN(cudaPeekAtLastError());
+void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n)
+{
+ int threads = 512;
+ int blocks = n/threads;
+ blocks = n % threads == 0 ? blocks : blocks + 1;
+ kHistogramScatterAdd2D<<<blocks, 512>>>(histogram, index1, index2, src, maxidx1, n);
+ CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
-void quantize(float *code, float *A, unsigned char *out, int n) {
- int blocks = n / 1024;
- blocks = n % 1024 == 0 ? blocks : blocks + 1;
- kQuantize<<<blocks, 1024>>>(code, A, out, n);
- CUDA_CHECK_RETURN(cudaPeekAtLastError());
+template <typename T> void estimateQuantiles(T *A, float *code, float offset, int n)
+{
+ int blocks = n/4096;
+ blocks = n % 4096 == 0 ? blocks : blocks + 1;
+ CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float)));
+ kEstimateQuantiles<T><<<blocks, 512>>>(A, code, offset, std::numeric_limits<T>::max(), n);
+ CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
-void dequantize(float *code, unsigned char *A, float *out, int n) {
- int blocks = n / 1024;
- blocks = n % 1024 == 0 ? blocks : blocks + 1;
- kDequantize<<<blocks, 1024>>>(code, A, out, n);
- CUDA_CHECK_RETURN(cudaPeekAtLastError());
+void quantize(float *code, float *A, unsigned char *out, int n)
+{
+ int blocks = n/1024;
+ blocks = n % 1024 == 0 ? blocks : blocks + 1;
+ kQuantize<<<blocks, 1024>>>(code, A, out, n);
+ CUDA_CHECK_RETURN(cudaPeekAtLastError());
}
template <typename T, int STOCHASTIC> void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n)
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c
index 1f690c5..c2fed6b 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.c
@@ -86,7 +86,7 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa
extern "C"
{
- if #BUILD_CUDA
+ #if BUILD_CUDA
void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); }
void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); }
void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); }