From 575aa698fa53df2f5c584413aed7bf7714f86039 Mon Sep 17 00:00:00 2001 From: Max Ryabinin Date: Fri, 1 Jul 2022 17:41:48 +0300 Subject: Reduce diff --- csrc/ops.cu | 45 ++++++++++++++++++++------------------------- csrc/pythonInterface.c | 2 +- 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'csrc') diff --git a/csrc/ops.cu b/csrc/ops.cu index b2a1105..dbb50be 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -15,35 +15,30 @@ using namespace BinSearch; using std::cout; using std::endl; -void histogramScatterAdd2D(float *histogram, int *index1, int *index2, float *src, int maxidx1, int n) { - int threads = 512; - int blocks = n / threads; - blocks = n % threads == 0 ? blocks : blocks + 1; - kHistogramScatterAdd2D<<>>(histogram, index1, index2, src, maxidx1, n); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); -} - -template -void estimateQuantiles(T *A, float *code, float offset, int n) { - int blocks = n / 4096; - blocks = n % 4096 == 0 ? blocks : blocks + 1; - CUDA_CHECK_RETURN(cudaMemset(code, 0, 256 * sizeof(float))); - kEstimateQuantiles < T ><<>>(A, code, offset, std::numeric_limits::max(), n); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); +void histogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n) +{ + int threads = 512; + int blocks = n/threads; + blocks = n % threads == 0 ? blocks : blocks + 1; + kHistogramScatterAdd2D<<>>(histogram, index1, index2, src, maxidx1, n); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -void quantize(float *code, float *A, unsigned char *out, int n) { - int blocks = n / 1024; - blocks = n % 1024 == 0 ? blocks : blocks + 1; - kQuantize<<>>(code, A, out, n); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); +template void estimateQuantiles(T *A, float *code, float offset, int n) +{ + int blocks = n/4096; + blocks = n % 4096 == 0 ? blocks : blocks + 1; + CUDA_CHECK_RETURN(cudaMemset(code, 0, 256*sizeof(float))); + kEstimateQuantiles<<>>(A, code, offset, std::numeric_limits::max(), n); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); } -void dequantize(float *code, unsigned char *A, float *out, int n) { - int blocks = n / 1024; - blocks = n % 1024 == 0 ? blocks : blocks + 1; - kDequantize<<>>(code, A, out, n); - CUDA_CHECK_RETURN(cudaPeekAtLastError()); +void quantize(float *code, float *A, unsigned char *out, int n) +{ + int blocks = n/1024; + blocks = n % 1024 == 0 ? blocks : blocks + 1; + kQuantize<<>>(code, A, out, n); + CUDA_CHECK_RETURN(cudaPeekAtLastError()); } template void quantizeBlockwise(float * code, T *A, float *absmax, unsigned char *out, float *rand, int rand_offset, const int n) diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 1f690c5..c2fed6b 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -86,7 +86,7 @@ void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, floa extern "C" { - if #BUILD_CUDA + #if BUILD_CUDA void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); } void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); } void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); } -- cgit v1.2.3