summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTim Dettmers <tim.dettmers@gmail.com>2022-07-25 19:30:37 -0700
committerTim Dettmers <tim.dettmers@gmail.com>2022-07-25 19:30:37 -0700
commit9268dc9d887a3d54cd1f008dcb628aaa5b5bd90a (patch)
treec557a57a63e627e6694bb383d0f7c8ade5f9071a
parent1e88edd8c096bde5202dd61411d3c8d7eda56645 (diff)
Some progress on build script; added multi-cuda install script.
-rw-r--r--CHANGELOG.md3
-rw-r--r--Makefile7
-rw-r--r--csrc/kernels.cu17
-rw-r--r--csrc/kernels.cuh2
-rw-r--r--csrc/ops.cu22
-rw-r--r--cuda_install.sh77
-rw-r--r--cuda_install_111.sh38
-rw-r--r--deploy_from_slurm.sh125
8 files changed, 192 insertions, 99 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 08adfce..285984e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -64,3 +64,6 @@ Features:
- Added 8-bit Linear layers with 8-bit Params that perform memory efficient inference with an option for 8-bit mixed precision matrix decomposition for inference without performance degradation
- Added quantization methods for "fake" quantization as well as optimized kernels vector-wise quantization and equalization as well as optimized cuBLASLt transformations
- CPU only build now available (Thank you, @mryab)
+
+Deprecated:
+ - Pre-compiled releases for CUDA 9.2, 10.0, and 10.2 are no longer available
diff --git a/Makefile b/Makefile
index 2e1d265..328faa5 100644
--- a/Makefile
+++ b/Makefile
@@ -27,13 +27,14 @@ COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
+COMPUTE_CAPABILITY := -gencode arch=compute_75,code=sm_75 # Turing
# CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
CC_CUDA92 := -gencode arch=compute_30,code=sm_30
# Later versions of CUDA support the new architectures
CC_CUDA10x := -gencode arch=compute_30,code=sm_30
-CC_CUDA10x += -gencode arch=compute_75,code=sm_75
+CC_CUDA10x := -gencode arch=compute_75,code=sm_75
CC_CUDA110 := -gencode arch=compute_75,code=sm_75
CC_CUDA110 += -gencode arch=compute_80,code=sm_80
@@ -43,12 +44,12 @@ CC_CUDA11x += -gencode arch=compute_80,code=sm_80
CC_CUDA11x += -gencode arch=compute_86,code=sm_86
all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
- $(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+ $(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
- $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+ $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 4e744fb..6eca3aa 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -2166,7 +2166,6 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
__shared__ char smem_data[32*33*ITEMS_PER_THREAD];
char local_data[ITEMS_PER_THREAD];
typedef cub::BlockExchange<char, THREADS, ITEMS_PER_THREAD> BlockExchange;
- __shared__ typename BlockExchange::TempStorage temp_storage;
// we load row after row from the base_position
// Load data row by row
@@ -2446,7 +2445,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
#define MAX_SPARSE_COUNT 32
#define SMEM_SIZE 8*256
template <typename T, int SPMM_ITEMS, int BITS>
-__global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
+__global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB)
{
// 0. load balancing: We process rows with most columns first (count_vec)and we process one row per block
@@ -2500,7 +2499,7 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o
{
for(int i = threadIdx.x; i < SMEM_SIZE; i+=blockDim.x)
if((idx_col_B+i-local_idx_col_B_offset) < colsB)
- smem_dequant_stats[i] = __ldg(&dequant_stats[idx_col_B+i-local_idx_col_B_offset]);
+ smem_dequant_stats[i] = dequant_stats[idx_col_B+i-local_idx_col_B_offset];
__syncthreads();
}
@@ -2596,12 +2595,12 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o
// TEMPLATE DEFINITIONS
//==============================================================
-template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
diff --git a/csrc/kernels.cuh b/csrc/kernels.cuh
index cbfbeba..4e65e96 100644
--- a/csrc/kernels.cuh
+++ b/csrc/kernels.cuh
@@ -107,7 +107,7 @@ template<typename T, int BLOCK_SIZE, int NUM_VALS> __global__ void kPercentileCl
__global__ void kHistogramScatterAdd2D(float* histogram, int *index1, int *index2, float *src, const int maxidx1, const int n);
-template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template <typename T, int SPMM_ITEMS, int BITS> __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kdequant_mm_int32_fp16(
int *__restrict__ const A, float *__restrict__ const rowStats, float *__restrict__ const colStats,
diff --git a/csrc/ops.cu b/csrc/ops.cu
index 8946015..c430d55 100644
--- a/csrc/ops.cu
+++ b/csrc/ops.cu
@@ -247,6 +247,8 @@ int roundoff(int v, int d) {
}
+#ifdef NO_CUBLASLT
+#else
template<int ORDER> cublasLtOrder_t get_order()
{
switch(ORDER)
@@ -266,7 +268,11 @@ template<int ORDER> cublasLtOrder_t get_order()
case COL_AMPERE:
return CUBLASLT_ORDER_COL32_2R_4R4;
break;
+ default:
+ break;
}
+
+ return CUBLASLT_ORDER_ROW;
}
template cublasLtOrder_t get_order<ROW>();
@@ -274,6 +280,7 @@ template cublasLtOrder_t get_order<COL>();
template cublasLtOrder_t get_order<COL32>();
template cublasLtOrder_t get_order<COL_TURING>();
template cublasLtOrder_t get_order<COL_AMPERE>();
+#endif
template<int ORDER> int get_leading_dim(int dim1, int dim2)
@@ -297,6 +304,9 @@ template<int ORDER> int get_leading_dim(int dim1, int dim2)
// 32*32 tiles
return 32*roundoff(dim1, 32);
break;
+ default:
+ return 0;
+ break;
}
}
@@ -306,7 +316,8 @@ template int get_leading_dim<COL32>(int dim1, int dim2);
template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void transform(cublasLtHandle_t ltHandle, T *A, T *out, int dim1, int dim2)
{
-
+#ifdef NO_CUBLASLT
+#else
cublasLtOrder_t orderA = get_order<SRC>();
cublasLtOrder_t orderOut = get_order<TARGET>();
int ldA = get_leading_dim<SRC>(dim1, dim2);
@@ -345,6 +356,7 @@ template <typename T, int SRC, int TARGET, bool transpose, int DTYPE> void trans
if (A_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(A_desc));
if (out_desc) checkCublasStatus(cublasLtMatrixLayoutDestroy(out_desc));
if (A2Out_desc) checkCublasStatus(cublasLtMatrixTransformDescDestroy(A2Out_desc));
+#endif
}
template void transform<int8_t, ROW, COL, false, 8>(cublasLtHandle_t ltHandle, int8_t *A, int8_t *out, int dim1, int dim2);
@@ -358,6 +370,9 @@ template void transform<int32_t, COL32, ROW, false, 32>(cublasLtHandle_t ltHandl
template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
{
+#ifdef NO_CUBLASLT
+ return 0;
+#else
int has_error = 0;
cublasLtMatmulDesc_t matmulDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
@@ -412,6 +427,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle
printf("error detected");
return has_error;
+#endif
}
int fill_up_to_nearest_multiple(int value, int multiple)
@@ -523,6 +539,9 @@ template <int FORMAT, int TRANSPOSE> void transformRowToFormat(char * A, char *o
void spmm_coo(cusparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B)
{
+#ifdef NO_CUBLASLT
+#else
+
cusparseSpMatDescr_t descA;
cusparseDnMatDescr_t descB, descC;
@@ -569,6 +588,7 @@ void spmm_coo(cusparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_val
CHECK_CUSPARSE( cusparseDestroyDnMat(descB) );
CHECK_CUSPARSE( cusparseDestroyDnMat(descC) );
CUDA_CHECK_RETURN( cudaFree(dBuffer) );
+#endif
}
template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
diff --git a/cuda_install.sh b/cuda_install.sh
new file mode 100644
index 0000000..856cbe5
--- /dev/null
+++ b/cuda_install.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Downloads the runfile installer for one CUDA release and installs the toolkit
+# under BASE_PATH/cuda-X.Y, then appends the matching PATH and LD_LIBRARY_PATH
+# exports to ~/.bashrc.
+#
+# Usage: bash cuda_install.sh CUDA_VERSION BASE_PATH
+#   CUDA_VERSION  release without the dot, e.g. 117 for CUDA 11.7
+#   BASE_PATH     directory that receives lib/ and cuda-X.Y/
+
+URL92=https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux
+URL100=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux
+URL101=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run
+URL102=https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run
+URL110=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run
+URL111=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
+URL112=https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run
+URL113=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installers/cuda_11.3.1_465.19.01_linux.run
+URL114=https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run
+URL115=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run
+URL116=https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run
+URL117=https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
+
+SUPPORTED="{92, 100, 101, 102, 110, 111, 112, 113, 114, 115, 116, 117}"
+
+CUDA_VERSION=$1
+BASE_PATH=$2
+
+# Fail early: without both arguments there is nothing sensible to download or install.
+if [[ -z "$CUDA_VERSION" ]]; then
+    echo "argument error: No cuda version passed as input. Choose among: $SUPPORTED" 1>&2
+    exit 1
+fi
+if [[ -z "$BASE_PATH" ]]; then
+    echo "argument error: No base path passed as input." 1>&2
+    exit 1
+fi
+
+# Map the requested release onto its installer URL and toolkit folder name.
+case "$CUDA_VERSION" in
+    92)  URL=$URL92;  FOLDER=cuda-9.2 ;;
+    100) URL=$URL100; FOLDER=cuda-10.0 ;;
+    101) URL=$URL101; FOLDER=cuda-10.1 ;;
+    102) URL=$URL102; FOLDER=cuda-10.2 ;;
+    110) URL=$URL110; FOLDER=cuda-11.0 ;;
+    111) URL=$URL111; FOLDER=cuda-11.1 ;;
+    112) URL=$URL112; FOLDER=cuda-11.2 ;;
+    113) URL=$URL113; FOLDER=cuda-11.3 ;;
+    114) URL=$URL114; FOLDER=cuda-11.4 ;;
+    115) URL=$URL115; FOLDER=cuda-11.5 ;;
+    116) URL=$URL116; FOLDER=cuda-11.6 ;;
+    117) URL=$URL117; FOLDER=cuda-11.7 ;;
+    *)
+        echo "argument error: Unsupported cuda version: $CUDA_VERSION. Choose among: $SUPPORTED" 1>&2
+        exit 1
+        ;;
+esac
+
+FILE=$(basename "$URL")
+
+echo "$URL"
+echo "$FILE"
+
+# Stop when the download fails so that a missing or partial installer is never run.
+if ! wget "$URL"; then
+    echo "Download unsuccessful: $URL" 1>&2
+    exit 1
+fi
+
+bash "$FILE" --no-drm --no-man-page --override --installpath=~/local --librarypath="$BASE_PATH/lib" --toolkitpath="$BASE_PATH/$FOLDER/" --toolkit --silent
+
+# The double quotes expand $LD_LIBRARY_PATH and $PATH now, so ~/.bashrc receives
+# the values of the current shell followed by the new toolkit directories.
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$BASE_PATH/$FOLDER/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:$BASE_PATH/$FOLDER/bin/" >> ~/.bashrc
+source ~/.bashrc
+
+
diff --git a/cuda_install_111.sh b/cuda_install_111.sh
deleted file mode 100644
index 476ab59..0000000
--- a/cuda_install_111.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-FILE115=:cuda_11.5.1_495.29.05_linux.run
-FILE111=:cuda_11.1.1_455.32.00_linux.run
-URL115=:https://developer.download.nvidia.com/compute/cuda/11.5.1/local_installers/cuda_11.5.1_495.29.05_linux.run
-URL111=:https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
-
-
-CUDA_VERSION=$1
-
-if [[ -n "$CUDA_VERSION" ]]; then
- if [[ "$CUDA_VERSION" -eq "111" ]]; then
- FILE=cuda_11.1.1_455.32.00_linux.run
- URL=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
- FOLDER=cuda-11.1
- elif [[ "$CUDA_VERSION" -eq "115" ]]; then
- FILE=cuda_11.5.1_495.29.05_linux.run
- URL=https://developer.download.nvidia.com/compute/cuda/11.5.1/local_installers/cuda_11.5.1_495.29.05_linux.run
- FOLDER=cuda-11.5
- else
- echo "argument error: No cuda version passed as input. Choose among: {111, 115}"
- fi
-else
- echo "argument error: No cuda version passed as input. Choose among: {111, 115}"
-fi
-
-if [[ -n "$CUDA_VERSION" ]]; then
- echo $URL
- echo $FILE
- wget $URL
- bash $FILE --no-drm --no-man-page --override --installpath=~/local --librarypath=~/local/lib --toolkitpath=~/local/$FOLDER/ --toolkit --silent
- echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/local/$FOLDER/lib64/" >> ~/.bashrc
- echo "export PATH=$PATH:~/local/$FOLDER/bin/" >> ~/.bashrc
- source ~/.bashrc
-else
- echo ""
-fi
-
-
-
diff --git a/deploy_from_slurm.sh b/deploy_from_slurm.sh
index 6357e1d..5a554bb 100644
--- a/deploy_from_slurm.sh
+++ b/deploy_from_slurm.sh
@@ -1,86 +1,117 @@
#!/bin/bash
+BASE_PATH=$1
+
module unload cuda
module unload gcc
rm -rf dist build
make clean
make cleaneggs
-module load cuda/9.2
-module load gcc/7.3.0
-CUDA_HOME=/public/apps/cuda/9.2
-make
-CUDA_VERSION=92 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+export CUDA_HOME=$BASE_PATH/cuda-11.0
+make cuda110
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=110 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/10.0
-CUDA_HOME=/public/apps/cuda/10.0
-make cuda10x
-CUDA_VERSION=100 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
-module unload gcc
-module load gcc/8.4
+export CUDA_HOME=$BASE_PATH/cuda-11.1
+make cuda11x
+
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=111 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/10.1
-CUDA_HOME=/public/apps/cuda/10.1
-make cuda10x
-CUDA_VERSION=101 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+export CUDA_HOME=$BASE_PATH/cuda-11.2
+make cuda11x
+
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=112 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/10.2
-CUDA_HOME=/public/apps/cuda/10.2/
-make cuda10x
-CUDA_VERSION=102 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+export CUDA_HOME=$BASE_PATH/cuda-11.3
+make cuda11x
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=113 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/11.0
-CUDA_HOME=/public/apps/cuda/11.0
-make cuda110
-CUDA_VERSION=110 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+export CUDA_HOME=$BASE_PATH/cuda-11.4
+make cuda11x
+
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=114 python -m build
+##python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/11.1
-CUDA_HOME=/public/apps/cuda/11.1
+export CUDA_HOME=$BASE_PATH/cuda-11.5
make cuda11x
-CUDA_VERSION=111 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=115 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-module load cuda/11.2
-CUDA_HOME=/public/apps/cuda/11.2
+export CUDA_HOME=$BASE_PATH/cuda-11.6
+
make cuda11x
-CUDA_VERSION=112 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=116 python -m build
+#python -m twine upload dist/* --verbose
rm -rf dist build
make clean
make cleaneggs
-CUDA_HOME=/private/home/timdettmers/git/autoswap/local/cuda-11.3 make cuda11x
-CUDA_VERSION=113 python -m build
-python -m twine upload dist/* --verbose
-module unload cuda
+export CUDA_HOME=$BASE_PATH/cuda-11.7
+make cuda11x
+
+if [ ! -f "./bitsandbytes/libbitsandbytes.so" ]; then
+ # Control enters here if the compiled library ./bitsandbytes/libbitsandbytes.so doesn't exist.
+ echo "Compilation unsuccessul!" 1>&2
+ exit 64
+fi
+#CUDA_VERSION=117 python -m build
+#python -m twine upload dist/* --verbose