6 files changed, 84 insertions, 40 deletions
diff --git a/Makefile b/Makefile
index 4fbe918..36ab33c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,11 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
 ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
 
 GPP:= /usr/bin/g++
+ifeq ($(strip $(CUDA_HOME)),)
+  CUDA_HOME := $(shell which nvcc | rev | cut -d'/' -f3- | rev)
+endif # CUDA_HOME fallback: the nvcc path found on PATH with its last two components (bin/nvcc) removed
 NVCC := $(CUDA_HOME)/bin/nvcc
+
 ###########################################
 
 CSRC := $(ROOT_DIR)/csrc
@@ -15,58 +19,66 @@ INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
 
 # NVIDIA NVCC compilation flags
-#COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler 
-#COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler 
-#COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
-#COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
-#COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
-#COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
-#COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
-#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
-#
-## CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
-#CC_CUDA92 := -gencode arch=compute_30,code=sm_30
-#
-## Later versions of CUDA support the new architectures
-#CC_CUDA10x := -gencode arch=compute_30,code=sm_30
-#CC_CUDA10x += -gencode arch=compute_75,code=sm_75
-#
-#CC_CUDA110 := -gencode arch=compute_75,code=sm_75
-#CC_CUDA110 += -gencode arch=compute_80,code=sm_80
-#
-#CC_CUDA11x := -gencode arch=compute_75,code=sm_75
-#CC_CUDA11x += -gencode arch=compute_80,code=sm_80
-#CC_CUDA11x += -gencode arch=compute_86,code=sm_86
-
-COMPUTE_CAPABILITY := -gencode arch=compute_70,code=sm_70 # Volta
-
-
-all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+# Architectures passed to nvcc by every build target below, whatever the CUDA toolkit version.
+COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler
+COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler
+COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
+COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
+COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
+COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
+COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta
+
+# Toolkit-specific architecture flags, meant to be passed alongside
+# COMPUTE_CAPABILITY by the matching cudaXXX target.
+
+# CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not.
+CC_CUDA92 := -gencode arch=compute_30,code=sm_30
+
+# CUDA 10.x: CC 3.0 plus the newer CC 7.5.
+CC_CUDA10x := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_75,code=sm_75
+
+# CUDA 11.0: CC 7.5 and CC 8.0 (no CC 3.0).
+CC_CUDA110 := -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80
+# CUDA 11.x: the CUDA 11.0 set plus CC 8.6.
+CC_CUDA11x := $(CC_CUDA110) -gencode arch=compute_86,code=sm_86
+
+all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda110: $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
+cuda110: $(BUILD_DIR) env # CUDA 11.0 build: uses CC_CUDA110, because CC_CUDA10x contains CC 3.0 which CUDA >= 11.0 does not support
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda11x: $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
+cuda11x: $(BUILD_DIR) env
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
+.PHONY: env
+env: ## Print the tool paths and environment variables this build uses (never a file, hence .PHONY)
+	@printf '%s\n' "ENVIRONMENT" "============================"
+	@echo "NVCC path: $(NVCC)"
+	@echo "GPP path: $(GPP)"
+	@echo "CUDA_HOME: $(CUDA_HOME)"
+	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
+	@echo "PATH: $(PATH)"
+	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
+	@echo "============================"
+
 $(BUILD_DIR):
 	mkdir -p cuda_build
 	mkdir -p dependencies
diff --git a/README.md b/README.md
index 1ea0733..4a731b0 100644
--- a/README.md
+++ b/README.md
@@ -84,6 +84,10 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
 
 1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available)
 
+## Compile from source
+
+To compile from source, please follow the [compile_from_source.md](compile_from_source.md) instructions.
+
 ## License
 
 The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license.
diff --git a/compile_from_source.md b/compile_from_source.md
new file mode 100644
index 0000000..c6a8b18
--- /dev/null
+++ b/compile_from_source.md
@@ -0,0 +1,20 @@
+# Compiling from source
+
+Basic steps.
+1. `make cudaXXX` where `cudaXXX` is among `cuda92, cuda10x, cuda110, cuda11x`
+2. `CUDA_VERSION=XXX python setup.py install`
+
+To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). 
+
+For your convenience, there is an install script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing, you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this, you can add it to your `.bashrc` by executing these commands:
+```bash
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
+source ~/.bashrc
+```
+
+By default, the Makefile will look at your `CUDA_HOME` environment variable to find your CUDA version for compiling the library. If this variable is not set, it is inferred from the path of your `nvcc` compiler.
+
+Either `nvcc` needs to be in your `PATH` or the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed.
+
+If you have problems compiling the library with these instructions from source, please open an issue.
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index d1fa253..d8dfee1 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -715,7 +715,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
           switch(OPTIMIZER)
           {
               case ADAM: 
-									if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+									if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 									{
 										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
 										s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
@@ -868,7 +868,7 @@ __global__ void kOptimizer32bit1State(T *g, T *p,
       # pragma unroll 4
       for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
       {
-					if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+					if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 					{
 						switch(OPTIMIZER)
 						{
@@ -1475,7 +1475,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         {
             g_val = float(g_vals[j]);
             g_val *= gnorm_scale;
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
 							s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val));
@@ -1518,7 +1518,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         # pragma unroll N_PER_TH
         for(unsigned int j = 0; j < N_PER_TH; j++)
         {
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							g_vals[j] = (T)(((float)g_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
 							if(weight_decay > 0.0f)
@@ -1635,7 +1635,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         {
             g_val = float(g_vals[j]);
             g_val *= gnorm_scale;
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							if(weight_decay > 0.0f)
 								g_val += ((float)p_vals[j])*weight_decay;
@@ -1677,7 +1677,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         # pragma unroll N_PER_TH
         for(unsigned int j = 0; j < N_PER_TH; j++)
 				{
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							switch(OPTIMIZER)
 							{
diff --git a/errors_and_solutions.md b/errors_and_solutions.md
index 99461bc..dd99f7c 100644
--- a/errors_and_solutions.md
+++ b/errors_and_solutions.md
@@ -3,3 +3,6 @@
 This problem arises with the cuda version loaded by bitsandbytes is not supported by your GPU, or if you pytorch CUDA version mismatches. So solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Does it match with ``nvidia-smi``?
 
 If you are feeling lucky, you can also try to compile the library from source. This can be still problematic if your PATH variables have multiple cuda versions. As such, it is recommended to figure out path conflicts before you proceed with compilation.
+
+
+__If you encounter any other error not listed here, please create an issue. This will help resolve your problem and will help out others in the future.__
diff --git a/install_cuda.sh b/install_cuda.sh
new file mode 100644
index 0000000..6a4ff0c
--- /dev/null
+++ b/install_cuda.sh
@@ -0,0 +1,5 @@
+wget https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run  # CUDA 11.1.1 runfile installer
+bash cuda_11.1.1_455.32.00_linux.run --no-drm --no-man-page --override --installpath="$HOME/local" --librarypath="$HOME/local/lib" --toolkitpath="$HOME/local/cuda-11.1/" --toolkit --silent  # $HOME rather than ~: bash does not expand a tilde that follows "--option="
+echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/local/cuda-11.1/lib64/' >> ~/.bashrc  # single quotes: variables are expanded when .bashrc is read, not frozen now
+echo 'export PATH=$PATH:$HOME/local/cuda-11.1/bin/' >> ~/.bashrc
+source ~/.bashrc