From 7e0fb655e1e040221054886fbee9d5682aa6e4e2 Mon Sep 17 00:00:00 2001
From: Tim Dettmers
Date: Tue, 23 Aug 2022 13:59:34 -0700
Subject: Some initial code. Needs to be tested.

---
 bitsandbytes/cuda_setup/main.py |  9 ++++++--
 bitsandbytes/functional.py      |  5 ++---
 bitsandbytes/optim/__init__.py  | 15 ++++++-------
 tests/test_autograd.py          |  2 ++
 tests/test_functional.py        | 48 ++++++++++++++++++++---------------------
 5 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index 975b772..d305c64 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -17,6 +17,7 @@ evaluation:
 """
 
 import ctypes
+import torch
 
 from pathlib import Path
 from ..utils import execute_and_return
@@ -28,7 +29,7 @@ def check_cuda_result(cuda, result_val):
     if result_val != 0:
         error_str = ctypes.c_char_p()
         cuda.cuGetErrorString(result_val, ctypes.byref(error_str))
-        raise Exception(f"CUDA exception! Error code: {error_str.value.decode()}")
+        print(f"CUDA exception! Error code: {error_str.value.decode()}")
 
 def get_cuda_version(cuda, cudart_path):
     # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
@@ -57,7 +58,7 @@ def get_cuda_lib_handle():
         cuda = ctypes.CDLL("libcuda.so")
     except OSError:
         # TODO: shouldn't we error or at least warn here?
-        raise Exception('CUDA SETUP: ERROR! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
+        print('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
         return None
     check_cuda_result(cuda, cuda.cuInit(0))
 
@@ -119,6 +120,10 @@ def evaluate_cuda_setup():
     print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
     print('='*80)
     binary_name = "libbitsandbytes_cpu.so"
+    #if not torch.cuda.is_available():
+        #print('No GPU detected. Loading CPU library...')
+        #return binary_name
+
     cudart_path = determine_cuda_runtime_lib_path()
     if cudart_path is None:
         print(
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 6637554..745e7e4 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1686,11 +1686,10 @@ def double_quant(
 
 
 def get_special_format_str():
+    if not torch.cuda.is_available(): return 'col_turing'
     major, minor = torch.cuda.get_device_capability()
     if major < 7:
-        print(
-            f"Device with CUDA capability of {major} not supported for 8-bit matmul. Device has no tensor cores!"
-        )
+        print(f"Device with CUDA capability of {major} not supported for 8-bit matmul. Device has no tensor cores!")
         assert major >= 7
 
     if major == 7: return 'col_turing'
diff --git a/bitsandbytes/optim/__init__.py b/bitsandbytes/optim/__init__.py
index a76d717..d18f1d1 100644
--- a/bitsandbytes/optim/__init__.py
+++ b/bitsandbytes/optim/__init__.py
@@ -5,13 +5,12 @@
 
 from bitsandbytes.cextension import COMPILED_WITH_CUDA
 
-if COMPILED_WITH_CUDA:
-    from .adam import Adam, Adam8bit, Adam32bit
-    from .adamw import AdamW, AdamW8bit, AdamW32bit
-    from .sgd import SGD, SGD8bit, SGD32bit
-    from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
-    from .lamb import LAMB, LAMB8bit, LAMB32bit
-    from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
-    from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
+from .adam import Adam, Adam8bit, Adam32bit
+from .adamw import AdamW, AdamW8bit, AdamW32bit
+from .sgd import SGD, SGD8bit, SGD32bit
+from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS
+from .lamb import LAMB, LAMB8bit, LAMB32bit
+from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit
+from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit
 
 from .optimizer import GlobalOptimManager
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 0cd17c9..bae26de 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -40,6 +40,7 @@ names = [
     ids=names,
 )
 def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
+    if not torch.cuda.is_available(): pytest.skip('No GPU found.')
     if dim2 > 0:
         dim2 = dim2 - (dim2 % 16)
     dim3 = dim3 - (dim3 % 16)
@@ -306,6 +307,7 @@ def test_matmullt(
     has_fp16_weights,
     has_bias
 ):
+    if not torch.cuda.is_available(): pytest.skip('No GPU found.')
     dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
     dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
     outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda")
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 09a01d8..14cc21e 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1813,16 +1813,16 @@ def test_spmm_coo_dequant(dim1, dim2, dtype):
 
 
 batch_size = 1
-seqdim = 2048
+seqdim = 1
 values = []
-values.append((batch_size, seqdim, 768, 4 * 768))
+#values.append((batch_size, seqdim, 768, 4 * 768))
 # values.append((batch_size, seqdim, 1024, 4*1024))
 # values.append((batch_size, seqdim, 1536, 4*1536))
 # values.append((batch_size, seqdim, 2048, 4*2048))
 # values.append((batch_size, seqdim, 2560, 4*2560))
 # values.append((batch_size, seqdim, 4096, 4*4096))
 # values.append((batch_size, seqdim, 5140, 4*5140))
-# values.append((batch_size, seqdim, 12288, 4*12288))
+values.append((batch_size, seqdim, 12288, 4*12288))
 names = [
     "batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
 ]
@@ -1830,6 +1830,7 @@ names = [
 
 @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names)
 def test_bench_matmul(batch, seq, model, hidden):
+    iters = 128
     formatB = F.get_special_format_str()
     A = torch.randn(batch, seq, model, device="cuda").half()
     B = torch.empty(hidden, model, dtype=torch.float16, device="cuda")
@@ -1848,28 +1849,33 @@ def test_bench_matmul(batch, seq, model, hidden):
     linearMixedBit.eval()
 
     # warmup
-    for i in range(100):
+    for i in range(iters):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print("")
 
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
     )
 
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
-    print(
-        f"bnb lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+
+    torch.cuda.synchronize()
+    t0 = time.time()
+    for i in range(iters):
+        bnb.matmul(A, B, threshold=6.0)
+    torch.cuda.synchronize()
+    print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0)
     C32A, SA = F.transform(CA, "col32")
@@ -1877,18 +1883,16 @@ def test_bench_matmul(batch, seq, model, hidden):
     CxB, SB = F.transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB)
     torch.cuda.synchronize()
-    print(
-        f"igemmlt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     BA, statsB = F.vectorwise_quant(B, dim=1)
     CxB, SB = F.nvidia_transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         A2 = A.view(-1, A.shape[-1]).contiguous()
         CA, statsA = F.vectorwise_quant(A2, dim=1)
         C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1896,15 +1900,13 @@ def test_bench_matmul(batch, seq, model, hidden):
         Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
         F.vectorwise_mm_dequant(Cout, statsA, statsB.t())
     torch.cuda.synchronize()
-    print(
-        f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear")
     CxB, SB = F.nvidia_transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         A2 = A.view(-1, A.shape[-1]).contiguous()
         CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear")
         C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1912,14 +1914,12 @@ def test_bench_matmul(batch, seq, model, hidden):
         Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
         out = Cout * statsB * statsA * (1.0 / (127 * 127))
     torch.cuda.synchronize()
-    print(
-        f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     linear8bit(A)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
@@ -1929,7 +1929,7 @@ def test_bench_matmul(batch, seq, model, hidden):
     linearMixedBit(A)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
-- 
cgit v1.2.3