summaryrefslogtreecommitdiff
path: root/tests
diff options
context:
space:
mode:
authorDmitry Baranchuk <dmitrybaranchuk@gmail.com>2022-09-10 19:33:21 -0700
committerGitHub <noreply@github.com>2022-09-10 19:33:21 -0700
commit843ad0631c65eabc7f64e80906ecf5482cc1a036 (patch)
tree07ab541ec59ab3474a711c155daa118fc0ae6864 /tests
parent8d34d36f150b0fd4914cdb56d4e3bda34c029ccc (diff)
parent2e630b55f51d454f3bd723dffda68a07ef93190c (diff)
Merge pull request #1 from TimDettmers/main
Update main branch
Diffstat (limited to 'tests')
-rw-r--r--tests/test_autograd.py2
-rw-r--r--tests/test_functional.py48
2 files changed, 26 insertions, 24 deletions
diff --git a/tests/test_autograd.py b/tests/test_autograd.py
index 0cd17c9..bae26de 100644
--- a/tests/test_autograd.py
+++ b/tests/test_autograd.py
@@ -40,6 +40,7 @@ names = [
ids=names,
)
def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose):
+ if not torch.cuda.is_available(): pytest.skip('No GPU found.')
if dim2 > 0:
dim2 = dim2 - (dim2 % 16)
dim3 = dim3 - (dim3 % 16)
@@ -306,6 +307,7 @@ def test_matmullt(
has_fp16_weights,
has_bias
):
+ if not torch.cuda.is_available(): pytest.skip('No GPU found.')
dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2)
dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3)
outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda")
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 09a01d8..14cc21e 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1813,16 +1813,16 @@ def test_spmm_coo_dequant(dim1, dim2, dtype):
batch_size = 1
-seqdim = 2048
+seqdim = 1
values = []
-values.append((batch_size, seqdim, 768, 4 * 768))
+#values.append((batch_size, seqdim, 768, 4 * 768))
# values.append((batch_size, seqdim, 1024, 4*1024))
# values.append((batch_size, seqdim, 1536, 4*1536))
# values.append((batch_size, seqdim, 2048, 4*2048))
# values.append((batch_size, seqdim, 2560, 4*2560))
# values.append((batch_size, seqdim, 4096, 4*4096))
# values.append((batch_size, seqdim, 5140, 4*5140))
-# values.append((batch_size, seqdim, 12288, 4*12288))
+values.append((batch_size, seqdim, 12288, 4*12288))
names = [
"batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
]
@@ -1830,6 +1830,7 @@ names = [
@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names)
def test_bench_matmul(batch, seq, model, hidden):
+ iters = 128
formatB = F.get_special_format_str()
A = torch.randn(batch, seq, model, device="cuda").half()
@@ -1848,28 +1849,33 @@ def test_bench_matmul(batch, seq, model, hidden):
linearMixedBit.eval()
# warmup
- for i in range(100):
+ for i in range(iters):
torch.matmul(A, B.t())
torch.cuda.synchronize()
print("")
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
torch.matmul(A, B.t())
torch.cuda.synchronize()
print(
- f"pytorch: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+ f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
bnb.matmul(A, B)
torch.cuda.synchronize()
- print(
- f"bnb lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
- )
+ print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+
+ torch.cuda.synchronize()
+ t0 = time.time()
+ for i in range(iters):
+ bnb.matmul(A, B, threshold=6.0)
+ torch.cuda.synchronize()
+ print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0)
C32A, SA = F.transform(CA, "col32")
@@ -1877,18 +1883,16 @@ def test_bench_matmul(batch, seq, model, hidden):
CxB, SB = F.transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB)
torch.cuda.synchronize()
- print(
- f"igemmlt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
- )
+ print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
BA, statsB = F.vectorwise_quant(B, dim=1)
CxB, SB = F.nvidia_transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
A2 = A.view(-1, A.shape[-1]).contiguous()
CA, statsA = F.vectorwise_quant(A2, dim=1)
C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1896,15 +1900,13 @@ def test_bench_matmul(batch, seq, model, hidden):
Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
F.vectorwise_mm_dequant(Cout, statsA, statsB.t())
torch.cuda.synchronize()
- print(
- f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
- )
+ #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear")
CxB, SB = F.nvidia_transform(CB, to_order=formatB)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
A2 = A.view(-1, A.shape[-1]).contiguous()
CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear")
C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1912,14 +1914,12 @@ def test_bench_matmul(batch, seq, model, hidden):
Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
out = Cout * statsB * statsA * (1.0 / (127 * 127))
torch.cuda.synchronize()
- print(
- f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
- )
+ #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
linear8bit(A)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
linear8bit(A)
torch.cuda.synchronize()
print(
@@ -1929,7 +1929,7 @@ def test_bench_matmul(batch, seq, model, hidden):
linearMixedBit(A)
torch.cuda.synchronize()
t0 = time.time()
- for i in range(100):
+ for i in range(iters):
linearMixedBit(A)
torch.cuda.synchronize()
print(