Merge branch 'bugfix_cpu_and_cc' into main

author: Tim Dettmers <tim.dettmers@gmail.com> 2022-09-05 16:23:03 -0700
committer: Tim Dettmers <tim.dettmers@gmail.com> 2022-09-05 16:23:03 -0700
commit: eab4d8232d558f2e6bd7f7cc3d00e2e6e94f4e80 (patch)
tree: 1a70991e0d9d5632a20388bf87e4c56dfb06d64e /tests/test_functional.py
parent: 5d9bc7a301618d6642a70951ac46637d4889c82b (diff)
parent: 8e7053a988f7bf6c0b2605a3d0ec461f5d9b05e4 (diff)
1 files changed, 24 insertions, 24 deletions
diff --git a/tests/test_functional.py b/tests/test_functional.py
index 09a01d8..14cc21e 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -1813,16 +1813,16 @@ def test_spmm_coo_dequant(dim1, dim2, dtype):
 
 
 batch_size = 1
-seqdim = 2048
+seqdim = 1
 values = []
-values.append((batch_size, seqdim, 768, 4 * 768))
+#values.append((batch_size, seqdim, 768, 4 * 768))
 # values.append((batch_size, seqdim, 1024, 4*1024))
 # values.append((batch_size, seqdim, 1536, 4*1536))
 # values.append((batch_size, seqdim, 2048, 4*2048))
 # values.append((batch_size, seqdim, 2560, 4*2560))
 # values.append((batch_size, seqdim, 4096, 4*4096))
 # values.append((batch_size, seqdim, 5140, 4*5140))
-# values.append((batch_size, seqdim, 12288, 4*12288))
+values.append((batch_size, seqdim, 12288, 4*12288))
 names = [
     "batch_{0}_seq_{1}_model_{2}_hidden_{3}".format(*vals) for vals in values
 ]
@@ -1830,6 +1830,7 @@ names = [
 
 @pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names)
 def test_bench_matmul(batch, seq, model, hidden):
+    iters = 128
     formatB = F.get_special_format_str()
 
     A = torch.randn(batch, seq, model, device="cuda").half()
@@ -1848,28 +1849,33 @@ def test_bench_matmul(batch, seq, model, hidden):
     linearMixedBit.eval()
 
     # warmup
-    for i in range(100):
+    for i in range(iters):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print("")
 
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         torch.matmul(A, B.t())
     torch.cuda.synchronize()
     print(
-        f"pytorch: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
+        f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
     )
 
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         bnb.matmul(A, B)
     torch.cuda.synchronize()
-    print(
-        f"bnb lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
+
+    torch.cuda.synchronize()
+    t0 = time.time()
+    for i in range(iters):
+        bnb.matmul(A, B, threshold=6.0)
+    torch.cuda.synchronize()
+    print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0)
     C32A, SA = F.transform(CA, "col32")
@@ -1877,18 +1883,16 @@ def test_bench_matmul(batch, seq, model, hidden):
     CxB, SB = F.transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB)
     torch.cuda.synchronize()
-    print(
-        f"igemmlt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     BA, statsB = F.vectorwise_quant(B, dim=1)
     CxB, SB = F.nvidia_transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         A2 = A.view(-1, A.shape[-1]).contiguous()
         CA, statsA = F.vectorwise_quant(A2, dim=1)
         C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1896,15 +1900,13 @@ def test_bench_matmul(batch, seq, model, hidden):
         Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
         F.vectorwise_mm_dequant(Cout, statsA, statsB.t())
     torch.cuda.synchronize()
-    print(
-        f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear")
     CxB, SB = F.nvidia_transform(CB, to_order=formatB)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         A2 = A.view(-1, A.shape[-1]).contiguous()
         CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear")
         C32A, SA = F.nvidia_transform(CA, "col32")
@@ -1912,14 +1914,12 @@ def test_bench_matmul(batch, seq, model, hidden):
         Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32)
         out = Cout * statsB * statsA * (1.0 / (127 * 127))
     torch.cuda.synchronize()
-    print(
-        f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s"
-    )
+    #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s")
 
     linear8bit(A)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         linear8bit(A)
     torch.cuda.synchronize()
     print(
@@ -1929,7 +1929,7 @@ def test_bench_matmul(batch, seq, model, hidden):
     linearMixedBit(A)
     torch.cuda.synchronize()
     t0 = time.time()
-    for i in range(100):
+    for i in range(iters):
         linearMixedBit(A)
     torch.cuda.synchronize()
     print(
author	Tim Dettmers <tim.dettmers@gmail.com>	2022-09-05 16:23:03 -0700
committer	Tim Dettmers <tim.dettmers@gmail.com>	2022-09-05 16:23:03 -0700
commit	eab4d8232d558f2e6bd7f7cc3d00e2e6e94f4e80 (patch)
tree	1a70991e0d9d5632a20388bf87e4c56dfb06d64e /tests/test_functional.py
parent	5d9bc7a301618d6642a70951ac46637d4889c82b (diff)
parent	8e7053a988f7bf6c0b2605a3d0ec461f5d9b05e4 (diff)