From 591f60395a1e9c62f291e23c91af45cc699f072c Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 00:52:53 +0300
Subject: add memory efficient backward

---
 tests/test_modules.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index c0b3311..53a675f 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -14,13 +14,15 @@ class MockArgs(object):
 
 
 class MLP8bit(torch.nn.Module):
-    def __init__(self, dim1, dim2, has_fp16_weights=True, threshold=0.0):
+    def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0):
         super(MLP8bit, self).__init__()
         self.fc1 = bnb.nn.Linear8bitLt(
-            dim1, dim2, has_fp16_weights=has_fp16_weights, threshold=threshold
+            dim1, dim2, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
+            threshold=threshold
         )
         self.fc2 = bnb.nn.Linear8bitLt(
-            dim2, dim1, has_fp16_weights=has_fp16_weights, threshold=threshold
+            dim2, dim1, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward,
+            threshold=threshold
         )
 
     def forward(self, x):
@@ -451,9 +453,12 @@ names = ["threshold_{0}".format(vals) for vals in values]
 
 
 @pytest.mark.parametrize("threshold", values, ids=names)
-def test_linear8bitlt_no_fp16_weights(threshold):
+@pytest.mark.parametrize("memory_efficient_backward", [True, False])
+def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
     l1 = (
-        bnb.nn.Linear8bitLt(32, 64, threshold=threshold, has_fp16_weights=False)
+        bnb.nn.Linear8bitLt(
+            32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+        )
         .cuda()
         .half()
     )
@@ -513,7 +518,9 @@ def test_linear8bitlt_no_fp16_weights(threshold):
     assert mlp.fc2.weight.dtype == torch.int8
 
     mlp = (
-        MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False)
+        MLP8bit(
+            32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+        )
         .half()
         .to("cuda")
     )
@@ -532,7 +539,9 @@ def test_linear8bitlt_no_fp16_weights(threshold):
     assert mlp.fc2.weight.device.type == "cuda"
 
     mlp = (
-        MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False)
+        MLP8bit(
+            32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
+        )
         .to(torch.float16)
         .to("cuda")
     )
@@ -551,6 +560,7 @@ def test_linear8bitlt_no_fp16_weights(threshold):
     assert mlp.fc2.weight.device.type == "cuda"
 
 
+
 def test_linear8bitlt_fp32_bias():
     # casts model to fp16 -> int8 automatically
     l1 = bnb.nn.Linear8bitLt(32, 64, has_fp16_weights=False).cuda()
-- 
cgit v1.2.3


From 2cd047e35da3a421c4b491ff1a137e19b9c6c919 Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 00:55:53 +0300
Subject: run backward

---
 tests/test_modules.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 53a675f..d3992a9 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -554,11 +554,22 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
             assert mlp.fc1.state.idx is not None
         if threshold > 0:
             assert mlp.fc2.state.idx is not None
+
     assert mlp.fc1.weight.dtype == torch.int8
     assert mlp.fc2.weight.dtype == torch.int8
     assert mlp.fc1.weight.device.type == "cuda"
     assert mlp.fc2.weight.device.type == "cuda"
 
+    if memory_efficient_backward:
+        b1 = torch.randn(16, 8, 32, device="cuda", requires_grad=True, dtype=torch.half)
+        o1 = mlp(b1)
+        assert o1.dtype == torch.float16
+        assert o1.requires_grad
+        grad_proj = torch.randn_like(o1)
+
+        (o1 * grad_proj).sum().backward()
+
+
 
 
 def test_linear8bitlt_fp32_bias():
-- 
cgit v1.2.3


From d9b8789818191f9992733394d7ccfa00a63d4dba Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:13:58 +0300
Subject: debug

---
 tests/test_modules.py | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index d3992a9..c6e7f85 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -545,6 +545,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         .to(torch.float16)
         .to("cuda")
     )
+    w1, w2 = mlp.fc1.weight.clone(), mlp.fc2.weight.clone()
 
     for i in range(100):
         b1 = torch.randn(16, 8, 32, device="cuda").half()
@@ -567,8 +568,15 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         assert o1.requires_grad
         grad_proj = torch.randn_like(o1)
 
+        mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
 
+        grad_ref = grad_proj.flatten(2) @ w2 @ w1
+        assert torch.allclose(b1.grad, grad_ref)
+
+
+
+
 
 
-- 
cgit v1.2.3


From 6a826c41a6e4b9d8e6d2b8c768d769587cc85672 Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:20:34 +0300
Subject: pre-cast

---
 tests/test_modules.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index c6e7f85..01c9389 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -538,14 +538,11 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
     assert mlp.fc1.weight.device.type == "cuda"
     assert mlp.fc2.weight.device.type == "cuda"
 
-    mlp = (
-        MLP8bit(
+    mlp = MLP8bit(
             32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
         )
-        .to(torch.float16)
-        .to("cuda")
-    )
     w1, w2 = mlp.fc1.weight.clone(), mlp.fc2.weight.clone()
+    mlp = mlp.cuda().half()
 
     for i in range(100):
         b1 = torch.randn(16, 8, 32, device="cuda").half()
-- 
cgit v1.2.3


From 37f805bb44cd577422b792ae5bd1110f3eec69f6 Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:21:12 +0300
Subject: debug

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 01c9389..8108b35 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -567,7 +567,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
-
+        assert False, (w1, w2)
         grad_ref = grad_proj.flatten(2) @ w2 @ w1
         assert torch.allclose(b1.grad, grad_ref)
 
-- 
cgit v1.2.3


From 95dafc6475bc36490e213269d1028adfd4f75363 Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:22:31 +0300
Subject: cast before allclose

---
 tests/test_modules.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 8108b35..dbadea9 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -541,8 +541,8 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
     mlp = MLP8bit(
             32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
         )
-    w1, w2 = mlp.fc1.weight.clone(), mlp.fc2.weight.clone()
-    mlp = mlp.cuda().half()
+    w1, w2 = mlp.fc1.weight.clone(), mlp.fc2.weight.clone()  # note: we grad original weights before quantization,
+    mlp = mlp.cuda().half()  # and this line triggers quantization
 
     for i in range(100):
         b1 = torch.randn(16, 8, 32, device="cuda").half()
@@ -567,8 +567,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
-        assert False, (w1, w2)
-        grad_ref = grad_proj.flatten(2) @ w2 @ w1
+        grad_ref = grad_proj.flatten(2) @ w2.to(grad_proj.device) @ w1.to(grad_proj.device)
         assert torch.allclose(b1.grad, grad_ref)
 
 
-- 
cgit v1.2.3


From 28a9313ddcf09c40d6cea75b3fd932ef09b4c715 Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:24:27 +0300
Subject: cast before allclose

---
 tests/test_modules.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index dbadea9..bb65edb 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -541,7 +541,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
     mlp = MLP8bit(
             32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward
         )
-    w1, w2 = mlp.fc1.weight.clone(), mlp.fc2.weight.clone()  # note: we grad original weights before quantization,
+    w1, w2 = mlp.fc1.weight.clone().cuda(), mlp.fc2.weight.clone().cuda()  # grab weights before quantization,
     mlp = mlp.cuda().half()  # and this line triggers quantization
 
     for i in range(100):
@@ -567,7 +567,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
-        grad_ref = grad_proj.flatten(2) @ w2.to(grad_proj.device) @ w1.to(grad_proj.device)
+        grad_ref = grad_proj.flatten(2) @ w2.to() @ w1.to(grad_proj.device)
         assert torch.allclose(b1.grad, grad_ref)
 
 
-- 
cgit v1.2.3


From 725cc729931e21fd57377caba702da1ebecaa2ff Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:24:44 +0300
Subject: cast device

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index bb65edb..8e009b4 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -567,7 +567,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
-        grad_ref = grad_proj.flatten(2) @ w2.to() @ w1.to(grad_proj.device)
+        grad_ref = grad_proj.flatten(2) @ w2 @ w1
         assert torch.allclose(b1.grad, grad_ref)
 
 
-- 
cgit v1.2.3


From e4086a2758c171993f47b46cf0980030afe6db4a Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:24:57 +0300
Subject: cast device

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 8e009b4..049858c 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -567,7 +567,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
-        grad_ref = grad_proj.flatten(2) @ w2 @ w1
+        grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
         assert torch.allclose(b1.grad, grad_ref)
 
 
-- 
cgit v1.2.3


From 01b4c6a048abad182fc7c40038c232ce1493c54f Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:25:56 +0300
Subject: cast device

---
 tests/test_modules.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 049858c..d2ef856 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -568,7 +568,8 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         mlp.zero_grad()
         (o1 * grad_proj).sum().backward()
         grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
-        assert torch.allclose(b1.grad, grad_ref)
+        scale = grad_ref.abs().mean()
+        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.1 * scale)
 
 
-- 
cgit v1.2.3


From 32a9a88f987e26c5b891ce1f881f008307b4548c Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:26:12 +0300
Subject: cast device

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index d2ef856..163edf6 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -569,7 +569,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         (o1 * grad_proj).sum().backward()
         grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
         scale = grad_ref.abs().mean()
-        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.1 * scale)
+        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.01 * scale)
 
 
-- 
cgit v1.2.3


From cff3a7159943369841675dbc1076e555ffb2260b Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Sun, 18 Sep 2022 01:26:25 +0300
Subject: cast device

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 163edf6..faf91b8 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -569,7 +569,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         (o1 * grad_proj).sum().backward()
         grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
         scale = grad_ref.abs().mean()
-        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.01 * scale)
+        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.05 * scale)
 
 
-- 
cgit v1.2.3


From a07825ac31eb5585bd75f9788880536d5fc77f3a Mon Sep 17 00:00:00 2001
From: justheuristic <justheuristic@gmail.com>
Date: Tue, 20 Sep 2022 06:40:36 +0300
Subject: review

---
 tests/test_modules.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index faf91b8..235acde 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -569,12 +569,10 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         (o1 * grad_proj).sum().backward()
         grad_ref = grad_proj.flatten(2) @ w2.half() @ w1.half()
         scale = grad_ref.abs().mean()
-        assert torch.allclose(b1.grad, grad_ref, rtol=0, atol=0.05 * scale)
-
-
-
-
 
+        torch.testing.assert_allclose(b1.grad, grad_ref, rtol=0, atol=0.05 * scale)
+        idx = torch.isclose(b1.grad, grad_ref, atol=0.01 * scale, rtol=0.1)
+        assert (idx == 0).sum().item() <= b1.numel() * 0.0
 
 
 def test_linear8bitlt_fp32_bias():
-- 
cgit v1.2.3


From 292a47871603cc1ebe620221358d571a8f5c6d8f Mon Sep 17 00:00:00 2001
From: Tim Dettmers <dettmers@cs.washington.edu>
Date: Tue, 20 Sep 2022 06:42:05 +0300
Subject: set threshold

---
 tests/test_modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tests/test_modules.py')

diff --git a/tests/test_modules.py b/tests/test_modules.py
index 235acde..2879846 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -572,7 +572,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
 
         torch.testing.assert_allclose(b1.grad, grad_ref, rtol=0, atol=0.05 * scale)
         idx = torch.isclose(b1.grad, grad_ref, atol=0.01 * scale, rtol=0.1)
-        assert (idx == 0).sum().item() <= b1.numel() * 0.0
+        assert (idx == 0).sum().item() <= b1.numel() * 0.005
 
 
 def test_linear8bitlt_fp32_bias():
-- 
cgit v1.2.3