author     Ben Sima <ben@bsima.me>    2023-08-10 21:11:23 -0400
committer  Ben Sima <ben@bsima.me>    2023-08-16 14:29:43 -0400
commit     247678afc7c74c98f64e8d19f67355d128946974 (patch)
tree       6bde2696aab9029f67ff6eb136f26b81bcd5a4c4 /Biz
parent     4e67ef22a7508150798413081bf8a5bb4adab6e5 (diff)
Add llama-cpp and exllama
Diffstat (limited to 'Biz')
-rw-r--r--  Biz/Bild/Deps.nix              8
-rw-r--r--  Biz/Bild/Deps/exllama.nix     64
-rw-r--r--  Biz/Bild/Deps/llama-cpp.nix   41
-rw-r--r--  Biz/Bild/Sources.json         24
4 files changed, 134 insertions, 3 deletions
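
The two new attributes land alongside the existing overlay outputs, so they can be consumed like any other package from Deps.nix. A minimal sketch of pulling both into a dev shell, assuming the overlay is applied to a nixpkgs instance bound to pkgs (the attrset below is illustrative and not part of this patch):

    devShell = pkgs.mkShell {
      packages = [
        pkgs.llama-cpp                                    # top-level attribute added below
        (pkgs.python3.withPackages (ps: [ ps.exllama ]))  # exposed via python3 packageOverrides
      ];
    };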
diff --git a/Biz/Bild/Deps.nix b/Biz/Bild/Deps.nix
index 8fc9fc8..da18d89 100644
--- a/Biz/Bild/Deps.nix
+++ b/Biz/Bild/Deps.nix
@@ -33,9 +33,9 @@ in rec
python3 = super.python3.override {
packageOverrides = _: pysuper: with pysuper.pkgs.python3Packages; {
- accelerate = callPackage ./Deps/accelerate.nix {};
- bitsandbytes = callPackage ./Deps/bitsandbytes.nix {};
- lion-pytorch = callPackage ./Deps/lion-pytorch.nix {};
+ exllama = callPackage ./Deps/exllama.nix {
+ cudaPackages = super.pkgs.cudaPackages_11_7;
+ };
};
};
@@ -70,5 +70,7 @@ in rec
];
};
+ llama-cpp = super.callPackage ./Deps/llama-cpp.nix {};
+
nostr-rs-relay = super.callPackage ./Deps/nostr-rs-relay.nix {};
}
diff --git a/Biz/Bild/Deps/exllama.nix b/Biz/Bild/Deps/exllama.nix
new file mode 100644
index 0000000..54d6df1
--- /dev/null
+++ b/Biz/Bild/Deps/exllama.nix
@@ -0,0 +1,64 @@
+{ lib
+, sources
+, buildPythonPackage
+, pythonOlder
+, fetchFromGitHub
+, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
+, safetensors
+, sentencepiece
+, ninja
+, cudaPackages
+, addOpenGLRunpath
+, which
+, gcc11 # cuda 11.7 requires g++ <12
+}:
+
+buildPythonPackage rec {
+ pname = "exllama";
+ version = sources.exllama.rev;
+ format = "setuptools";
+ disabled = pythonOlder "3.9";
+
+ src = sources.exllama;
+
+ # I only care about compiling for the Ampere architecture, which is what my
+ # RTX 3090 Ti is, and for some reason (nix sandbox?) the torch extension
+ # builder cannot autodetect the arch, so it is pinned explicitly here.
+ TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
+
+ CUDA_HOME = "${cudaPackages.cuda_nvcc}";
+
+ nativeBuildInputs = [
+ gcc11
+ which
+ addOpenGLRunpath
+ cudaPackages.cuda_nvcc
+ cudaPackages.cuda_cudart
+ ];
+
+ propagatedBuildInputs = [
+ torch safetensors sentencepiece ninja
+ cudaPackages.cudatoolkit
+ ];
+
+ doCheck = false; # no tests currently
+ pythonImportsCheck = [
+ "exllama"
+ "exllama.cuda_ext"
+ "exllama.generator"
+ "exllama.lora"
+ "exllama.model"
+ "exllama.tokenizer"
+ ];
+
+ meta = with lib; {
+ description = ''
+ A more memory-efficient rewrite of the HF transformers implementation of
+ Llama for use with quantized weights.
+ '';
+ homepage = "https://github.com/jllllll/exllama";
+ license = licenses.mit;
+ maintainers = with maintainers; [ bsima ];
+ };
+}
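
The arch list in this derivation is pinned for one specific GPU; a machine with a different card can change it without editing the file, since the value is just an environment variable on the derivation. A minimal sketch, assuming the package is reachable as python3.pkgs.exllama (7.5 is the CUDA compute capability for Turing cards):

    # illustrative only: rebuild exllama for a different GPU generation
    exllama-turing = python3.pkgs.exllama.overridePythonAttrs (_: {
      TORCH_CUDA_ARCH_LIST = "7.5+PTX";
    });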
diff --git a/Biz/Bild/Deps/llama-cpp.nix b/Biz/Bild/Deps/llama-cpp.nix
new file mode 100644
index 0000000..85bd778
--- /dev/null
+++ b/Biz/Bild/Deps/llama-cpp.nix
@@ -0,0 +1,41 @@
+{ stdenv
+, pkgs
+, sources
+, python3
+, cmake
+, pkgconfig
+, openmpi
+, cudaPackages
+}:
+let
+ llama-python = python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
+in stdenv.mkDerivation {
+ name = "llama.cpp";
+ version = sources.llama-cpp.rev;
+
+ src = sources.llama-cpp;
+
+ postPatch = ''
+ substituteInPlace ./ggml-metal.m \
+ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+ substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
+ '';
+
+ nativeBuildInputs = [ cmake pkgconfig ];
+ buildInputs = [ openmpi cudaPackages.cudatoolkit ];
+
+ cmakeFlags = [
+ "-DLLAMA_BUILD_SERVER=ON"
+ "-DLLAMA_MPI=ON"
+ "-DBUILD_SHARED_LIBS=ON"
+ "-DCMAKE_SKIP_BUILD_RPATH=ON"
+ "-DLLAMA_CUBLAS=ON"
+ ];
+
+ postInstall = ''
+ mv $out/bin/main $out/bin/llama
+ mv $out/bin/server $out/bin/llama-server
+ '';
+
+ meta.mainProgram = "llama";
+}
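
Because postInstall renames the binaries and meta.mainProgram is set, downstream Nix code can resolve the main executable without hard-coding a path. A minimal sketch (the bindings below are illustrative; lib.getExe reads meta.mainProgram):

    # resolves to "${llama-cpp}/bin/llama" via meta.mainProgram
    llamaExe = lib.getExe llama-cpp;
    # the server binary is referenced explicitly, since only one mainProgram can be set
    llamaServerExe = "${llama-cpp}/bin/llama-server";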
diff --git a/Biz/Bild/Sources.json b/Biz/Bild/Sources.json
index 6213d95..5d05ea0 100644
--- a/Biz/Bild/Sources.json
+++ b/Biz/Bild/Sources.json
@@ -24,6 +24,18 @@
"url": "https://github.com/docopt/docopt.hs/archive/cdd32227eaff46fb57330ced96d5c290cbd9e035.tar.gz",
"url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
},
+ "exllama": {
+ "branch": "master",
+ "description": "A more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights.",
+ "homepage": null,
+ "owner": "jllllll",
+ "repo": "exllama",
+ "rev": "3ddf3bd39bdff330623f3740cda4ae1537ef86d9",
+ "sha256": "0g87xm71jmw5bl4ya5dbk72fghhhwvrjqspaayq7zass16jixr1d",
+ "type": "tarball",
+ "url": "https://github.com/jllllll/exllama/archive/3ddf3bd39bdff330623f3740cda4ae1537ef86d9.tar.gz",
+ "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
+ },
"ghc-exactprint": {
"branch": "master",
"description": "GHC version of haskell-src-exts exactPrint",
@@ -64,6 +76,18 @@
"url_template": "https://gitlab.com/kavalogic-inc/inspekt3d/-/archive/<version>/inspekt3d-<version>.tar.gz",
"version": "703f52ccbfedad2bf5240bf8183d1b573c9d54ef"
},
+ "llama-cpp": {
+ "branch": "master",
+ "description": "Port of Facebook's LLaMA model in C/C++",
+ "homepage": null,
+ "owner": "ggerganov",
+ "repo": "llama.cpp",
+ "rev": "e59fcb2bc129881f4a269fee748fb38bce0a64de",
+ "sha256": "18171pv8ymgkvv2q3y8f6l64sm9dmpa0w7yqipzhdxx2n9m1x6ln",
+ "type": "tarball",
+ "url": "https://github.com/ggerganov/llama.cpp/archive/e59fcb2bc129881f4a269fee748fb38bce0a64de.tar.gz",
+ "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
+ },
"niv": {
"branch": "master",
"description": "Easy dependency management for Nix projects",