From 247678afc7c74c98f64e8d19f67355d128946974 Mon Sep 17 00:00:00 2001
From: Ben Sima
Date: Thu, 10 Aug 2023 21:11:23 -0400
Subject: Add llama-cpp and exllama

---
 Biz/Bild/Deps.nix           |  8 +++---
 Biz/Bild/Deps/exllama.nix   | 64 +++++++++++++++++++++++++++++++++++++++++++++
 Biz/Bild/Deps/llama-cpp.nix | 41 +++++++++++++++++++++++++++++
 Biz/Bild/Sources.json       | 24 +++++++++++++++++
 4 files changed, 134 insertions(+), 3 deletions(-)
 create mode 100644 Biz/Bild/Deps/exllama.nix
 create mode 100644 Biz/Bild/Deps/llama-cpp.nix

(limited to 'Biz/Bild')

diff --git a/Biz/Bild/Deps.nix b/Biz/Bild/Deps.nix
index 8fc9fc8..da18d89 100644
--- a/Biz/Bild/Deps.nix
+++ b/Biz/Bild/Deps.nix
@@ -33,9 +33,9 @@ in rec
   python3 = super.python3.override {
     packageOverrides = _: pysuper: with pysuper.pkgs.python3Packages; {
-      accelerate = callPackage ./Deps/accelerate.nix {};
-      bitsandbytes = callPackage ./Deps/bitsandbytes.nix {};
-      lion-pytorch = callPackage ./Deps/lion-pytorch.nix {};
+      exllama = callPackage ./Deps/exllama.nix {
+        cudaPackages = super.pkgs.cudaPackages_11_7;
+      };
     };
   };
@@ -70,5 +70,7 @@ in rec
     ];
   };
 
+  llama-cpp = super.callPackage ./Deps/llama-cpp.nix {};
+
   nostr-rs-relay = super.callPackage ./Deps/nostr-rs-relay.nix {};
 }
diff --git a/Biz/Bild/Deps/exllama.nix b/Biz/Bild/Deps/exllama.nix
new file mode 100644
index 0000000..54d6df1
--- /dev/null
+++ b/Biz/Bild/Deps/exllama.nix
@@ -0,0 +1,64 @@
+{ lib
+, sources
+, buildPythonPackage
+, pythonOlder
+, fetchFromGitHub
+, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
+, safetensors
+, sentencepiece
+, ninja
+, cudaPackages
+, addOpenGLRunpath
+, which
+, gcc11 # cuda 11.7 requires g++ <12
+}:
+
+buildPythonPackage rec {
+  pname = "exllama";
+  version = sources.exllama.rev;
+  format = "setuptools";
+  disabled = pythonOlder "3.9";
+
+  src = sources.exllama;
+
+  # I only care about compiling for the Ampere architecture, which is what my
+  # RTX 3090 TI is, and for some reason (nix sandbox?) the torch extension
+  # builder
+  # cannot autodetect the arch
+  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
+
+  CUDA_HOME = "${cudaPackages.cuda_nvcc}";
+
+  nativeBuildInputs = [
+    gcc11
+    which
+    addOpenGLRunpath
+    cudaPackages.cuda_nvcc
+    cudaPackages.cuda_cudart
+  ];
+
+  propagatedBuildInputs = [
+    torch safetensors sentencepiece ninja
+    cudaPackages.cudatoolkit
+  ];
+
+  doCheck = false; # no tests currently
+  pythonImportsCheck = [
+    "exllama"
+    "exllama.cuda_ext"
+    "exllama.generator"
+    "exllama.lora"
+    "exllama.model"
+    "exllama.tokenizer"
+  ];
+
+  meta = with lib; {
+    description = ''
+      A more memory-efficient rewrite of the HF transformers implementation of
+      Llama for use with quantized weights.
+    '';
+    homepage = "https://github.com/jllllll/exllama";
+    license = licenses.mit;
+    maintainers = with maintainers; [ bsima ];
+  };
+}
diff --git a/Biz/Bild/Deps/llama-cpp.nix b/Biz/Bild/Deps/llama-cpp.nix
new file mode 100644
index 0000000..85bd778
--- /dev/null
+++ b/Biz/Bild/Deps/llama-cpp.nix
@@ -0,0 +1,41 @@
+{ stdenv
+, pkgs
+, sources
+, python3
+, cmake
+, pkgconfig
+, openmpi
+, cudaPackages
+}:
+let
+  llama-python = python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
+in stdenv.mkDerivation {
+  name = "llama.cpp";
+  version = sources.llama-cpp.rev;
+
+  src = sources.llama-cpp;
+
+  postPatch = ''
+    substituteInPlace ./ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
+  '';
+
+  nativeBuildInputs = [ cmake pkgconfig ];
+  buildInputs = [ openmpi cudaPackages.cudatoolkit ];
+
+  cmakeFlags = [
+    "-DLLAMA_BUILD_SERVER=ON"
+    "-DLLAMA_MPI=ON"
+    "-DBUILD_SHARED_LIBS=ON"
+    "-DCMAKE_SKIP_BUILD_RPATH=ON"
+    "-DLLAMA_CUBLAS=ON"
+  ];
+
+  postInstall = ''
+    mv $out/bin/main $out/bin/llama
+    mv $out/bin/server $out/bin/llama-server
+  '';
+
+  meta.mainProgram = "llama";
+}
diff --git a/Biz/Bild/Sources.json b/Biz/Bild/Sources.json
index 6213d95..5d05ea0 100644
--- a/Biz/Bild/Sources.json
+++ b/Biz/Bild/Sources.json
@@ -24,6 +24,18 @@
         "url": "https://github.com/docopt/docopt.hs/archive/cdd32227eaff46fb57330ced96d5c290cbd9e035.tar.gz",
         "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
     },
+    "exllama": {
+        "branch": "master",
+        "description": "A more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights.",
+        "homepage": null,
+        "owner": "jllllll",
+        "repo": "exllama",
+        "rev": "3ddf3bd39bdff330623f3740cda4ae1537ef86d9",
+        "sha256": "0g87xm71jmw5bl4ya5dbk72fghhhwvrjqspaayq7zass16jixr1d",
+        "type": "tarball",
+        "url": "https://github.com/jllllll/exllama/archive/3ddf3bd39bdff330623f3740cda4ae1537ef86d9.tar.gz",
+        "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
+    },
     "ghc-exactprint": {
         "branch": "master",
         "description": "GHC version of haskell-src-exts exactPrint",
@@ -64,6 +76,18 @@
         "url_template": "https://gitlab.com/kavalogic-inc/inspekt3d/-/archive/<version>/inspekt3d-<version>.tar.gz",
         "version": "703f52ccbfedad2bf5240bf8183d1b573c9d54ef"
     },
+    "llama-cpp": {
+        "branch": "master",
+        "description": "Port of Facebook's LLaMA model in C/C++",
+        "homepage": null,
+        "owner": "ggerganov",
+        "repo": "llama.cpp",
+        "rev": "e59fcb2bc129881f4a269fee748fb38bce0a64de",
+        "sha256": "18171pv8ymgkvv2q3y8f6l64sm9dmpa0w7yqipzhdxx2n9m1x6ln",
+        "type": "tarball",
+        "url": "https://github.com/ggerganov/llama.cpp/archive/e59fcb2bc129881f4a269fee748fb38bce0a64de.tar.gz",
+        "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
+    },
     "niv": {
         "branch": "master",
         "description": "Easy dependency management for Nix projects",
-- 
cgit v1.2.3
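
A minimal consumption sketch, assuming a `pkgs` set that already has the
Biz/Bild overlay above applied. The exact Bild plumbing is not part of this
diff, so the `pkgs` argument and the shell itself are assumptions; only the
attribute names `llama-cpp` and `python3.pkgs.exllama` come from the patch.

    # shell.nix sketch: pulls in the two new attributes, assuming `pkgs`
    # already carries the overlay from Biz/Bild/Deps.nix (not shown here).
    { pkgs }:

    pkgs.mkShell {
      packages = [
        # llama.cpp built with cuBLAS and MPI; installs bin/llama and
        # bin/llama-server after the postInstall renames above
        pkgs.llama-cpp
        # Python interpreter with the exllama extension compiled against
        # cudaPackages_11_7, as wired up by the python3 packageOverrides
        (pkgs.python3.withPackages (ps: [ ps.exllama ]))
      ];
    }

Because the derivation sets meta.mainProgram = "llama", invoking the
llama-cpp attribute with `nix run` would start the renamed main binary rather
than upstream's `main`.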
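
TORCH_CUDA_ARCH_LIST above is pinned to Ampere (8.0/8.6), so building the
exllama extension for a different GPU generation means overriding that
attribute. A sketch, where 8.9 targets Ada-generation (RTX 40-series) cards
and the attribute path assumes the overlay wiring above:

    # Sketch: retarget the exllama CUDA extension by overriding the pinned
    # arch list (8.9 = Ada); overridePythonAttrs is the standard nixpkgs
    # override hook for buildPythonPackage derivations.
    pkgs.python3.pkgs.exllama.overridePythonAttrs (_old: {
      TORCH_CUDA_ARCH_LIST = "8.9+PTX";
    })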