Biz/Bild/Deps/exllama.nix


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

# Python package definition for exllama, built with `buildPythonPackage` from
# the pinned `sources.exllama` checkout.
{ lib
, sources
, buildPythonPackage
, pythonOlder
  # torch: tested on 2.0.1 and 2.1.0 (nightly) with cu118
, torch
, safetensors
, sentencepiece
, ninja
, cudaPackages
, addOpenGLRunpath
, which
, libGL
  # gcc11: cuda 11.7 requires g++ <12
, gcc11
}:

let
  # The individual CUDA components this package refers to.
  inherit (cudaPackages) cuda_nvcc cuda_cudart cudatoolkit;
in
buildPythonPackage {
  pname = "exllama";
  # The version string is the revision of the pinned source.
  version = sources.exllama.rev;
  src = sources.exllama;

  format = "setuptools";
  disabled = pythonOlder "3.9";

  # The torch extension builder cannot autodetect the GPU architecture during
  # the build (possibly because of the nix sandbox), so it is set explicitly.
  # Only Ampere is targeted, since that is what the author's RTX 3090 TI is.
  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";

  # Point CUDA_HOME at the nvcc package's store path.
  CUDA_HOME = "${cuda_nvcc}";

  nativeBuildInputs = [
    gcc11
    which
    libGL
    addOpenGLRunpath
    cuda_nvcc
    cuda_cudart
  ];

  propagatedBuildInputs = [
    torch
    safetensors
    sentencepiece
    ninja
    cudatoolkit
  ];

  # There are currently no tests to run, so only check that the modules import.
  doCheck = false;
  pythonImportsCheck = [
    "exllama"
    "exllama.cuda_ext"
    "exllama.generator"
    "exllama.lora"
    "exllama.model"
    "exllama.tokenizer"
  ];

  meta = {
    description = ''
      A more memory-efficient rewrite of the HF transformers implementation of
      Llama for use with quantized weights.
    '';
    homepage = "https://github.com/jllllll/exllama";
    license = lib.licenses.mit;
    maintainers = [ lib.maintainers.bsima ];
  };
}