1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
# Python package derivation for exllama, built from `sources.exllama`.
#
# Arguments are the usual callPackage-style dependencies. `sources` must
# provide an `exllama` attribute usable as `src` and exposing a `rev`, which is
# used as the package version.
{ lib
, sources
, buildPythonPackage
, pythonOlder
, fetchFromGitHub # not referenced below; kept so callers that pass it keep working
, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
, safetensors
, sentencepiece
, ninja
, cudaPackages
, addOpenGLRunpath
, which
, gcc11 # cuda 11.7 requires g++ <12
}:

buildPythonPackage rec {
  pname = "exllama";
  # The version string is the source revision rather than a release number.
  version = sources.exllama.rev;
  format = "setuptools";
  disabled = pythonOlder "3.9";

  src = sources.exllama;

  # I only care about compiling for the Ampere architecture, which is what my
  # RTX 3090 TI is, and for some reason (nix sandbox?) the torch extension
  # builder cannot autodetect the arch, so the target list is set explicitly.
  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
  # Point the build at the nvcc package as the CUDA installation root.
  CUDA_HOME = "${cudaPackages.cuda_nvcc}";

  # NOTE(review): addOpenGLRunpath is only listed as an input here; no fixup
  # phase invoking it is visible in this expression -- confirm whether one is
  # needed.
  nativeBuildInputs = [
    gcc11
    which
    addOpenGLRunpath
    cudaPackages.cuda_nvcc
    cudaPackages.cuda_cudart
  ];

  propagatedBuildInputs = [
    torch
    safetensors
    sentencepiece
    ninja
    cudaPackages.cudatoolkit
  ];

  doCheck = false; # no tests currently

  # Import smoke test run in place of a test suite.
  pythonImportsCheck = [
    "exllama"
    "exllama.cuda_ext"
    "exllama.generator"
    "exllama.lora"
    "exllama.model"
    "exllama.tokenizer"
  ];

  meta = {
    # One-line summary: no leading article, no trailing period, no newlines.
    description = "Memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights";
    longDescription = ''
      A more memory-efficient rewrite of the HF transformers implementation of
      Llama for use with quantized weights.
    '';
    homepage = "https://github.com/jllllll/exllama";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ bsima ];
  };
}
|