1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
|
# Python package derivation for exllama, built from the source tree supplied
# as `sources.exllama`. The package's CUDA extension is compiled at build time
# through torch's extension builder, hence the CUDA/gcc tooling below.
#
# Arguments:
#   sources      - attribute set providing `exllama` (used as `src`; its `rev`
#                  becomes the package version). Presumably a pinned-sources
#                  set — confirm against the caller.
#   cudaPackages - CUDA package set; nvcc, cudart and cudatoolkit are all taken
#                  from it so their versions agree.
#   Everything else is an ordinary nixpkgs / python package-set dependency.
{ lib, sources, buildPythonPackage, pythonOlder
, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
, safetensors, sentencepiece, ninja, cudaPackages, addOpenGLRunpath, which
, libGL, gcc11 # nvcc from CUDA 11.x (11.7/11.8) requires a host g++ < 12
}:

buildPythonPackage {
  pname = "exllama";
  version = sources.exllama.rev;
  format = "setuptools";
  disabled = pythonOlder "3.9";

  src = sources.exllama;

  # I only care about compiling for the Ampere architecture, which is what my
  # RTX 3090 Ti is. For some reason (nix sandbox?) the torch extension builder
  # cannot autodetect the arch, so it is pinned explicitly here.
  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
  CUDA_HOME = "${cudaPackages.cuda_nvcc}";

  # Tools that run on the build machine.
  nativeBuildInputs = [
    gcc11
    which
    # NOTE(review): this only provides the hook; nothing visible in this
    # expression calls it (no postFixup) — confirm whether it is still needed.
    addOpenGLRunpath
    cudaPackages.cuda_nvcc
  ];

  # Libraries for the host platform. These were previously listed in
  # nativeBuildInputs, where their headers and libraries are not exposed to
  # the compiler wrapper when strictDeps is in effect.
  buildInputs = [
    libGL
    cudaPackages.cuda_cudart
  ];

  propagatedBuildInputs =
    [ torch safetensors sentencepiece ninja cudaPackages.cudatoolkit ];

  doCheck = false; # no tests currently

  # Import smoke test, run in place of a real test suite.
  pythonImportsCheck = [
    "exllama"
    "exllama.cuda_ext"
    "exllama.generator"
    "exllama.lora"
    "exllama.model"
    "exllama.tokenizer"
  ];

  meta = {
    # Kept to a single line without article or trailing period, per the
    # nixpkgs meta conventions; the full sentence lives in longDescription.
    description =
      "Memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights";
    longDescription = ''
      A more memory-efficient rewrite of the HF transformers implementation of
      Llama for use with quantized weights.
    '';
    homepage = "https://github.com/jllllll/exllama";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ bsima ];
  };
}
|