{ lib
, sources
, buildPythonPackage
, pythonOlder
, fetchFromGitHub
, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
, safetensors
, sentencepiece
, ninja
, cudaPackages
, addOpenGLRunpath
, which
, gcc11 # cuda 11.7 requires g++ <12
}:

buildPythonPackage rec {
  pname = "exllama";
  version = sources.exllama.rev;
  format = "setuptools";
  disabled = pythonOlder "3.9";

  src = sources.exllama;

  # I only care about compiling for the Ampere architecture, which is what my
  # RTX 3090 Ti is, and for some reason (nix sandbox?) the torch extension
  # builder cannot autodetect the arch.
  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";

  CUDA_HOME = "${cudaPackages.cuda_nvcc}";

  nativeBuildInputs = [
    gcc11
    which
    addOpenGLRunpath
    cudaPackages.cuda_nvcc
    cudaPackages.cuda_cudart
  ];

  propagatedBuildInputs = [
    torch
    safetensors
    sentencepiece
    ninja
    cudaPackages.cudatoolkit
  ];

  doCheck = false; # no tests currently

  pythonImportsCheck = [
    "exllama"
    "exllama.cuda_ext"
    "exllama.generator"
    "exllama.lora"
    "exllama.model"
    "exllama.tokenizer"
  ];

  meta = with lib; {
    description = ''
      A more memory-efficient rewrite of the HF transformers implementation of
      Llama for use with quantized weights.
    '';
    homepage = "https://github.com/jllllll/exllama";
    license = licenses.mit;
    maintainers = with maintainers; [ bsima ];
  };
}
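
# A minimal usage sketch, kept as a comment so this file remains a plain
# package expression. It assumes this file lives at ./exllama.nix and that
# `sources` comes from a niv/npins-style pin set; the paths and the
# `allowUnfree` setting below are assumptions for illustration, not part of
# this derivation:
#
#   let
#     sources = import ./nix/sources.nix;
#     pkgs = import sources.nixpkgs { config.allowUnfree = true; };
#     exllama = pkgs.python3Packages.callPackage ./exllama.nix { inherit sources; };
#   in
#   pkgs.python3.withPackages (ps: [ exllama ])
#
# Because TORCH_CUDA_ARCH_LIST is pinned to Ampere above, consumers with a
# different GPU generation could override it without editing this file, e.g.:
#
#   exllama.overridePythonAttrs (_: { TORCH_CUDA_ARCH_LIST = "7.5+PTX"; })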