{ lib
, sources
, buildPythonPackage
, pythonOlder
, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
, safetensors
, sentencepiece
, ninja
, cudaPackages
, addOpenGLRunpath
, which
, gcc11 # cuda 11.7 requires g++ <12
}:

buildPythonPackage rec {
  pname = "exllama";
  version = sources.exllama.rev;
  format = "setuptools";
  disabled = pythonOlder "3.9";

  src = sources.exllama;

  # I only care about compiling for the Ampere architecture, which is what my
  # RTX 3090 Ti is, and for some reason (nix sandbox?) the torch extension
  # builder cannot autodetect the arch
  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";

  CUDA_HOME = "${cudaPackages.cuda_nvcc}";

  nativeBuildInputs = [
    gcc11
    which
    addOpenGLRunpath
    cudaPackages.cuda_nvcc
    cudaPackages.cuda_cudart
  ];

  propagatedBuildInputs = [
    torch
    safetensors
    sentencepiece
    ninja
    cudaPackages.cudatoolkit
  ];

  doCheck = false; # no tests currently

  pythonImportsCheck = [
    "exllama"
    "exllama.cuda_ext"
    "exllama.generator"
    "exllama.lora"
    "exllama.model"
    "exllama.tokenizer"
  ];

  meta = with lib; {
    description = ''
      A more memory-efficient rewrite of the HF transformers implementation of
      Llama for use with quantized weights.
    '';
    homepage = "https://github.com/jllllll/exllama";
    license = licenses.mit;
    maintainers = with maintainers; [ bsima ];
  };
}
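
# Usage sketch (an assumption, not part of the derivation itself): with the
# non-standard `sources` argument supplied by a pin tool such as niv or npins,
# this file could be instantiated via callPackage, e.g.
#
#   exllama = python3Packages.callPackage ./exllama.nix { inherit sources; };
#
# and then pulled into an environment with
# `python3.withPackages (ps: [ exllama ])`. The remaining arguments
# (cudaPackages, addOpenGLRunpath, which, gcc11, ...) resolve from the
# surrounding package set.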