author    Ben Sima <ben@bsima.me>  2024-12-02 07:52:56 -0500
committer Ben Sima <ben@bsima.me>  2024-12-20 21:09:24 -0500
commit    18956baad9cde85bc55bd0113092e51786d9e1ee (patch)
tree      07e987380a05db04c4814f4ebbc1f5780817899c /Biz
parent    bc78e72960dee2721c3648e8061cb543f775710b (diff)
Delete Biz/Mynion.py and exllama
Mynion was a prototype; while it was cool and worked well, it is now unused and it causes magma to be built, which takes forever. I have settled on ollama for local inference and a hosted inference API for production.
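For reference, the replacement workflow is ollama's local HTTP API rather than the in-process exllama stack removed below. The sketch that follows is not part of this commit: it assumes an ollama daemon listening on the default port 11434 and a locally pulled model whose tag (here "llama3") is only a placeholder.

"""Minimal sketch (not part of this commit): local inference via the ollama HTTP API."""

import json
import urllib.request

OLLAMA_URL = "http://localhost:11434/api/generate"  # ollama's default generate endpoint


def generate(prompt: str, model: str = "llama3") -> str:
    """Send one non-streaming generation request to a local ollama daemon."""
    # "llama3" is a placeholder tag; use whatever model has been pulled locally.
    payload = json.dumps({"model": model, "prompt": prompt, "stream": False})
    req = urllib.request.Request(
        OLLAMA_URL,
        data=payload.encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        # With stream=False the daemon returns one JSON object whose
        # "response" field holds the generated text.
        return json.loads(resp.read())["response"]


if __name__ == "__main__":
    print(generate("Say hello in one short sentence."))

A hosted inference API would be called the same way, with a different base URL and an authentication header.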
Diffstat (limited to 'Biz')
-rw-r--r--  Biz/Bild/Deps/exllama.nix   54
-rw-r--r--  Biz/Bild/Python.nix          2
-rw-r--r--  Biz/Bild/Sources.json       12
-rw-r--r--  Biz/Mynion.py              265
4 files changed, 0 insertions, 333 deletions
diff --git a/Biz/Bild/Deps/exllama.nix b/Biz/Bild/Deps/exllama.nix
deleted file mode 100644
index 434e9a9..0000000
--- a/Biz/Bild/Deps/exllama.nix
+++ /dev/null
@@ -1,54 +0,0 @@
-{ lib, sources, buildPythonPackage, pythonOlder
-, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
-, safetensors, sentencepiece, ninja, cudaPackages, addOpenGLRunpath, which
-, libGL, gcc11 # cuda 11.7 requires g++ <12
-}:
-
-buildPythonPackage rec {
- pname = "exllama";
- version = sources.exllama.rev;
- format = "setuptools";
- disabled = pythonOlder "3.9";
-
- src = sources.exllama;
-
- # I only care about compiling for the Ampere architecture, which is what my
- # RTX 3090 TI is, and for some reason (nix sandbox?) the torch extension
- # builder cannot autodetect the arch.
- TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
-
- CUDA_HOME = "${cudaPackages.cuda_nvcc}";
-
- nativeBuildInputs = [
- gcc11
- which
- libGL
- addOpenGLRunpath
- cudaPackages.cuda_nvcc
- cudaPackages.cuda_cudart
- ];
-
- propagatedBuildInputs =
- [ torch safetensors sentencepiece ninja cudaPackages.cudatoolkit ];
-
- doCheck = false; # no tests currently
- pythonImportsCheck = [
- "exllama"
- "exllama.cuda_ext"
- "exllama.generator"
- "exllama.lora"
- "exllama.model"
- "exllama.tokenizer"
- ];
-
- meta = with lib; {
- description = ''
- A more memory-efficient rewrite of the HF transformers implementation of
- Llama for use with quantized weights.
- '';
- homepage = "https://github.com/jllllll/exllama";
- license = licenses.mit;
- maintainers = with maintainers; [ bsima ];
- };
-}
diff --git a/Biz/Bild/Python.nix b/Biz/Bild/Python.nix
index 2385987..50a1779 100644
--- a/Biz/Bild/Python.nix
+++ b/Biz/Bild/Python.nix
@@ -4,8 +4,6 @@ _self: super: {
with pysuper.pkgs.python3Packages;
let dontCheck = p: p.overridePythonAttrs (_: { doCheck = false; });
in {
- exllama = callPackage ./Deps/exllama.nix { };
- exllamav2 = callPackage ./Deps/exllamav2.nix { };
interegular = callPackage ./Deps/interegular.nix { };
llm-ollama = callPackage ./Deps/llm-ollama.nix { };
mypy = dontCheck pysuper.mypy;
diff --git a/Biz/Bild/Sources.json b/Biz/Bild/Sources.json
index 3b1e4fd..c12b6ce 100644
--- a/Biz/Bild/Sources.json
+++ b/Biz/Bild/Sources.json
@@ -25,18 +25,6 @@
"url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz",
"version": "0.7.0.8"
},
- "exllama": {
- "branch": "master",
- "description": "A more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights.",
- "homepage": null,
- "owner": "jllllll",
- "repo": "exllama",
- "rev": "3ddf3bd39bdff330623f3740cda4ae1537ef86d9",
- "sha256": "0g87xm71jmw5bl4ya5dbk72fghhhwvrjqspaayq7zass16jixr1d",
- "type": "tarball",
- "url": "https://github.com/jllllll/exllama/archive/3ddf3bd39bdff330623f3740cda4ae1537ef86d9.tar.gz",
- "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
- },
"ghc-exactprint": {
"branch": "master",
"description": "GHC version of haskell-src-exts exactPrint",
diff --git a/Biz/Mynion.py b/Biz/Mynion.py
deleted file mode 100644
index 83d427b..0000000
--- a/Biz/Mynion.py
+++ /dev/null
@@ -1,265 +0,0 @@
-"""Mynion is a helper."""
-
-# : out mynion
-# : dep exllama
-# : dep slixmpp
-import argparse
-import Biz.Log
-import dataclasses
-import exllama # type: ignore[import]
-import logging
-import os
-import pathlib
-import slixmpp
-import slixmpp.exceptions
-import sys
-import torch
-import typing
-
-
-def smoosh(s: str) -> str:
-    """Replace newlines with spaces."""
-    return s.replace("\n", " ")
-
-
-@dataclasses.dataclass
-class Auth:
-    """Container for XMPP authentication."""
-
-    jid: str
-    password: str
-
-
-class Mynion(slixmpp.ClientXMPP):
-    """A helper via xmpp."""
-
-    def __init__(
-        self: "Mynion",
-        auth: Auth,
-        model: exllama.model.ExLlama,
-        tokenizer: exllama.tokenizer.ExLlamaTokenizer,
-        generator: exllama.generator.ExLlamaGenerator,
-    ) -> None:
-        """Initialize Mynion chat bot service."""
-        slixmpp.ClientXMPP.__init__(self, auth.jid, auth.password)
-        self.plugin.enable("xep_0085") # type: ignore[attr-defined]
-        self.plugin.enable("xep_0184") # type: ignore[attr-defined]
-
-        self.name = "mynion"
-        self.user = "ben"
-        self.first_round = True
-        self.min_response_tokens = 4
-        self.max_response_tokens = 256
-        self.extra_prune = 256
-        self.max_seq_len = 8000
-
-        self.model = model
-        self.tokenizer = tokenizer
-        self.generator = generator
-
-        root = os.getenv("CODEROOT", "")
-        # this should be parameterized somehow
-        promptfile = pathlib.Path(root) / "Biz" / "Mynion" / "Prompt.md"
-        txt = promptfile.read_text().format(user=self.user, name=self.name)
-
-        # this is the "system prompt", ideally i would load this in/out of a
-        # database with all of the past history. if the history gets too long, i
-        # can roll it up by asking llama to summarize it
-        self.past = smoosh(txt)
-
-        ids = tokenizer.encode(self.past)
-        self.generator.gen_begin(ids)
-
-        self.add_event_handler("session_start", self.session_start)
-        self.add_event_handler("message", self.message)
-
-    def session_start(self: "Mynion") -> None:
-        """Start online session with xmpp server."""
-        self.send_presence()
-        try:
-            self.get_roster() # type: ignore[no-untyped-call]
-        except slixmpp.exceptions.IqError as err:
-            logging.exception("There was an error getting the roster")
-            logging.exception(err.iq["error"]["condition"])
-            self.disconnect()
-        except slixmpp.exceptions.IqTimeout:
-            logging.exception("Server is taking too long to respond")
-            self.disconnect()
-
-    def message(self: "Mynion", msg: slixmpp.Message) -> None:
-        """Send a message."""
-        if msg["type"] in {"chat", "normal"}:
-            res_line = f"{self.name}: "
-            res_tokens = self.tokenizer.encode(res_line)
-            num_res_tokens = res_tokens.shape[-1]
-
-            if self.first_round:
-                in_tokens = res_tokens
-            else:
-                # read and format input
-                in_line = f"{self.user}: " + msg["body"].strip() + "\n"
-                in_tokens = self.tokenizer.encode(in_line)
-                in_tokens = torch.cat((in_tokens, res_tokens), dim=1)
-
-            # If we're approaching the context limit, prune some whole lines
-            # from the start of the context. Also prune a little extra so we
-            # don't end up rebuilding the cache on every line when up against
-            # the limit.
-            expect_tokens = in_tokens.shape[-1] + self.max_response_tokens
-            max_tokens = self.max_seq_len - expect_tokens
-            if self.generator.gen_num_tokens() >= max_tokens:
-                self.generator.gen_prune_to(
-                    self.max_seq_len - expect_tokens - self.extra_prune,
-                    self.tokenizer.newline_token_id,
-                )
-
-            # feed in the user input and "{self.name}:", tokenized
-            self.generator.gen_feed_tokens(in_tokens)
-
-            # start beam search?
-            self.generator.begin_beam_search()
-
-            # generate tokens, with streaming
-            for i in range(self.max_response_tokens):
-                # disallowing the end condition tokens seems like a clean way to
-                # force longer replies
-                if i < self.min_response_tokens:
-                    self.generator.disallow_tokens(
-                        [
-                            self.tokenizer.newline_token_id,
-                            self.tokenizer.eos_token_id,
-                        ],
-                    )
-                else:
-                    self.generator.disallow_tokens(None)
-
-                # get a token
-                gen_token = self.generator.beam_search()
-
-                # if token is EOS, replace it with a newline before continuing
-                if gen_token.item() == self.tokenizer.eos_token_id:
-                    self.generator.replace_last_token(
-                        self.tokenizer.newline_token_id,
-                    )
-
-                # decode the current line
-                num_res_tokens += 1
-                text = self.tokenizer.decode(
-                    self.generator.sequence_actual[:, -num_res_tokens:][0],
-                )
-
-                # append to res_line
-                res_line += text[len(res_line) :]
-
-                # end conditions
-                breakers = [
-                    self.tokenizer.eos_token_id,
-                    # self.tokenizer.newline_token_id,
-                ]
-                if gen_token.item() in breakers:
-                    break
-
-                # try to drop the "ben:" at the end
-                if res_line.endswith(f"{self.user}:"):
-                    logging.info("rewinding!")
-                    plen = self.tokenizer.encode(f"{self.user}:").shape[-1]
-                    self.generator.gen_rewind(plen)
-                    break
-
-            # end generation and send the reply
-            self.generator.end_beam_search()
-            res_line = res_line.removeprefix(f"{self.name}:")
-            res_line = res_line.removesuffix(f"{self.user}:")
-            self.first_round = False
-            msg.reply(res_line).send() # type: ignore[no-untyped-call]
-
-
-MY_MODELS = [
-    "Llama-2-13B-GPTQ",
-    "Nous-Hermes-13B-GPTQ",
-    "Nous-Hermes-Llama2-13b-GPTQ",
-    "Wizard-Vicuna-13B-Uncensored-GPTQ",
-    "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ",
-    "Wizard-Vicuna-30B-Uncensored-GPTQ",
-    "CodeLlama-13B-Python-GPTQ",
-    "CodeLlama-13B-Instruct-GPTQ",
-    "CodeLlama-34B-Instruct-GPTQ",
-]
-
-
-def load_model(model_name: str) -> typing.Any:
-    """Load an ML model from disk."""
-    if model_name not in MY_MODELS:
-        msg = f"{model_name} not available"
-        raise ValueError(msg)
-    if not torch.cuda.is_available():
-        msg = "no cuda"
-        raise ValueError(msg)
-        sys.exit(1)
-
-    torch.set_grad_enabled(mode=False)
-    torch.cuda.init() # type: ignore[no-untyped-call]
-
-    ml_models = "/mnt/campbell/ben/ml-models"
-
-    model_dir = pathlib.Path(ml_models) / model_name
-
-    tokenizer_path = pathlib.Path(model_dir) / "tokenizer.model"
-    config_path = pathlib.Path(model_dir) / "config.json"
-    st = list(pathlib.Path(model_dir).glob("*.safetensors"))
-    if len(st) > 1:
-        msg = "found multiple safetensors!"
-        raise ValueError(msg)
-    if len(st) < 1:
-        msg = "could not find model"
-        raise ValueError(msg)
-    model_path = st[0]
-
-    config = exllama.model.ExLlamaConfig(config_path)
-    config.model_path = model_path
-
-    # gpu split
-    config.set_auto_map("23")
-
-    model = exllama.model.ExLlama(config)
-    cache = exllama.model.ExLlamaCache(model)
-    tokenizer = exllama.tokenizer.ExLlamaTokenizer(tokenizer_path)
-
-    generator = exllama.generator.ExLlamaGenerator(model, tokenizer, cache)
-    generator.settings = exllama.generator.ExLlamaGenerator.Settings()
-
-    return (model, tokenizer, generator)
-
-
-def main(
-    model: exllama.model.ExLlama,
-    tokenizer: exllama.tokenizer.ExLlamaTokenizer,
-    generator: exllama.generator.ExLlamaGenerator,
-    user: str,
-    password: str,
-) -> None:
-    """
-    Start the chatbot.
-
-    This purposefully does not call 'load_model()' so that you can load the
-    model in the repl and then restart the chatbot without unloading it.
-    """
-    Biz.Log.setup()
-    auth = Auth(user, password)
-    xmpp = Mynion(auth, model, tokenizer, generator)
-    xmpp.connect()
-    xmpp.process(forever=True) # type: ignore[no-untyped-call]
-
-
-if __name__ == "__main__":
-    if "test" in sys.argv:
-        sys.stdout.write("pass: test: Biz/Mynion.py\n")
-        sys.exit(0)
-    else:
-        cli = argparse.ArgumentParser(description=__doc__)
-        cli.add_argument("-u", "--user")
-        cli.add_argument("-p", "--password")
-        cli.add_argument("-m", "--model", choices=MY_MODELS)
-        args = cli.parse_args()
-        model, tokenizer, generator = load_model(args.model)
-        main(model, tokenizer, generator, args.user, args.password)