From 18956baad9cde85bc55bd0113092e51786d9e1ee Mon Sep 17 00:00:00 2001
From: Ben Sima
Date: Mon, 2 Dec 2024 07:52:56 -0500
Subject: Delete Biz/Mynion.py and exllama

Mynion was a prototype, and while it was cool and worked well, it is
unused and it forces magma to build, which takes forever. I have
settled on using ollama for local inference and a hosted inference API
for production.
---
 Biz/Bild/Deps/exllama.nix |  54 ----------
 Biz/Bild/Python.nix       |   2 -
 Biz/Bild/Sources.json     |  12 ---
 Biz/Mynion.py             | 265 ----------------------------------------------
 4 files changed, 333 deletions(-)
 delete mode 100644 Biz/Bild/Deps/exllama.nix
 delete mode 100644 Biz/Mynion.py

diff --git a/Biz/Bild/Deps/exllama.nix b/Biz/Bild/Deps/exllama.nix
deleted file mode 100644
index 434e9a9..0000000
--- a/Biz/Bild/Deps/exllama.nix
+++ /dev/null
@@ -1,54 +0,0 @@
-{ lib, sources, buildPythonPackage, pythonOlder
-, torch # tested on 2.0.1 and 2.1.0 (nightly) with cu118
-, safetensors, sentencepiece, ninja, cudaPackages, addOpenGLRunpath, which
-, libGL, gcc11 # cuda 11.7 requires g++ <12
-}:
-
-buildPythonPackage rec {
-  pname = "exllama";
-  version = sources.exllama.rev;
-  format = "setuptools";
-  disabled = pythonOlder "3.9";
-
-  src = sources.exllama;
-
-  # I only care about compiling for the Ampere architecture, which is what my
-  # RTX 3090 TI is, and for some reason (nix sandbox?) the torch extension
-  # builder
-  # cannot autodetect the arch
-  TORCH_CUDA_ARCH_LIST = "8.0;8.6+PTX";
-
-  CUDA_HOME = "${cudaPackages.cuda_nvcc}";
-
-  nativeBuildInputs = [
-    gcc11
-    which
-    libGL
-    addOpenGLRunpath
-    cudaPackages.cuda_nvcc
-    cudaPackages.cuda_cudart
-  ];
-
-  propagatedBuildInputs =
-    [ torch safetensors sentencepiece ninja cudaPackages.cudatoolkit ];
-
-  doCheck = false; # no tests currently
-  pythonImportsCheck = [
-    "exllama"
-    "exllama.cuda_ext"
-    "exllama.generator"
-    "exllama.lora"
-    "exllama.model"
-    "exllama.tokenizer"
-  ];
-
-  meta = with lib; {
-    description = ''
-      A more memory-efficient rewrite of the HF transformers implementation of
-      Llama for use with quantized weights.
-    '';
-    homepage = "https://github.com/jllllll/exllama";
-    license = licenses.mit;
-    maintainers = with maintainers; [ bsima ];
-  };
-}
diff --git a/Biz/Bild/Python.nix b/Biz/Bild/Python.nix
index 2385987..50a1779 100644
--- a/Biz/Bild/Python.nix
+++ b/Biz/Bild/Python.nix
@@ -4,8 +4,6 @@ _self: super: {
       with pysuper.pkgs.python3Packages;
       let dontCheck = p: p.overridePythonAttrs (_: { doCheck = false; });
       in {
-        exllama = callPackage ./Deps/exllama.nix { };
-        exllamav2 = callPackage ./Deps/exllamav2.nix { };
         interegular = callPackage ./Deps/interegular.nix { };
         llm-ollama = callPackage ./Deps/llm-ollama.nix { };
         mypy = dontCheck pysuper.mypy;
diff --git a/Biz/Bild/Sources.json b/Biz/Bild/Sources.json
index 3b1e4fd..c12b6ce 100644
--- a/Biz/Bild/Sources.json
+++ b/Biz/Bild/Sources.json
@@ -25,18 +25,6 @@
         "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz",
         "version": "0.7.0.8"
     },
-    "exllama": {
-        "branch": "master",
-        "description": "A more memory-efficient rewrite of the HF transformers implementation of Llama for use with quantized weights.",
-        "homepage": null,
-        "owner": "jllllll",
-        "repo": "exllama",
-        "rev": "3ddf3bd39bdff330623f3740cda4ae1537ef86d9",
-        "sha256": "0g87xm71jmw5bl4ya5dbk72fghhhwvrjqspaayq7zass16jixr1d",
-        "type": "tarball",
-        "url": "https://github.com/jllllll/exllama/archive/3ddf3bd39bdff330623f3740cda4ae1537ef86d9.tar.gz",
-        "url_template": "https://github.com/<owner>/<repo>/archive/<rev>.tar.gz"
-    },
     "ghc-exactprint": {
         "branch": "master",
         "description": "GHC version of haskell-src-exts exactPrint",
diff --git a/Biz/Mynion.py b/Biz/Mynion.py
deleted file mode 100644
index 83d427b..0000000
--- a/Biz/Mynion.py
+++ /dev/null
@@ -1,265 +0,0 @@
-"""Mynion is a helper."""
-
-# : out mynion
-# : dep exllama
-# : dep slixmpp
-import argparse
-import Biz.Log
-import dataclasses
-import exllama  # type: ignore[import]
-import logging
-import os
-import pathlib
-import slixmpp
-import slixmpp.exceptions
-import sys
-import torch
-import typing
-
-
-def smoosh(s: str) -> str:
-    """Replace newlines with spaces."""
-    return s.replace("\n", " ")
-
-
-@dataclasses.dataclass
-class Auth:
-    """Container for XMPP authentication."""
-
-    jid: str
-    password: str
-
-
-class Mynion(slixmpp.ClientXMPP):
-    """A helper via xmpp."""
-
-    def __init__(
-        self: "Mynion",
-        auth: Auth,
-        model: exllama.model.ExLlama,
-        tokenizer: exllama.tokenizer.ExLlamaTokenizer,
-        generator: exllama.generator.ExLlamaGenerator,
-    ) -> None:
-        """Initialize Mynion chat bot service."""
-        slixmpp.ClientXMPP.__init__(self, auth.jid, auth.password)
-        self.plugin.enable("xep_0085")  # type: ignore[attr-defined]
-        self.plugin.enable("xep_0184")  # type: ignore[attr-defined]
-
-        self.name = "mynion"
-        self.user = "ben"
-        self.first_round = True
-        self.min_response_tokens = 4
-        self.max_response_tokens = 256
-        self.extra_prune = 256
-        self.max_seq_len = 8000
-
-        self.model = model
-        self.tokenizer = tokenizer
-        self.generator = generator
-
-        root = os.getenv("CODEROOT", "")
-        # this should be parameterized somehow
-        promptfile = pathlib.Path(root) / "Biz" / "Mynion" / "Prompt.md"
-        txt = promptfile.read_text().format(user=self.user, name=self.name)
-
-        # this is the "system prompt", ideally i would load this in/out of a
-        # database with all of the past history. if the history gets too long, i
-        # can roll it up by asking llama to summarize it
-        self.past = smoosh(txt)
-
-        ids = tokenizer.encode(self.past)
-        self.generator.gen_begin(ids)
-
-        self.add_event_handler("session_start", self.session_start)
-        self.add_event_handler("message", self.message)
-
-    def session_start(self: "Mynion") -> None:
-        """Start online session with xmpp server."""
-        self.send_presence()
-        try:
-            self.get_roster()  # type: ignore[no-untyped-call]
-        except slixmpp.exceptions.IqError as err:
-            logging.exception("There was an error getting the roster")
-            logging.exception(err.iq["error"]["condition"])
-            self.disconnect()
-        except slixmpp.exceptions.IqTimeout:
-            logging.exception("Server is taking too long to respond")
-            self.disconnect()
-
-    def message(self: "Mynion", msg: slixmpp.Message) -> None:
-        """Send a message."""
-        if msg["type"] in {"chat", "normal"}:
-            res_line = f"{self.name}: "
-            res_tokens = self.tokenizer.encode(res_line)
-            num_res_tokens = res_tokens.shape[-1]
-
-            if self.first_round:
-                in_tokens = res_tokens
-            else:
-                # read and format input
-                in_line = f"{self.user}: " + msg["body"].strip() + "\n"
-                in_tokens = self.tokenizer.encode(in_line)
-                in_tokens = torch.cat((in_tokens, res_tokens), dim=1)
-
-            # If we're approaching the context limit, prune some whole lines
-            # from the start of the context. Also prune a little extra so we
-            # don't end up rebuilding the cache on every line when up against
-            # the limit.
-            expect_tokens = in_tokens.shape[-1] + self.max_response_tokens
-            max_tokens = self.max_seq_len - expect_tokens
-            if self.generator.gen_num_tokens() >= max_tokens:
-                generator.gen_prune_to(
-                    self.max_seq_len - expect_tokens - self.extra_prune,
-                    self.tokenizer.newline_token_id,
-                )
-
-            # feed in the user input and "{self.name}:", tokenized
-            self.generator.gen_feed_tokens(in_tokens)
-
-            # start beam search?
-            self.generator.begin_beam_search()
-
-            # generate tokens, with streaming
-            for i in range(self.max_response_tokens):
-                # disallowing the end condition tokens seems like a clean way to
-                # force longer replies
-                if i < self.min_response_tokens:
-                    self.generator.disallow_tokens(
-                        [
-                            self.tokenizer.newline_token_id,
-                            self.tokenizer.eos_token_id,
-                        ],
-                    )
-                else:
-                    self.generator.disallow_tokens(None)
-
-                # get a token
-                gen_token = self.generator.beam_search()
-
-                # if token is EOS, replace it with a newline before continuing
-                if gen_token.item() == self.tokenizer.eos_token_id:
-                    self.generator.replace_last_token(
-                        self.tokenizer.newline_token_id,
-                    )
-
-                # decode the current line
-                num_res_tokens += 1
-                text = self.tokenizer.decode(
-                    self.generator.sequence_actual[:, -num_res_tokens:][0],
-                )
-
-                # append to res_line
-                res_line += text[len(res_line) :]
-
-                # end conditions
-                breakers = [
-                    self.tokenizer.eos_token_id,
-                    # self.tokenizer.newline_token_id,
-                ]
-                if gen_token.item() in breakers:
-                    break
-
-                # try to drop the "ben:" at the end
-                if res_line.endswith(f"{self.user}:"):
-                    logging.info("rewinding!")
-                    plen = self.tokenizer.encode(f"{self.user}:").shape[-1]
-                    self.generator.gen_rewind(plen)
-                    break
-
-            # end generation and send the reply
-            self.generator.end_beam_search()
-            res_line = res_line.removeprefix(f"{self.name}:")
-            res_line = res_line.removesuffix(f"{self.user}:")
-            self.first_round = False
-            msg.reply(res_line).send()  # type: ignore[no-untyped-call]
-
-
-MY_MODELS = [
-    "Llama-2-13B-GPTQ",
-    "Nous-Hermes-13B-GPTQ",
-    "Nous-Hermes-Llama2-13b-GPTQ",
-    "Wizard-Vicuna-13B-Uncensored-GPTQ",
-    "Wizard-Vicuna-13B-Uncensored-SuperHOT-8K-GPTQ",
-    "Wizard-Vicuna-30B-Uncensored-GPTQ",
-    "CodeLlama-13B-Python-GPTQ",
-    "CodeLlama-13B-Instruct-GPTQ",
-    "CodeLlama-34B-Instruct-GPTQ",
-]
-
-
-def load_model(model_name: str) -> typing.Any:
-    """Load an ML model from disk."""
-    if model_name not in MY_MODELS:
-        msg = f"{model_name} not available"
-        raise ValueError(msg)
-    if not torch.cuda.is_available():
-        msg = "no cuda"
-        raise ValueError(msg)
-        sys.exit(1)
-
-    torch.set_grad_enabled(mode=False)
-    torch.cuda.init()  # type: ignore[no-untyped-call]
-
-    ml_models = "/mnt/campbell/ben/ml-models"
-
-    model_dir = pathlib.Path(ml_models) / model_name
-
-    tokenizer_path = pathlib.Path(model_dir) / "tokenizer.model"
-    config_path = pathlib.Path(model_dir) / "config.json"
-    st = list(pathlib.Path(model_dir).glob("*.safetensors"))
-    if len(st) > 1:
-        msg = "found multiple safetensors!"
-        raise ValueError(msg)
-    if len(st) < 1:
-        msg = "could not find model"
-        raise ValueError(msg)
-    model_path = st[0]
-
-    config = exllama.model.ExLlamaConfig(config_path)
-    config.model_path = model_path
-
-    # gpu split
-    config.set_auto_map("23")
-
-    model = exllama.model.ExLlama(config)
-    cache = exllama.model.ExLlamaCache(model)
-    tokenizer = exllama.tokenizer.ExLlamaTokenizer(tokenizer_path)
-
-    generator = exllama.generator.ExLlamaGenerator(model, tokenizer, cache)
-    generator.settings = exllama.generator.ExLlamaGenerator.Settings()
-
-    return (model, tokenizer, generator)
-
-
-def main(
-    model: exllama.model.ExLlama,
-    tokenizer: exllama.tokenizer.ExLlamaTokenizer,
-    generator: exllama.generator.ExLlamaGenerator,
-    user: str,
-    password: str,
-) -> None:
-    """
-    Start the chatbot.
-
-    This purposefully does not call 'load_model()' so that you can load the
-    model in the repl and then restart the chatbot without unloading it.
-    """
-    Biz.Log.setup()
-    auth = Auth(user, password)
-    xmpp = Mynion(auth, model, tokenizer, generator)
-    xmpp.connect()
-    xmpp.process(forever=True)  # type: ignore[no-untyped-call]
-
-
-if __name__ == "__main__":
-    if "test" in sys.argv:
-        sys.stdout.write("pass: test: Biz/Mynion.py\n")
-        sys.exit(0)
-    else:
-        cli = argparse.ArgumentParser(description=__doc__)
-        cli.add_argument("-u", "--user")
-        cli.add_argument("-p", "--password")
-        cli.add_argument("-m", "--model", choices=MY_MODELS)
-        args = cli.parse_args()
-        model, tokenizer, generator = load_model(args.model)
-        main(model, tokenizer, generator, args.user, args.password)
--
cgit v1.2.3