add macos support

This commit is contained in:
Yu Li
2023-12-25 16:17:48 -06:00
parent b4229e4534
commit 44ef329d41
12 changed files with 4891 additions and 44 deletions

View File

@@ -6,6 +6,10 @@ AirLLM优化inference内存4GB单卡GPU可以运行70B大语言模型推理
## Updates
[2023/12/25] v2.8: Support MacOS running 70B large language models.
支持苹果系统运行70B大模型
[2023/12/20] v2.7: Support AirLLMMixtral.
[2023/12/20] v2.6: Added AutoModel, automatically detect model type, no need to provide model class to initialize model.
@@ -30,7 +34,16 @@ airllm2.0。支持模型压缩速度提升3倍。
airllm发布。
## Table of Contents
* [Quick start](#quickstart)
* [Model Compression](#3-model-compression---3x-inference-speed-up)
* [Configurations](#configurations)
* [Run on MacOS](#macos)
* [Example notebooks](#example-python-notebook)
* [Supported Models](#supported-models)
* [Acknowledgement](#acknowledgement)
* [FAQ](#faq)
## Quickstart
@@ -99,7 +112,7 @@ Note: During inference, the original model will first be decomposed and saved la
注意推理过程会首先将原始模型按层分拆转存。请保证huggingface cache目录有足够的磁盘空间。
### 3. Model Compression - 3x Inference Speed Up!
## Model Compression - 3x Inference Speed Up!
We just added model compression based on block-wise quantization based model compression. Which can further **speed up the inference speed** for up to **3x** , with **almost ignorable accuracy loss!** (see more performance evaluation and why we use block-wise quantization in [this paper](https://arxiv.org/abs/2212.09720))
@@ -125,7 +138,7 @@ Quantization normally needs to quantize both weights and activations to really s
While in our case the bottleneck is mainly at the disk loading, we only need to make the model loading size smaller. So we get to only quantize the weights part, which is easier to ensure the accuracy.
### 4. Configurations
## Configurations
When initialize the model, we support the following configurations:
@@ -137,7 +150,18 @@ When initialize the model, we support the following configurations:
* **hf_token**: huggingface token can be provided here if downloading gated models like: *meta-llama/Llama-2-7b-hf*
* **prefetching**: prefetching to overlap the model loading and compute. By default turned on. For now only AirLLMLlama2 supports this.
### 5. Example Python Notebook
## MacOS
Just install airllm and run the code the same as on Linux. See more in [Quick Start](#quickstart).
* make sure you installed [mlx](https://github.com/ml-explore/mlx?tab=readme-ov-file#installation) and torch
* you probably need to install a native (arm64) build of Python; see more [here](https://stackoverflow.com/a/65432861/21230266)
* only [Apple silicon](https://support.apple.com/en-us/HT211814) is supported
Example [python notebook](https://github.com/lyogavin/Anima/blob/main/air_llm/examples/run_on_macos.ipynb)
## Example Python Notebook
Example colabs here:
@@ -145,7 +169,7 @@ Example colabs here:
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
### 6. Supported Models
## Supported Models
#### [HF open llm leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) top models
@@ -267,7 +291,7 @@ model.tokenizer.decode(generation_output.sequences[0])
## 7. Acknowledgement
## Acknowledgement
A lot of the code are based on SimJeg's great work in the Kaggle exam competition. Big shoutout to SimJeg:
@@ -276,7 +300,7 @@ A lot of the code are based on SimJeg's great work in the Kaggle exam competitio
[the associated discussion](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/446414).
## 8. FAQ
## FAQ
### 8.1. MetadataIncompleteBuffer

View File

@@ -1,11 +1,22 @@
from .airllm import AirLLMLlama2
from .airllm_chatglm import AirLLMChatGLM
from .airllm_qwen import AirLLMQWen
from .airllm_baichuan import AirLLMBaichuan
from .airllm_internlm import AirLLMInternLM
from .airllm_mistral import AirLLMMistral
from .airllm_mixtral import AirLLMMixtral
from .airllm_base import AirLLMBaseModel
from .auto_model import AutoModel
from .utils import split_and_save_layers
from .utils import NotEnoughSpaceException
from sys import platform
is_on_mac_os = False
if platform == "darwin":
is_on_mac_os = True
if is_on_mac_os:
from .airllm_llama_mlx import AirLLMLlamaMlx
else:
from .airllm import AirLLMLlama2
from .airllm_chatglm import AirLLMChatGLM
from .airllm_qwen import AirLLMQWen
from .airllm_baichuan import AirLLMBaichuan
from .airllm_internlm import AirLLMInternLM
from .airllm_mistral import AirLLMMistral
from .airllm_mixtral import AirLLMMixtral
from .airllm_base import AirLLMBaseModel
from .auto_model import AutoModel
from .utils import split_and_save_layers
from .utils import NotEnoughSpaceException

View File

@@ -155,7 +155,12 @@ class AirLLMBaseModel(GenerationMixin):
# if derived class needs to create generation config differently, like Mistrial, this function can be overridden
def get_generation_config(self):
return GenerationConfig.from_pretrained(self.model_local_path)
# protective on generation config
try:
return GenerationConfig.from_pretrained(self.model_local_path)
except Exception as e:
return GenerationConfig()
# a chance to customize tokenizer
def get_tokenizer(self, hf_token=None):
@@ -414,6 +419,7 @@ class AirLLMBaseModel(GenerationMixin):
for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.running_device,
desc='running layers:',
total=len(self.layers)):
if self.prefetching:

View File

@@ -0,0 +1,431 @@
import argparse
import json
import time
import gc
from tqdm import tqdm
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
import mlx.core as mx
import mlx.nn as nn
from sentencepiece import SentencePieceProcessor
from .persist import ModelPersister
import psutil
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel, GenerationMixin, LlamaForCausalLM, GenerationConfig
from .utils import clean_memory, load_layer, \
find_or_create_local_splitted_path
@dataclass
class ModelArgs:
dim: int
n_layers: int
head_dim: int
hidden_dim: int
n_heads: int
n_kv_heads: int
norm_eps: float
vocab_size: int
rope_theta: float
rope_traditional: bool = True
def sanitize_config(config, weights=None):
config.pop("model_type", None)
n_heads = config["n_heads"] if 'n_heads' in config else config['num_attention_heads']
if "n_kv_heads" not in config:
config["n_kv_heads"] = n_heads
if "head_dim" not in config:
config["head_dim"] = config["dim"] // n_heads
#if "hidden_dim" not in config:
# config["hidden_dim"] = weights["layers.0.feed_forward.w1.weight"].shape[0]
#if config.get("vocab_size", -1) < 0:
# config["vocab_size"] = weights["output.weight"].shape[-1]
if "rope_theta" not in config:
config["rope_theta"] = 10000
unused = ["multiple_of", "ffn_dim_multiplier"]
for k in unused:
config.pop(k, None)
return config
def get_model_args_from_config(config):
params = {}
params["dim"] = config.hidden_size
params["hidden_dim"] = config.intermediate_size
params["n_heads"] = config.num_attention_heads
if hasattr(config, "num_key_value_heads"):
params["n_kv_heads"] = config.num_key_value_heads
params["n_layers"] = config.num_hidden_layers
params["vocab_size"] = config.vocab_size
params["norm_eps"] = config.rms_norm_eps
params["rope_traditional"] = False
sconfig = sanitize_config(params)
# quantization = config.pop("quantization", None)
model_args = ModelArgs(**sconfig)
return model_args
class RMSNorm(nn.Module):
def __init__(self, dims: int, eps: float = 1e-5):
super().__init__()
self.weight = mx.ones((dims,))
self.eps = eps
def _norm(self, x):
return x * mx.rsqrt(x.square().mean(-1, keepdims=True) + self.eps)
def __call__(self, x):
output = self._norm(x.astype(mx.float32)).astype(x.dtype)
return self.weight * output
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.args = args
self.n_heads: int = args.n_heads
self.n_kv_heads: int = args.n_kv_heads
self.repeats = self.n_heads // self.n_kv_heads
self.scale = self.args.head_dim**-0.5
self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=False)
self.wk = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
self.wv = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=False)
self.rope = nn.RoPE(
args.head_dim, traditional=args.rope_traditional, base=args.rope_theta
)
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
B, L, D = x.shape
queries, keys, values = self.wq(x), self.wk(x), self.wv(x)
# Prepare the queries, keys and values for the attention computation
queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
def repeat(a):
a = mx.concatenate([mx.expand_dims(a, 2)] * self.repeats, axis=2)
return a.reshape([B, self.n_heads, L, -1])
keys, values = map(repeat, (keys, values))
if cache is not None:
key_cache, value_cache = cache
queries = self.rope(queries, offset=key_cache.shape[2])
keys = self.rope(keys, offset=key_cache.shape[2])
keys = mx.concatenate([key_cache, keys], axis=2)
values = mx.concatenate([value_cache, values], axis=2)
else:
queries = self.rope(queries)
keys = self.rope(keys)
scores = (queries * self.scale) @ keys.transpose(0, 1, 3, 2)
if mask is not None:
scores += mask
scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
output = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)
return self.wo(output), (keys, values)
class FeedForward(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.w1 = nn.Linear(args.dim, args.hidden_dim, bias=False)
self.w2 = nn.Linear(args.hidden_dim, args.dim, bias=False)
self.w3 = nn.Linear(args.dim, args.hidden_dim, bias=False)
def __call__(self, x) -> mx.array:
return self.w2(nn.silu(self.w1(x)) * self.w3(x))
class TransformerBlock(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.n_heads = args.n_heads
self.dim = args.dim
self.attention = Attention(args)
self.feed_forward = FeedForward(args=args)
self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
self.args = args
def __call__(
self,
x: mx.array,
mask: Optional[mx.array] = None,
cache: Optional[Tuple[mx.array, mx.array]] = None,
) -> mx.array:
r, cache = self.attention(self.attention_norm(x), mask, cache)
h = x + r
r = self.feed_forward(self.ffn_norm(h))
out = h + r
return out, cache
def sample(logits, temperature=0):
if temperature == 0:
return mx.argmax(logits, axis=-1)
else:
return mx.random.categorical(logits * (1 / temperature))
class AirLLMLlamaMlx:
# customize layer names here
def set_layer_names_dict(self):
self.layer_names_dict = {'embed': 'model.embed_tokens',
'layer_prefix': 'model.layers',
'norm': 'model.norm',
'lm_head': 'lm_head',}
def record_memory(self, msg=None):
if not self.show_memory_util:
return
available = psutil.virtual_memory().available / 1024 / 1024
if self.least_available is None:
self.least_available = available
else:
self.least_available = min(available, self.least_available)
print(f"[{msg}] - available mem: {available:.02}mb, least available:{available:.02}mb")
def __init__(self, model_local_path_or_repo_id, device="cuda:0", dtype=None, max_seq_len=512,
layer_shards_saving_path=None, profiling_mode=False, compression=None,
hf_token=None, prefetching=True, test_nonlayered=False, show_memory_util=False):
self.hf_token = hf_token
self.set_layer_names_dict()
self.test_nonlayered = test_nonlayered
self.show_memory_util = show_memory_util
self.least_available = None
self.model_local_path, self.checkpoint_path = find_or_create_local_splitted_path(model_local_path_or_repo_id,
layer_shards_saving_path,
compression=compression,
layer_names=self.layer_names_dict,
hf_token=hf_token)
if hf_token is not None:
self.config = AutoConfig.from_pretrained(self.model_local_path, token=hf_token, trust_remote_code=True)
else:
self.config = AutoConfig.from_pretrained(self.model_local_path, trust_remote_code=True)
self.model_args = get_model_args_from_config(self.config)
self.layer_names = [self.layer_names_dict['embed']] + \
[f'{self.layer_names_dict["layer_prefix"]}.{i}' for i in range(self.model_args.n_layers)] + \
[self.layer_names_dict['norm'], self.layer_names_dict['lm_head']]
self.tokenizer = self.get_tokenizer(hf_token=hf_token)
def get_tokenizer(self, hf_token=None):
if hf_token is not None:
return AutoTokenizer.from_pretrained(self.model_local_path, token=hf_token, trust_remote_code=True)
else:
return AutoTokenizer.from_pretrained(self.model_local_path, trust_remote_code=True)
def generate(self, x, temperature=0, max_new_tokens=None, **kwargs):
tokens = []
for token in self.model_generate(x, temperature=temperature):
tokens.append(token)
if len(tokens) >= max_new_tokens:
break
s = self.tokenizer.decode([t.item() for t in tokens])
return s
def model_generate(self, x, temperature=0, max_new_tokens=None):
cache = []
TEST_NO_LAYERED = True
# Make an additive causal mask. We will need that to process the prompt.
mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])
# First we process the prompt x the same was as in __call__ but
# save the caches in cache
self.record_memory('before_tok_embeddings')
self.tok_embeddings = nn.Embedding(self.model_args.vocab_size, self.model_args.dim)
#w0 = self.tok_embeddings.weight[0][0]
mask = mask.astype(self.tok_embeddings.weight.dtype)
self.record_memory('before_loading_tok')
update_weights = ModelPersister.get_model_persister().load_model(self.layer_names_dict['embed'], self.checkpoint_path)
self.record_memory('after_loading_tok')
self.tok_embeddings.update(update_weights['tok_embeddings'])
#w1 = self.tok_embeddings.weight[0][0]
#assert w0 != w1, f"weight should change after updates, weights: {update_weights}"
x = self.tok_embeddings(x)
# force execution
mx.eval(x)
if not self.test_nonlayered:
del self.tok_embeddings
gc.collect()
else:
print(f"self.test_nonlayered:{self.test_nonlayered}, save layers")
self.layers = []
self.record_memory('after_tok_embeddings')
#for l in self.layers:
for il in tqdm(range(self.model_args.n_layers), desc='running layers:'):
self.record_memory(f'before layer {il}')
l = TransformerBlock(args=self.model_args)
l.update(
ModelPersister.get_model_persister().load_model(f'{self.layer_names_dict["layer_prefix"]}.{il}',
self.checkpoint_path)['layers'][il]
)
x, c = l(x, mask=mask)
# force execution
mx.eval(x)
# We store the per layer cache in a simple python list
cache.append(c)
if not self.test_nonlayered:
del l
gc.collect()
else:
self.layers.append(l)
self.record_memory(f'after layer {il}')
self.record_memory('before_norm')
self.norm = RMSNorm(self.model_args.dim, eps=self.model_args.norm_eps)
self.norm.update(
ModelPersister.get_model_persister().load_model(self.layer_names_dict['norm'], self.checkpoint_path)['norm']
)
x = self.norm(x)
# force execution
mx.eval(x)
if not self.test_nonlayered:
del self.norm
gc.collect()
self.record_memory('after_norm')
# We only care about the last logits that generate the next token
self.record_memory('before_lmhead')
self.output = nn.Linear(self.model_args.dim, self.model_args.vocab_size, bias=False)
self.output.update(
ModelPersister.get_model_persister().load_model(self.layer_names_dict['lm_head'], self.checkpoint_path)['output']
)
y = self.output(x[:, -1])
# force execution
mx.eval(y)
if not self.test_nonlayered:
del self.output
gc.collect()
self.record_memory('after_lmhead')
y = sample(y)
# y now has size [1]
# Since MLX is lazily evaluated nothing is computed yet.
# Calling y.item() would force the computation to happen at
# this point but we can also choose not to do that and let the
# user choose when to start the computation.
yield y
# Now we parsed the prompt and generated the first token we
# need to feed it back into the model and loop to generate the
# rest.
while True:
# Unsqueezing the last dimension to add a sequence length
# dimension of 1
x = y[:, None]
if not self.test_nonlayered:
self.record_memory('before_tok_embeddings')
self.tok_embeddings = nn.Embedding(self.model_args.vocab_size, self.model_args.dim)
#w0 = self.tok_embeddings.weight[0][0]
self.tok_embeddings.update(
ModelPersister.get_model_persister().load_model(self.layer_names_dict['embed'], self.checkpoint_path)['tok_embeddings'])
#w1 = self.tok_embeddings.weight[0][0]
#assert w0 != w1, f"weight should change after updates."
x = self.tok_embeddings(x)
# force execution
mx.eval(x)
if not self.test_nonlayered:
del self.tok_embeddings
gc.collect()
self.record_memory('after_tok_embeddings')
for i in tqdm(range(len(cache)), desc='running layers:'):
self.record_memory(f'before layer {il}')
# We are overwriting the arrays in the cache list. When
# the computation will happen, MLX will be discarding the
# old cache the moment it is not needed anymore.
if not self.test_nonlayered:
l = TransformerBlock(args=self.model_args)
l.update(ModelPersister.get_model_persister().load_model(f'{self.layer_names_dict["layer_prefix"]}.{i}',
self.checkpoint_path)['layers'][i])
else:
l = self.layers[i]
x, cache[i] = l(x, mask=None, cache=cache[i])
# force execution
mx.eval(x)
if not self.test_nonlayered:
del l
gc.collect()
self.record_memory(f'after layer {il}')
self.record_memory('before_norm')
if not self.test_nonlayered:
self.norm = RMSNorm(self.model_args.dim, eps=self.model_args.norm_eps)
self.norm.update(ModelPersister.get_model_persister().load_model(self.layer_names_dict['norm'], self.checkpoint_path)['norm'])
x = self.norm(x)
# force execution
mx.eval(x)
if not self.test_nonlayered:
del self.norm
gc.collect()
self.record_memory('after_norm')
if not self.test_nonlayered:
self.output = nn.Linear(self.model_args.dim, self.model_args.vocab_size, bias=False)
self.output.update(ModelPersister.get_model_persister().load_model(self.layer_names_dict['lm_head'], self.checkpoint_path)['output'])
y = sample(self.output(x[:, -1]))
# force execution
mx.eval(y)
if not self.test_nonlayered:
del self.output
gc.collect()
self.record_memory('after_lmhead')
yield y

View File

@@ -0,0 +1 @@
from .model_persister import ModelPersister

View File

@@ -0,0 +1,113 @@
import os
from pathlib import Path
import mlx.core as mx
from .model_persister import ModelPersister
from mlx.utils import tree_unflatten
import torch
import psutil
import numpy as np
from itertools import starmap
def map_torch_to_mlx(model):
# things to change
# 1. there's no "model." in the weight names
model = {k.replace("model.", ""): v for k, v in model.items()}
# 2. mlp is called feed_forward
model = {k.replace("mlp", "feed_forward"): v for k, v in model.items()}
# 3. up_proj, down_proj, gate_proj
model = {k.replace("down_proj", "w2"): v for k, v in model.items()}
model = {k.replace("up_proj", "w3"): v for k, v in model.items()}
model = {k.replace("gate_proj", "w1"): v for k, v in model.items()}
# 4. layernorms
model = {
k.replace("input_layernorm", "attention_norm"): v for k, v in model.items()
}
model = {
k.replace("post_attention_layernorm", "ffn_norm"): v for k, v in model.items()
}
# 5. lm head
model = {k.replace("lm_head", "output"): v for k, v in model.items()}
# 6. token emb
model = {k.replace("embed_tokens", "tok_embeddings"): v for k, v in model.items()}
# 7. attention
model = {k.replace("self_attn", "attention"): v for k, v in model.items()}
model = {k.replace("q_proj", "wq"): v for k, v in model.items()}
model = {k.replace("k_proj", "wk"): v for k, v in model.items()}
model = {k.replace("v_proj", "wv"): v for k, v in model.items()}
model = {k.replace("o_proj", "wo"): v for k, v in model.items()}
#weights = {k: v.to(torch.float16).numpy() for k, v in model.items()}
return model
class MlxModelPersister(ModelPersister):
def __init__(self, *args, **kwargs):
super(MlxModelPersister, self).__init__(*args, **kwargs)
def model_persist_exist(self, layer_name, saving_path):
safetensor_exists = os.path.exists(str(saving_path / (layer_name + 'mlx.npz')))
done_marker_exists = os.path.exists(str(saving_path / (layer_name + 'mlx.done')))
#print(f"checking {layer_name}, {saving_path} - {safetensor_exists},{done_marker_exists}")
return safetensor_exists and done_marker_exists
def persist_model(self, state_dict, layer_name, saving_path):
#save_file(state_dict, saving_path / (layer_name + 'safetensors'))
weights = {k: v.to(torch.float16).numpy() for k, v in state_dict.items()}
np.savez(
saving_path / (layer_name + 'mlx'),
**weights#map_torch_to_mlx(state_dict)
)
print(f"saved as: {saving_path / (layer_name + 'mlx')}")
# set done marker
(saving_path / (layer_name + 'mlx.done')).touch()
def load_model(self, layer_name, path):
try:
to_load_path = Path(path) / (layer_name + ".mlx.npz")
#available = psutil.virtual_memory().available / 1024 / 1024
#print(f"start loading: {to_load_path}, before loading: {available:.02f}")
layer_state_dict = mx.load(to_load_path)
#available = psutil.virtual_memory().available / 1024 / 1024
#print(f"loaded {layer_name}, available mem: {available:.02f}")
layer_state_dict = map_torch_to_mlx(layer_state_dict)
weights = tree_unflatten(list(layer_state_dict.items()))
#for el in layer_name.split("."):
# if len(el) > 0:
# if el.isdigit():
# el = int(el)
# weights = weights[el]
return weights
except Exception as ex:
print(f"error: {layer_name}, {path}")
raise ex

View File

@@ -0,0 +1,39 @@
model_persister = None
class ModelPersister:
def __init__(self):
pass
@classmethod
def get_model_persister(cls):
global model_persister
if model_persister is not None:
return model_persister
from sys import platform
is_on_mac_os = False
if platform == "darwin":
is_on_mac_os = True
if is_on_mac_os:
from .mlx_model_persister import MlxModelPersister
model_persister = MlxModelPersister()
else:
from .safetensor_model_persister import SafetensorModelPersister
model_persister = SafetensorModelPersister()
return model_persister
def model_persist_exist(self, layer_name, saving_path):
pass
def persist_model(self, state_dict, layer_name, path):
pass
def load_model(self, layer_name, path):
pass

View File

@@ -0,0 +1,38 @@
import os
from pathlib import Path
from .model_persister import ModelPersister
from safetensors.torch import load_file, save_file
class SafetensorModelPersister(ModelPersister):
def __init__(self, *args, **kwargs):
super(SafetensorModelPersister, self).__init__(*args, **kwargs)
def model_persist_exist(self, layer_name, saving_path):
safetensor_exists = os.path.exists(str(saving_path / (layer_name + 'safetensors')))
done_marker_exists = os.path.exists(str(saving_path / (layer_name + 'safetensors.done')))
return safetensor_exists and done_marker_exists
def persist_model(self, state_dict, layer_name, saving_path):
save_file(state_dict, saving_path / (layer_name + 'safetensors'))
print(f"saved as: {saving_path / (layer_name + 'safetensors')}")
# set done marker
(saving_path / (layer_name + 'safetensors.done')).touch()
def load_model(self, layer_name, path):
layer_state_dict = load_file(Path(path) / (layer_name + ".safetensors"), device="cpu")
return layer_state_dict

View File

@@ -10,11 +10,20 @@ import time
from collections import OrderedDict, defaultdict
from typing import Dict, List, Optional, Tuple, Union
from sys import platform
is_on_mac_os = False
if platform == "darwin":
is_on_mac_os = True
import torch
import torch.nn as nn
from safetensors.torch import load_file, save_file
from .persist import ModelPersister
try:
import bitsandbytes as bnb
@@ -27,7 +36,6 @@ except ImportError:
import huggingface_hub
# replacement for bnb quantstat.as_dict(True), until the bug is fixed....
def save_quant_state_to_dict(self, packed=True):
"""
@@ -105,7 +113,8 @@ def uncompress_layer_state_dict(layer_state_dict):
return layer_state_dict if uncompressed_layer_state_dict is None else uncompressed_layer_state_dict
def load_layer(local_path, layer_name, profiling=False):
layer_state_dict = load_file(Path(local_path) / (layer_name + ".safetensors"), device="cpu")
#layer_state_dict = load_file(Path(local_path) / (layer_name + ".safetensors"), device="cpu")
layer_state_dict = ModelPersister.get_model_persister().load_model(layer_name, local_path)
if profiling:
t = time.process_time()
@@ -216,22 +225,10 @@ def split_and_save_layers(checkpoint_path, layer_shards_saving_path=None, splitt
#print(f"checking exists: {saving_path}")
if os.path.exists(saving_path):
# dir already exists, check if all layer files are there
files_in_saving_path = glob(str(saving_path / "*.safetensors"))
done_files_in_saving_path = glob(str(saving_path / "*.safetensors.done"))
found_layers = {}
for layer in layers:
found_safetensor_file = [layer+'safetensors' in file_in_saving_path for file_in_saving_path in files_in_saving_path]
#print(layer)
#print(found_safetensor_file)
found_safetensor_file = any(found_safetensor_file)
found_done_file = [layer+'safetensors.done' in file_in_saving_path for file_in_saving_path in done_files_in_saving_path]
#print(found_done_file)
found_done_file = any(found_done_file)
found_layers[layer] = found_safetensor_file and found_done_file
found_layers[layer] = ModelPersister.get_model_persister().model_persist_exist(layer, saving_path)
if all(found_layers.values()):
@@ -278,15 +275,11 @@ def split_and_save_layers(checkpoint_path, layer_shards_saving_path=None, splitt
# Save layer state dict as using safetensors
safetensor_exists = os.path.exists(str(saving_path / (layer + 'safetensors')))
done_marker_exists = os.path.exists(str(saving_path / (layer + 'safetensors.done')))
if (not safetensor_exists) or (not done_marker_exists):
save_file(layer_state_dict, saving_path / (layer + 'safetensors'))
print(f"saved as: {saving_path / (layer + 'safetensors')}")
marker_exists = ModelPersister.get_model_persister().model_persist_exist(layer, saving_path)
if not marker_exists:
ModelPersister.get_model_persister().persist_model(layer_state_dict, layer, saving_path)
# set done marker
(saving_path / (layer + 'safetensors.done')).touch()
# Free memory
for k in layer_state_dict.keys():
@@ -332,10 +325,19 @@ def find_or_create_local_splitted_path(model_local_path_or_repo_id, layer_shards
f"Found local directory in {model_local_path_or_repo_id}, but didn't find downloaded model. Try using {model_local_path_or_repo_id} as a HF repo...")
# it should be a repo id at this point...
if hf_token is not None:
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token)
# check if there's safetensors saved, if so, exclude torch saves
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token, allow_patterns="model.safetensors.index.json")
if len(glob(str(Path(hf_cache_path) / "model.safetensors.index.json"))) > 0:
# there's safe tensor version, exclude torch version
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token,
ignore_patterns=['pytorch_model.bin.index.json', '*.bin'])
else:
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id)
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id,
token=hf_token)
assert os.path.exists(Path(hf_cache_path) / 'pytorch_model.bin.index.json') or \
os.path.exists(Path(hf_cache_path) / 'model.safetensors.index.json'), \
f"{hf_cache_path}/pytorch_model.bin.index.json or {hf_cache_path}/model.safetensors.index.json should exists."

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name="airllm",
version="2.7",
version="2.8",
author="Gavin Li",
author_email="gavinli@animaai.cloud",
description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",

File diff suppressed because it is too large Load Diff