mirror of
https://github.com/0xSojalSec/airllm.git
synced 2026-03-07 22:33:47 +00:00
add macos support
This commit is contained in:
@@ -6,6 +6,10 @@ AirLLM优化inference内存,4GB单卡GPU可以运行70B大语言模型推理
|
||||
|
||||
## Updates
|
||||
|
||||
[2023/12/25] v2.8: Support MacOS running 70B large language models.
|
||||
|
||||
支持苹果系统运行70B大模型!
|
||||
|
||||
[2023/12/20] v2.7: Support AirLLMMixtral.
|
||||
|
||||
[2023/12/20] v2.6: Added AutoModel, automatically detect model type, no need to provide model class to initialize model.
|
||||
@@ -30,7 +34,16 @@ airllm2.0。支持模型压缩,速度提升3倍。
|
||||
|
||||
airllm发布。
|
||||
|
||||
## Table of Contents
|
||||
|
||||
* [Quick start](#quickstart)
|
||||
* [Model Compression](#3-model-compression---3x-inference-speed-up)
|
||||
* [Configurations](#configurations)
|
||||
* [Run on MacOS](#macos)
|
||||
* [Example notebooks](#example-python-notebook)
|
||||
* [Supported Models](#supported-models)
|
||||
* [Acknowledgement](#acknowledgement)
|
||||
* [FAQ](#faq)
|
||||
|
||||
## Quickstart
|
||||
|
||||
@@ -99,7 +112,7 @@ Note: During inference, the original model will first be decomposed and saved la
|
||||
注意:推理过程会首先将原始模型按层分拆,转存。请保证huggingface cache目录有足够的磁盘空间。
|
||||
|
||||
|
||||
### 3. Model Compression - 3x Inference Speed Up!
|
||||
## Model Compression - 3x Inference Speed Up!
|
||||
|
||||
We just added model compression based on block-wise quantization. It can further **speed up the inference speed** by up to **3x**, with **almost ignorable accuracy loss!** (see more performance evaluation and why we use block-wise quantization in [this paper](https://arxiv.org/abs/2212.09720))
|
||||
|
||||
@@ -125,7 +138,7 @@ Quantization normally needs to quantize both weights and activations to really s
|
||||
|
||||
While in our case the bottleneck is mainly at the disk loading, we only need to make the model loading size smaller. So we get to only quantize the weights part, which is easier to ensure the accuracy.
|
||||
|
||||
### 4. Configurations
|
||||
## Configurations
|
||||
|
||||
When initialize the model, we support the following configurations:
|
||||
|
||||
@@ -137,7 +150,18 @@ When initialize the model, we support the following configurations:
|
||||
* **hf_token**: huggingface token can be provided here if downloading gated models like: *meta-llama/Llama-2-7b-hf*
|
||||
* **prefetching**: prefetching to overlap the model loading and compute. By default turned on. For now only AirLLMLlama2 supports this.
|
||||
|
||||
### 5. Example Python Notebook
|
||||
## MacOS
|
||||
|
||||
Just install airllm and run the code the same as on Linux. See more in [Quick Start](#quickstart).
|
||||
|
||||
* make sure you installed [mlx](https://github.com/ml-explore/mlx?tab=readme-ov-file#installation) and torch
|
||||
* you probably need to install the native Python build; see more [here](https://stackoverflow.com/a/65432861/21230266)
|
||||
* only [Apple silicon](https://support.apple.com/en-us/HT211814) is supported
|
||||
|
||||
Example [python notebook](https://github.com/lyogavin/Anima/blob/main/air_llm/examples/run_on_macos.ipynb)
|
||||
|
||||
|
||||
## Example Python Notebook
|
||||
|
||||
Example colabs here:
|
||||
|
||||
@@ -145,7 +169,7 @@ Example colabs here:
|
||||
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
|
||||
</a>
|
||||
|
||||
### 6. Supported Models
|
||||
## Supported Models
|
||||
|
||||
#### [HF open llm leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) top models
|
||||
|
||||
@@ -267,7 +291,7 @@ model.tokenizer.decode(generation_output.sequences[0])
|
||||
|
||||
|
||||
|
||||
## 7. Acknowledgement
|
||||
## Acknowledgement
|
||||
|
||||
A lot of the code are based on SimJeg's great work in the Kaggle exam competition. Big shoutout to SimJeg:
|
||||
|
||||
@@ -276,7 +300,7 @@ A lot of the code are based on SimJeg's great work in the Kaggle exam competitio
|
||||
[the associated discussion](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/446414).
|
||||
|
||||
|
||||
## 8. FAQ
|
||||
## FAQ
|
||||
|
||||
### 8.1. MetadataIncompleteBuffer
|
||||
|
||||
|
||||
@@ -1,11 +1,22 @@
|
||||
from .airllm import AirLLMLlama2
|
||||
from .airllm_chatglm import AirLLMChatGLM
|
||||
from .airllm_qwen import AirLLMQWen
|
||||
from .airllm_baichuan import AirLLMBaichuan
|
||||
from .airllm_internlm import AirLLMInternLM
|
||||
from .airllm_mistral import AirLLMMistral
|
||||
from .airllm_mixtral import AirLLMMixtral
|
||||
from .airllm_base import AirLLMBaseModel
|
||||
from .auto_model import AutoModel
|
||||
from .utils import split_and_save_layers
|
||||
from .utils import NotEnoughSpaceException
|
||||
from sys import platform

# On macOS only the MLX-based Llama implementation is exposed; everywhere
# else the full torch-based model zoo is available.
is_on_mac_os = platform == "darwin"

if is_on_mac_os:
    from .airllm_llama_mlx import AirLLMLlamaMlx
else:
    from .airllm import AirLLMLlama2
    from .airllm_chatglm import AirLLMChatGLM
    from .airllm_qwen import AirLLMQWen
    from .airllm_baichuan import AirLLMBaichuan
    from .airllm_internlm import AirLLMInternLM
    from .airllm_mistral import AirLLMMistral
    from .airllm_mixtral import AirLLMMixtral
    from .airllm_base import AirLLMBaseModel
    from .auto_model import AutoModel
    from .utils import split_and_save_layers
    from .utils import NotEnoughSpaceException
|
||||
|
||||
|
||||
@@ -155,7 +155,12 @@ class AirLLMBaseModel(GenerationMixin):
|
||||
|
||||
# if derived class needs to create generation config differently, like Mistrial, this function can be overridden
|
||||
def get_generation_config(self):
|
||||
return GenerationConfig.from_pretrained(self.model_local_path)
|
||||
# protective on generation config
|
||||
|
||||
try:
|
||||
return GenerationConfig.from_pretrained(self.model_local_path)
|
||||
except Exception as e:
|
||||
return GenerationConfig()
|
||||
|
||||
# a chance to customize tokenizer
|
||||
def get_tokenizer(self, hf_token=None):
|
||||
@@ -414,6 +419,7 @@ class AirLLMBaseModel(GenerationMixin):
|
||||
|
||||
|
||||
for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.running_device,
|
||||
desc='running layers:',
|
||||
total=len(self.layers)):
|
||||
|
||||
if self.prefetching:
|
||||
|
||||
431
air_llm/airllm/airllm_llama_mlx.py
Normal file
431
air_llm/airllm/airllm_llama_mlx.py
Normal file
@@ -0,0 +1,431 @@
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import time
|
||||
import gc
|
||||
from tqdm import tqdm
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import mlx.core as mx
|
||||
import mlx.nn as nn
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
from .persist import ModelPersister
|
||||
import psutil
|
||||
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel, GenerationMixin, LlamaForCausalLM, GenerationConfig
|
||||
from .utils import clean_memory, load_layer, \
|
||||
find_or_create_local_splitted_path
|
||||
|
||||
|
||||
|
||||
@dataclass
class ModelArgs:
    """Hyper-parameters describing a Llama-style transformer for the MLX runtime.

    Produced by get_model_args_from_config / sanitize_config from a
    HuggingFace model config.
    """

    dim: int            # model hidden size
    n_layers: int       # number of transformer blocks
    head_dim: int       # per-attention-head dimension
    hidden_dim: int     # feed-forward intermediate size
    n_heads: int        # number of query heads
    n_kv_heads: int     # number of key/value heads (grouped-query attention)
    norm_eps: float     # epsilon used by RMSNorm
    vocab_size: int     # tokenizer vocabulary size
    rope_theta: float   # RoPE base frequency
    rope_traditional: bool = True  # RoPE layout flag passed to nn.RoPE
|
||||
def sanitize_config(config, weights=None):
    """Normalize a raw config dict into exactly the keys ModelArgs accepts.

    Mutates and returns *config*: derives missing keys from the ones present
    and strips keys ModelArgs does not know about.  *weights* is accepted for
    interface compatibility but unused here.
    """
    config.pop("model_type", None)

    heads = config["n_heads"] if 'n_heads' in config else config['num_attention_heads']
    config.setdefault("n_kv_heads", heads)
    if "head_dim" not in config:
        config["head_dim"] = config["dim"] // heads
    #if "hidden_dim" not in config:
    #    config["hidden_dim"] = weights["layers.0.feed_forward.w1.weight"].shape[0]
    #if config.get("vocab_size", -1) < 0:
    #    config["vocab_size"] = weights["output.weight"].shape[-1]
    config.setdefault("rope_theta", 10000)

    # Keys that may appear in upstream configs but are not ModelArgs fields.
    for stale_key in ("multiple_of", "ffn_dim_multiplier"):
        config.pop(stale_key, None)
    return config
|
||||
|
||||
def get_model_args_from_config(config):
    """Translate a HuggingFace model config object into ModelArgs.

    Maps HF attribute names (hidden_size, num_attention_heads, ...) onto
    the Llama-style names ModelArgs uses, then runs sanitize_config to
    fill in any remaining defaults.
    """
    params = {
        "dim": config.hidden_size,
        "hidden_dim": config.intermediate_size,
        "n_heads": config.num_attention_heads,
        "n_layers": config.num_hidden_layers,
        "vocab_size": config.vocab_size,
        "norm_eps": config.rms_norm_eps,
        "rope_traditional": False,
    }
    # Only present on GQA models; sanitize_config defaults it to n_heads.
    if hasattr(config, "num_key_value_heads"):
        params["n_kv_heads"] = config.num_key_value_heads

    return ModelArgs(**sanitize_config(params))
|
||||
|
||||
class RMSNorm(nn.Module):
    """Root-mean-square layer norm (no mean subtraction), as used by Llama."""

    def __init__(self, dims: int, eps: float = 1e-5):
        super().__init__()
        self.weight = mx.ones((dims,))  # learned per-channel scale
        self.eps = eps

    def _norm(self, x):
        # x / rms(x), computed with rsqrt over the last axis.
        return x * mx.rsqrt(x.square().mean(-1, keepdims=True) + self.eps)

    def __call__(self, x):
        # Normalize in float32 for stability, then cast back to the input dtype.
        output = self._norm(x.astype(mx.float32)).astype(x.dtype)
        return self.weight * output
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """Multi-head self-attention with RoPE and grouped-query KV heads.

    __call__ returns a tuple (output, (keys, values)): the attended hidden
    states and the updated per-layer KV cache.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args

        self.n_heads: int = args.n_heads
        self.n_kv_heads: int = args.n_kv_heads

        # How many query heads share each KV head.
        self.repeats = self.n_heads // self.n_kv_heads

        # 1/sqrt(head_dim) scaling applied to the queries.
        self.scale = self.args.head_dim**-0.5

        self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=False)
        self.rope = nn.RoPE(
            args.head_dim, traditional=args.rope_traditional, base=args.rope_theta
        )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        B, L, D = x.shape

        queries, keys, values = self.wq(x), self.wk(x), self.wv(x)

        # Prepare the queries, keys and values for the attention computation
        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)

        def repeat(a):
            # Tile KV heads so their count matches the query head count.
            a = mx.concatenate([mx.expand_dims(a, 2)] * self.repeats, axis=2)
            return a.reshape([B, self.n_heads, L, -1])

        keys, values = map(repeat, (keys, values))

        if cache is not None:
            # Continue from the cached sequence: apply RoPE with the cached
            # length as the position offset, then append to the cache.
            key_cache, value_cache = cache
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        scores = (queries * self.scale) @ keys.transpose(0, 1, 3, 2)
        if mask is not None:
            scores += mask
        # Softmax in float32 for numerical stability, then cast back.
        scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
        output = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.wo(output), (keys, values)
|
||||
|
||||
|
||||
class FeedForward(nn.Module):
    """SwiGLU feed-forward network: w2(silu(w1(x)) * w3(x))."""

    def __init__(self, args: ModelArgs):
        super().__init__()

        # Naming follows Llama: w1=gate_proj, w2=down_proj, w3=up_proj
        # (see map_torch_to_mlx in the persister module).
        self.w1 = nn.Linear(args.dim, args.hidden_dim, bias=False)
        self.w2 = nn.Linear(args.hidden_dim, args.dim, bias=False)
        self.w3 = nn.Linear(args.dim, args.hidden_dim, bias=False)

    def __call__(self, x) -> mx.array:
        return self.w2(nn.silu(self.w1(x)) * self.w3(x))
|
||||
|
||||
|
||||
class TransformerBlock(nn.Module):
    """One pre-norm Llama transformer layer: attention and SwiGLU FFN,
    each wrapped in an RMSNorm and a residual connection.

    __call__ returns a tuple (hidden_states, updated_kv_cache).
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.attention = Attention(args)
        self.feed_forward = FeedForward(args=args)
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.args = args

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        # Attention sub-layer with residual.
        r, cache = self.attention(self.attention_norm(x), mask, cache)
        h = x + r
        # Feed-forward sub-layer with residual.
        r = self.feed_forward(self.ffn_norm(h))
        out = h + r
        return out, cache
|
||||
|
||||
def sample(logits, temperature=0):
    """Pick the next token id: greedy argmax at temperature 0, otherwise
    sample from the temperature-scaled categorical distribution."""
    if temperature == 0:
        return mx.argmax(logits, axis=-1)
    return mx.random.categorical(logits * (1 / temperature))
|
||||
|
||||
class AirLLMLlamaMlx:
|
||||
|
||||
# customize layer names here
def set_layer_names_dict(self):
    """Map logical layer roles to the HF Llama weight-name prefixes used
    when splitting and reloading checkpoints."""
    self.layer_names_dict = {
        'embed': 'model.embed_tokens',
        'layer_prefix': 'model.layers',
        'norm': 'model.norm',
        'lm_head': 'lm_head',
    }
|
||||
|
||||
|
||||
def record_memory(self, msg=None):
    """Log currently available system memory (MB) and track the low-water mark.

    No-op unless show_memory_util was enabled at construction time.
    msg: free-form label identifying where in the pipeline the sample is taken.
    """
    if not self.show_memory_util:
        return

    available = psutil.virtual_memory().available / 1024 / 1024
    if self.least_available is None:
        self.least_available = available
    else:
        self.least_available = min(available, self.least_available)

    # Bug fix: the original printed `available` for both figures (so the
    # tracked low-water mark was never reported) and used `:.02`, which is
    # 2 *significant digits*, not 2 decimals.
    print(f"[{msg}] - available mem: {available:.2f}mb, least available:{self.least_available:.2f}mb")
|
||||
|
||||
def __init__(self, model_local_path_or_repo_id, device="cuda:0", dtype=None, max_seq_len=512,
             layer_shards_saving_path=None, profiling_mode=False, compression=None,
             hf_token=None, prefetching=True, test_nonlayered=False, show_memory_util=False):
    """Set up the MLX-based layered Llama runner.

    model_local_path_or_repo_id: local model dir or HF repo id; the checkpoint
        is split into per-layer shards on first use.
    hf_token: HF access token for gated repos.
    test_nonlayered: keep all layers in memory instead of load/free per layer
        (debugging aid used throughout model_generate).
    show_memory_util: enable record_memory logging.
    NOTE(review): device, dtype, max_seq_len, profiling_mode and prefetching
    are accepted for signature compatibility but are not read anywhere in
    this class — confirm against the torch-based siblings.
    """
    self.hf_token = hf_token
    self.set_layer_names_dict()
    self.test_nonlayered = test_nonlayered
    self.show_memory_util = show_memory_util
    self.least_available = None  # low-water mark tracked by record_memory

    # Download (if needed) and split the checkpoint into per-layer shards.
    self.model_local_path, self.checkpoint_path = find_or_create_local_splitted_path(model_local_path_or_repo_id,
                                                                                     layer_shards_saving_path,
                                                                                     compression=compression,
                                                                                     layer_names=self.layer_names_dict,
                                                                                     hf_token=hf_token)
    if hf_token is not None:
        self.config = AutoConfig.from_pretrained(self.model_local_path, token=hf_token, trust_remote_code=True)
    else:
        self.config = AutoConfig.from_pretrained(self.model_local_path, trust_remote_code=True)

    # Translate the HF config into the MLX ModelArgs.
    self.model_args = get_model_args_from_config(self.config)

    # Ordered list of shard names: embeddings, each transformer layer,
    # final norm, lm head.
    self.layer_names = [self.layer_names_dict['embed']] + \
                       [f'{self.layer_names_dict["layer_prefix"]}.{i}' for i in range(self.model_args.n_layers)] + \
                       [self.layer_names_dict['norm'], self.layer_names_dict['lm_head']]

    self.tokenizer = self.get_tokenizer(hf_token=hf_token)
||||
# a chance to customize tokenizer
def get_tokenizer(self, hf_token=None):
    """Load the HF tokenizer for the local model path (token optional)."""
    kwargs = {"trust_remote_code": True}
    if hf_token is not None:
        kwargs["token"] = hf_token
    return AutoTokenizer.from_pretrained(self.model_local_path, **kwargs)
|
||||
|
||||
|
||||
def generate(self, x, temperature=0, max_new_tokens=None, **kwargs):
    """Generate from prompt token ids *x* and decode the result to a string.

    temperature: 0 means greedy decoding (see sample()).
    max_new_tokens: stop after this many generated tokens.  model_generate
        is an infinite generator, so callers should always pass a limit;
        None means no limit.
    Extra **kwargs are accepted for interface compatibility and ignored.
    """
    tokens = []
    for token in self.model_generate(x, temperature=temperature):
        tokens.append(token)
        # Bug fix: with the default max_new_tokens=None the original
        # evaluated `len(tokens) >= None`, raising a TypeError after the
        # first token.  Treat None as "no explicit limit".
        if max_new_tokens is not None and len(tokens) >= max_new_tokens:
            break

    return self.tokenizer.decode([t.item() for t in tokens])
|
||||
|
||||
def model_generate(self, x, temperature=0, max_new_tokens=None):
    """Layer-by-layer generation loop: yields one sampled token id at a time.

    Every component (embeddings, each transformer block, final norm, lm head)
    is loaded from its on-disk shard, applied, and — unless test_nonlayered
    is set — immediately freed again, so only one layer's weights are
    resident at a time.  This generator never terminates on its own;
    the caller (generate) decides when to stop.  max_new_tokens is unused
    here and kept only for signature compatibility.
    """
    cache = []

    # Make an additive causal mask. We will need that to process the prompt.
    mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1])

    # First we process the prompt x the same way as in __call__ but
    # save the per-layer KV caches in `cache`.

    self.record_memory('before_tok_embeddings')
    self.tok_embeddings = nn.Embedding(self.model_args.vocab_size, self.model_args.dim)
    mask = mask.astype(self.tok_embeddings.weight.dtype)

    self.record_memory('before_loading_tok')
    update_weights = ModelPersister.get_model_persister().load_model(self.layer_names_dict['embed'], self.checkpoint_path)

    self.record_memory('after_loading_tok')
    self.tok_embeddings.update(update_weights['tok_embeddings'])

    x = self.tok_embeddings(x)
    # force execution
    mx.eval(x)

    if not self.test_nonlayered:
        del self.tok_embeddings
        gc.collect()
    else:
        print(f"self.test_nonlayered:{self.test_nonlayered}, save layers")
        self.layers = []

    self.record_memory('after_tok_embeddings')

    for il in tqdm(range(self.model_args.n_layers), desc='running layers:'):
        self.record_memory(f'before layer {il}')
        l = TransformerBlock(args=self.model_args)
        l.update(
            ModelPersister.get_model_persister().load_model(f'{self.layer_names_dict["layer_prefix"]}.{il}',
                                                            self.checkpoint_path)['layers'][il]
        )

        x, c = l(x, mask=mask)
        # force execution
        mx.eval(x)
        # We store the per layer cache in a simple python list
        cache.append(c)

        if not self.test_nonlayered:
            del l
            gc.collect()
        else:
            self.layers.append(l)
        self.record_memory(f'after layer {il}')

    self.record_memory('before_norm')
    self.norm = RMSNorm(self.model_args.dim, eps=self.model_args.norm_eps)
    self.norm.update(
        ModelPersister.get_model_persister().load_model(self.layer_names_dict['norm'], self.checkpoint_path)['norm']
    )
    x = self.norm(x)
    # force execution
    mx.eval(x)
    if not self.test_nonlayered:
        del self.norm
        gc.collect()
    self.record_memory('after_norm')

    # We only care about the last logits that generate the next token
    self.record_memory('before_lmhead')
    self.output = nn.Linear(self.model_args.dim, self.model_args.vocab_size, bias=False)
    self.output.update(
        ModelPersister.get_model_persister().load_model(self.layer_names_dict['lm_head'], self.checkpoint_path)['output']
    )
    y = self.output(x[:, -1])
    # force execution
    mx.eval(y)

    if not self.test_nonlayered:
        del self.output
        gc.collect()
    self.record_memory('after_lmhead')
    y = sample(y)

    # y now has size [1]
    # Since MLX is lazily evaluated nothing is computed yet.
    # Calling y.item() would force the computation to happen at
    # this point but we can also choose not to do that and let the
    # user choose when to start the computation.
    yield y

    # Now we parsed the prompt and generated the first token we
    # need to feed it back into the model and loop to generate the
    # rest.
    while True:
        # Unsqueezing the last dimension to add a sequence length
        # dimension of 1
        x = y[:, None]

        if not self.test_nonlayered:
            self.record_memory('before_tok_embeddings')
            self.tok_embeddings = nn.Embedding(self.model_args.vocab_size, self.model_args.dim)
            self.tok_embeddings.update(
                ModelPersister.get_model_persister().load_model(self.layer_names_dict['embed'], self.checkpoint_path)['tok_embeddings'])
        x = self.tok_embeddings(x)

        # force execution
        mx.eval(x)
        if not self.test_nonlayered:
            del self.tok_embeddings
            gc.collect()
        self.record_memory('after_tok_embeddings')

        for i in tqdm(range(len(cache)), desc='running layers:'):
            # Bug fix: the original logged the stale prompt-phase loop index
            # `il` in both record_memory calls here; use the current index `i`.
            self.record_memory(f'before layer {i}')
            # We are overwriting the arrays in the cache list. When
            # the computation will happen, MLX will be discarding the
            # old cache the moment it is not needed anymore.

            if not self.test_nonlayered:
                l = TransformerBlock(args=self.model_args)
                l.update(ModelPersister.get_model_persister().load_model(f'{self.layer_names_dict["layer_prefix"]}.{i}',
                                                                         self.checkpoint_path)['layers'][i])
            else:
                l = self.layers[i]

            x, cache[i] = l(x, mask=None, cache=cache[i])
            # force execution
            mx.eval(x)
            if not self.test_nonlayered:
                del l
                gc.collect()
            self.record_memory(f'after layer {i}')

        self.record_memory('before_norm')
        if not self.test_nonlayered:
            self.norm = RMSNorm(self.model_args.dim, eps=self.model_args.norm_eps)
            self.norm.update(ModelPersister.get_model_persister().load_model(self.layer_names_dict['norm'], self.checkpoint_path)['norm'])
        x = self.norm(x)
        # force execution
        mx.eval(x)

        if not self.test_nonlayered:
            del self.norm
            gc.collect()

        self.record_memory('after_norm')

        if not self.test_nonlayered:
            self.output = nn.Linear(self.model_args.dim, self.model_args.vocab_size, bias=False)
            self.output.update(ModelPersister.get_model_persister().load_model(self.layer_names_dict['lm_head'], self.checkpoint_path)['output'])
        y = sample(self.output(x[:, -1]))

        # force execution
        mx.eval(y)
        if not self.test_nonlayered:
            del self.output
            gc.collect()

        self.record_memory('after_lmhead')
        yield y
|
||||
1
air_llm/airllm/persist/__init__.py
Normal file
1
air_llm/airllm/persist/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .model_persister import ModelPersister
|
||||
113
air_llm/airllm/persist/mlx_model_persister.py
Normal file
113
air_llm/airllm/persist/mlx_model_persister.py
Normal file
@@ -0,0 +1,113 @@
|
||||
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
import mlx.core as mx
|
||||
from .model_persister import ModelPersister
|
||||
from mlx.utils import tree_unflatten
|
||||
import torch
|
||||
|
||||
import psutil
|
||||
import numpy as np
|
||||
from itertools import starmap
|
||||
|
||||
|
||||
|
||||
def map_torch_to_mlx(model):
    """Rename HF Llama state-dict keys to this module's MLX naming scheme.

    Returns a new dict with translated keys; values are passed through
    untouched.
    """
    # (huggingface substring, mlx substring), applied in order to every key:
    # 1. drop "model." prefixes, 2. mlp -> feed_forward, 3. ffn projections,
    # 4. layernorms, 5. lm head, 6. token embeddings, 7. attention.
    renames = (
        ("model.", ""),
        ("mlp", "feed_forward"),
        ("down_proj", "w2"),
        ("up_proj", "w3"),
        ("gate_proj", "w1"),
        ("input_layernorm", "attention_norm"),
        ("post_attention_layernorm", "ffn_norm"),
        ("lm_head", "output"),
        ("embed_tokens", "tok_embeddings"),
        ("self_attn", "attention"),
        ("q_proj", "wq"),
        ("k_proj", "wk"),
        ("v_proj", "wv"),
        ("o_proj", "wo"),
    )

    def translate(key):
        for hf_name, mlx_name in renames:
            key = key.replace(hf_name, mlx_name)
        return key

    return {translate(k): v for k, v in model.items()}
|
||||
|
||||
class MlxModelPersister(ModelPersister):
    """ModelPersister backend for macOS: stores per-layer weights as numpy
    ``.npz`` archives and loads them back as an MLX parameter tree."""

    def __init__(self, *args, **kwargs):
        super(MlxModelPersister, self).__init__(*args, **kwargs)

    def model_persist_exist(self, layer_name, saving_path):
        # A layer counts as persisted only when both the archive and its
        # ".done" marker exist (the marker guards against partial writes).
        # NOTE(review): the filenames here and in persist_model are
        # "<layer_name>mlx.npz" (no separating dot), while load_model reads
        # "<layer_name>.mlx.npz" — presumably the layer names passed in
        # differ by a trailing "."; confirm against the callers in utils.py.
        safetensor_exists = os.path.exists(str(saving_path / (layer_name + 'mlx.npz')))
        done_marker_exists = os.path.exists(str(saving_path / (layer_name + 'mlx.done')))

        #print(f"checking {layer_name}, {saving_path} - {safetensor_exists},{done_marker_exists}")

        return safetensor_exists and done_marker_exists

    def persist_model(self, state_dict, layer_name, saving_path):
        #save_file(state_dict, saving_path / (layer_name + 'safetensors'))
        # Convert torch tensors to float16 numpy arrays; np.savez appends
        # the ".npz" suffix to the target path automatically.
        weights = {k: v.to(torch.float16).numpy() for k, v in state_dict.items()}
        np.savez(
            saving_path / (layer_name + 'mlx'),
            **weights#map_torch_to_mlx(state_dict)
        )

        print(f"saved as: {saving_path / (layer_name + 'mlx')}")

        # set done marker
        (saving_path / (layer_name + 'mlx.done')).touch()

    def load_model(self, layer_name, path):
        """Load one layer's archive, rename keys to the MLX scheme, and
        return a nested (unflattened) tree suitable for Module.update()."""
        try:
            to_load_path = Path(path) / (layer_name + ".mlx.npz")
            #available = psutil.virtual_memory().available / 1024 / 1024
            #print(f"start loading: {to_load_path}, before loading: {available:.02f}")
            layer_state_dict = mx.load(to_load_path)
            #available = psutil.virtual_memory().available / 1024 / 1024
            #print(f"loaded {layer_name}, available mem: {available:.02f}")

            # Translate HF key names (q_proj, mlp, ...) to MLX names (wq, ...).
            layer_state_dict = map_torch_to_mlx(layer_state_dict)

            # Turn the flat "a.b.c" key dict into a nested parameter tree.
            weights = tree_unflatten(list(layer_state_dict.items()))

            #for el in layer_name.split("."):
            #    if len(el) > 0:
            #        if el.isdigit():
            #            el = int(el)
            #        weights = weights[el]

            return weights
        except Exception as ex:
            # Log which layer/path failed, then re-raise unchanged.
            print(f"error: {layer_name}, {path}")
            raise ex
|
||||
39
air_llm/airllm/persist/model_persister.py
Normal file
39
air_llm/airllm/persist/model_persister.py
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
|
||||
|
||||
# Lazily-created process-wide singleton, see ModelPersister.get_model_persister.
model_persister = None

class ModelPersister:
    """Interface for saving and loading per-layer model weight shards.

    Concrete backends: MlxModelPersister on macOS (Apple MLX / .npz files)
    and SafetensorModelPersister everywhere else.
    """

    def __init__(self):
        pass

    @classmethod
    def get_model_persister(cls):
        """Return the platform-appropriate singleton persister, creating it
        on first use (imports are deferred to avoid pulling in the wrong
        backend's dependencies)."""
        global model_persister
        if model_persister is None:
            from sys import platform

            if platform == "darwin":
                from .mlx_model_persister import MlxModelPersister
                model_persister = MlxModelPersister()
            else:
                from .safetensor_model_persister import SafetensorModelPersister
                model_persister = SafetensorModelPersister()
        return model_persister

    def model_persist_exist(self, layer_name, saving_path):
        pass

    def persist_model(self, state_dict, layer_name, path):
        pass

    def load_model(self, layer_name, path):
        pass
|
||||
38
air_llm/airllm/persist/safetensor_model_persister.py
Normal file
38
air_llm/airllm/persist/safetensor_model_persister.py
Normal file
@@ -0,0 +1,38 @@
|
||||
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from .model_persister import ModelPersister
|
||||
from safetensors.torch import load_file, save_file
|
||||
|
||||
|
||||
|
||||
|
||||
class SafetensorModelPersister(ModelPersister):
    """Default ModelPersister backend: stores per-layer weights as
    safetensors files (used on non-macOS platforms)."""

    def __init__(self, *args, **kwargs):
        super(SafetensorModelPersister, self).__init__(*args, **kwargs)

    def model_persist_exist(self, layer_name, saving_path):
        # A layer counts as persisted only when both the file and its
        # ".done" marker exist (the marker guards against partial writes).
        # NOTE(review): filenames here and in persist_model are
        # "<layer_name>safetensors" (no separating dot), while load_model
        # reads "<layer_name>.safetensors" — presumably the layer names
        # passed in differ by a trailing "."; confirm against utils.py.
        safetensor_exists = os.path.exists(str(saving_path / (layer_name + 'safetensors')))
        done_marker_exists = os.path.exists(str(saving_path / (layer_name + 'safetensors.done')))

        return safetensor_exists and done_marker_exists

    def persist_model(self, state_dict, layer_name, saving_path):
        save_file(state_dict, saving_path / (layer_name + 'safetensors'))

        print(f"saved as: {saving_path / (layer_name + 'safetensors')}")

        # set done marker
        (saving_path / (layer_name + 'safetensors.done')).touch()

    def load_model(self, layer_name, path):
        # Always load to CPU; the caller decides device placement.
        layer_state_dict = load_file(Path(path) / (layer_name + ".safetensors"), device="cpu")
        return layer_state_dict
|
||||
@@ -10,11 +10,20 @@ import time
|
||||
|
||||
from collections import OrderedDict, defaultdict
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from sys import platform
|
||||
|
||||
is_on_mac_os = False
|
||||
|
||||
if platform == "darwin":
|
||||
is_on_mac_os = True
|
||||
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from safetensors.torch import load_file, save_file
|
||||
|
||||
from .persist import ModelPersister
|
||||
|
||||
|
||||
try:
|
||||
import bitsandbytes as bnb
|
||||
@@ -27,7 +36,6 @@ except ImportError:
|
||||
import huggingface_hub
|
||||
|
||||
|
||||
|
||||
# replacement for bnb quantstat.as_dict(True), until the bug is fixed....
|
||||
def save_quant_state_to_dict(self, packed=True):
|
||||
"""
|
||||
@@ -105,7 +113,8 @@ def uncompress_layer_state_dict(layer_state_dict):
|
||||
return layer_state_dict if uncompressed_layer_state_dict is None else uncompressed_layer_state_dict
|
||||
|
||||
def load_layer(local_path, layer_name, profiling=False):
|
||||
layer_state_dict = load_file(Path(local_path) / (layer_name + ".safetensors"), device="cpu")
|
||||
#layer_state_dict = load_file(Path(local_path) / (layer_name + ".safetensors"), device="cpu")
|
||||
layer_state_dict = ModelPersister.get_model_persister().load_model(layer_name, local_path)
|
||||
|
||||
if profiling:
|
||||
t = time.process_time()
|
||||
@@ -216,22 +225,10 @@ def split_and_save_layers(checkpoint_path, layer_shards_saving_path=None, splitt
|
||||
#print(f"checking exists: {saving_path}")
|
||||
if os.path.exists(saving_path):
|
||||
# dir already exists, check if all layer files are there
|
||||
files_in_saving_path = glob(str(saving_path / "*.safetensors"))
|
||||
done_files_in_saving_path = glob(str(saving_path / "*.safetensors.done"))
|
||||
|
||||
found_layers = {}
|
||||
for layer in layers:
|
||||
|
||||
found_safetensor_file = [layer+'safetensors' in file_in_saving_path for file_in_saving_path in files_in_saving_path]
|
||||
#print(layer)
|
||||
#print(found_safetensor_file)
|
||||
found_safetensor_file = any(found_safetensor_file)
|
||||
|
||||
found_done_file = [layer+'safetensors.done' in file_in_saving_path for file_in_saving_path in done_files_in_saving_path]
|
||||
#print(found_done_file)
|
||||
found_done_file = any(found_done_file)
|
||||
|
||||
found_layers[layer] = found_safetensor_file and found_done_file
|
||||
found_layers[layer] = ModelPersister.get_model_persister().model_persist_exist(layer, saving_path)
|
||||
|
||||
|
||||
if all(found_layers.values()):
|
||||
@@ -278,15 +275,11 @@ def split_and_save_layers(checkpoint_path, layer_shards_saving_path=None, splitt
|
||||
|
||||
|
||||
# Save layer state dict as using safetensors
|
||||
safetensor_exists = os.path.exists(str(saving_path / (layer + 'safetensors')))
|
||||
done_marker_exists = os.path.exists(str(saving_path / (layer + 'safetensors.done')))
|
||||
if (not safetensor_exists) or (not done_marker_exists):
|
||||
save_file(layer_state_dict, saving_path / (layer + 'safetensors'))
|
||||
|
||||
print(f"saved as: {saving_path / (layer + 'safetensors')}")
|
||||
marker_exists = ModelPersister.get_model_persister().model_persist_exist(layer, saving_path)
|
||||
if not marker_exists:
|
||||
ModelPersister.get_model_persister().persist_model(layer_state_dict, layer, saving_path)
|
||||
|
||||
# set done marker
|
||||
(saving_path / (layer + 'safetensors.done')).touch()
|
||||
|
||||
# Free memory
|
||||
for k in layer_state_dict.keys():
|
||||
@@ -332,10 +325,19 @@ def find_or_create_local_splitted_path(model_local_path_or_repo_id, layer_shards
|
||||
f"Found local directory in {model_local_path_or_repo_id}, but didn't find downloaded model. Try using {model_local_path_or_repo_id} as a HF repo...")
|
||||
|
||||
# it should be a repo id at this point...
|
||||
if hf_token is not None:
|
||||
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token)
|
||||
|
||||
# check if there's safetensors saved, if so, exclude torch saves
|
||||
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token, allow_patterns="model.safetensors.index.json")
|
||||
|
||||
if len(glob(str(Path(hf_cache_path) / "model.safetensors.index.json"))) > 0:
|
||||
# there's safe tensor version, exclude torch version
|
||||
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id, token=hf_token,
|
||||
ignore_patterns=['pytorch_model.bin.index.json', '*.bin'])
|
||||
|
||||
else:
|
||||
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id)
|
||||
hf_cache_path = huggingface_hub.snapshot_download(model_local_path_or_repo_id,
|
||||
token=hf_token)
|
||||
|
||||
assert os.path.exists(Path(hf_cache_path) / 'pytorch_model.bin.index.json') or \
|
||||
os.path.exists(Path(hf_cache_path) / 'model.safetensors.index.json'), \
|
||||
f"{hf_cache_path}/pytorch_model.bin.index.json or {hf_cache_path}/model.safetensors.index.json should exists."
|
||||
|
||||
2091
air_llm/examples/run_on_macos.ipynb
Normal file
2091
air_llm/examples/run_on_macos.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name="airllm",
|
||||
version="2.7",
|
||||
version="2.8",
|
||||
author="Gavin Li",
|
||||
author_email="gavinli@animaai.cloud",
|
||||
description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",
|
||||
|
||||
2091
air_llm/tests/test_notebooks/test_mlx.ipynb
Normal file
2091
air_llm/tests/test_notebooks/test_mlx.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user