Support specifying the layer shard saving path

Author: Yu Li
Date: 2023-11-20 20:39:07 -06:00
Parent: 2f95f82852
Commit: 233bed9f91
4 changed files with 83 additions and 32 deletions


@@ -141,7 +141,8 @@ Buy me a coffee please! Everyone is welcome to contribute to this project 🙏
 ## ✍️ 艾写科技 & Anima AI LLC
-<img src="https://static.aicompose.cn/static/logo/dabble-icon-recolor_trans_bg.svg?t=1698957644" alt="aiwrite" style="width:90px;"/> <img src="https://static.aicompose.cn/static/logo/animaai_logo.png?t=1696952962" alt="aiwrite" style="width:170px;"/>
+<img src="https://static.aicompose.cn/static/logo/dabble-icon-recolor_trans_bg.svg?t=1698957644" alt="aiwrite" style="width:90px;"/>
+<img src="https://static.aicompose.cn/static/logo/animaai_logo.png?t=1696952962" alt="aiwrite" style="width:170px;"/>
 This work is from [Anima AI LLC](https://animaai.cloud) and [aiwrite.ai](https://aiwrite.ai).


@@ -26,6 +26,10 @@ Then, initialize AirLLMLlama2, pass in the huggingface repo ID of the model bein
 Then initialize AirLLMLlama2 with the huggingface repo ID of the model, or its local path, and run inference just like a regular transformer model.
+(*You can also specify the path for saving the split layered model through **layer_shards_saving_path** when initializing AirLLMLlama2.*
+*If you want the layer shards stored under a different path, pass the **layer_shards_saving_path** parameter when initializing AirLLMLlama2.*)
 ```python
 from airllm import AirLLMLlama2

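For illustration, a minimal usage sketch of the new option (the repo id and saving path below are placeholders, not taken from this commit):

```python
from airllm import AirLLMLlama2

# Hypothetical values, for illustration only.
model = AirLLMLlama2(
    "garage-bAInd/Platypus2-70B-instruct",            # any HF repo id or local model path
    layer_shards_saving_path="/mnt/big_disk/airllm",  # layer shards are written here instead of the model dir
)
```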

@@ -1,2 +1,3 @@
 from .airllm import AirLLMLlama2
-from .airllm import split_and_save_layers
+from .airllm import split_and_save_layers
+from .airllm import NotEnoughSpaceException


@@ -6,6 +6,7 @@ import ctypes
 import shutil
 from tqdm import tqdm
 from pathlib import Path
+from glob import glob
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel, GenerationMixin, LlamaForCausalLM, GenerationConfig
@@ -16,10 +17,17 @@ from safetensors.torch import load_file, save_file
 from optimum.bettertransformer import BetterTransformer
 import huggingface_hub

+class NotEnoughSpaceException(Exception):
+    pass

 # Function to clean RAM & vRAM
 def clean_memory():
     gc.collect()
-    ctypes.CDLL("libc.so.6").malloc_trim(0)
+    try:
+        ctypes.CDLL("libc.so.6").malloc_trim(0)
+    except Exception as ex:
+        # malloc_trim is glibc-specific and may be unavailable on this platform
+        pass
     torch.cuda.empty_cache()


 def load_layer(local_path, layer_name):
@@ -27,19 +35,38 @@ def load_layer(local_path, layer_name):
     return layer_state_dict


-def split_and_save_layers(checkpoint_path, splitted_model_dir_name='splitted_model'):
+def check_space(checkpoint_path, layer_shards_saving_path=None):
+    total_shard_files_size_bytes = 0
+    for model_shard_file in glob(str(checkpoint_path / '*')):
+        total_shard_files_size_bytes += os.path.getsize(model_shard_file)
+
+    total, used, free = shutil.disk_usage(checkpoint_path if layer_shards_saving_path is None else layer_shards_saving_path)
+
+    if free < total_shard_files_size_bytes:
+        raise NotEnoughSpaceException(f"Not enough space. Free space under {checkpoint_path if layer_shards_saving_path is None else layer_shards_saving_path}:" \
+                                      f" {free / 1024 / 1024 / 1024:.02f}GB. Model total size: {total_shard_files_size_bytes / 1024 / 1024 / 1024:.02f}GB.")
+
+
+def split_and_save_layers(checkpoint_path, layer_shards_saving_path=None, splitted_model_dir_name='splitted_model'):
     """
     Save all layers of a model sharded checkpoint using safetensors.
     """
     checkpoint_path = Path(checkpoint_path)
-    total, used, free = shutil.disk_usage(checkpoint_path)
-    Llama2_70B_size = 134720680
+    saving_path = checkpoint_path / splitted_model_dir_name

-    if free/1024 < Llama2_70B_size:
-        print(f"WARNING: free space in the saving path {checkpoint_path / splitted_model_dir_name} seems small: {free/1024/1024/1024:02f}GB, please make sure you have enough space to save the splitted model")
+    if layer_shards_saving_path is not None:
+        saving_path = Path(layer_shards_saving_path) / splitted_model_dir_name
+
+    if os.path.exists(saving_path):
+        # already exists; assume the layers were saved before and return directly
+        return str(saving_path)
+
+    check_space(checkpoint_path, layer_shards_saving_path)

     with open(checkpoint_path / 'pytorch_model.bin.index.json', 'rb') as f:
         index = json.load(f)['weight_map']
@@ -50,8 +77,10 @@ def split_and_save_layers(checkpoint_path, splitted_model_dir_name='splitted_mod
     n_shards = len(set(index.values()))
     state_dict = {}

-    if not os.path.exists(checkpoint_path / splitted_model_dir_name):
-        os.makedirs(checkpoint_path / splitted_model_dir_name)
+    if not os.path.exists(saving_path):
+        os.makedirs(saving_path)

     for layer in tqdm(layers):
@@ -67,9 +96,9 @@ def split_and_save_layers(checkpoint_path, splitted_model_dir_name='splitted_mod
         layer_state_dict = dict([(k, v) for k, v in state_dict.items() if k.startswith(layer)])

         # Save layer state dict using safetensors
-        save_file(layer_state_dict, checkpoint_path / splitted_model_dir_name / (layer + 'safetensors'))
+        save_file(layer_state_dict, saving_path / (layer + 'safetensors'))

-        print(f"saved as: {checkpoint_path / splitted_model_dir_name / (layer + 'safetensors')}")
+        print(f"saved as: {saving_path / (layer + 'safetensors')}")

         # Free memory
         for k in layer_state_dict.keys():
@@ -77,17 +106,31 @@ def split_and_save_layers(checkpoint_path, splitted_model_dir_name='splitted_mod
         del layer_state_dict
         gc.collect()

-    return str(checkpoint_path / splitted_model_dir_name)
+    return str(saving_path)
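As an aside, a minimal sketch of how the new parameter changes this call (paths below are hypothetical, for illustration only):

```python
# Hypothetical paths, showing the new signature only.
shards_dir = split_and_save_layers(
    "/models/llama2-70b",                      # checkpoint_path: a downloaded HF checkpoint dir
    layer_shards_saving_path="/mnt/big_disk",  # shards are written to /mnt/big_disk/splitted_model
)
# Without layer_shards_saving_path the shards land in /models/llama2-70b/splitted_model.
```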
-def find_or_create_local_splitted_path(model_local_path_or_repo_id):
-    # try as splitted path first...
-    if os.path.exists(Path(model_local_path_or_repo_id) / 'splitted_model'):
-        return Path(model_local_path_or_repo_id) / 'splitted_model'
-
-    # try local model path
+def find_or_create_local_splitted_path(model_local_path_or_repo_id, layer_shards_saving_path=None):
+    """
+    Find the model's local cache path, download the model if it is not cached yet, then split and save it.
+
+    Parameters
+    ----------
+    model_local_path_or_repo_id : str
+        model local path or hf repo id
+    layer_shards_saving_path : str, optional
+        optional path to save the splitted model, by default directly under the model local path
+
+    Returns
+    -------
+    model_local_path : str
+        local model path
+    saved_layer_shards_path : str
+        path where the layer shards were saved
+    """
+    # try the local model path first; if the model exists there, split and save it there
     if os.path.exists(model_local_path_or_repo_id):
         if os.path.exists(Path(model_local_path_or_repo_id) / 'pytorch_model.bin.index.json'):
-            return split_and_save_layers(model_local_path_or_repo_id)
+            return Path(model_local_path_or_repo_id), split_and_save_layers(model_local_path_or_repo_id, layer_shards_saving_path)
         else:
             print(
                 f"Found local directory in {model_local_path_or_repo_id}, but didn't find downloaded model. Try using {model_local_path_or_repo_id} as a HF repo...")
@@ -97,43 +140,45 @@ def find_or_create_local_splitted_path(model_local_path_or_repo_id):
     assert os.path.exists(Path(
         hf_cache_path) / 'pytorch_model.bin.index.json'), f"{hf_cache_path}/pytorch_model.bin.index.json should exist."

-    if os.path.exists(Path(hf_cache_path) / 'splitted_model'):
-        return Path(hf_cache_path) / 'splitted_model'
-    else:
-        return split_and_save_layers(hf_cache_path)
+    # if the splitted_model subdir already exists under the cache it is reused, otherwise split and save
+    return Path(hf_cache_path), split_and_save_layers(hf_cache_path, layer_shards_saving_path)
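A hedged sketch of the new return contract of this helper (repo id and directory below are illustrative, not part of the commit):

```python
# Illustrative values; the function now returns a (model path, shards path) tuple.
model_local_path, layer_shards_path = find_or_create_local_splitted_path(
    "garage-bAInd/Platypus2-70B-instruct",     # hf repo id or local model path
    layer_shards_saving_path="/mnt/big_disk",  # optional; keeps shards outside the HF cache
)
# model_local_path  -> directory with the original config/tokenizer files
# layer_shards_path -> .../splitted_model directory with per-layer safetensors
```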
 class AirLLMLlama2(GenerationMixin):
-    def __init__(self, model_local_path_or_repo_id, device="cuda:0", dtype=torch.float16, max_seq_len=512):
+    def __init__(self, model_local_path_or_repo_id, device="cuda:0", dtype=torch.float16, max_seq_len=512,
+                 layer_shards_saving_path=None):
         """
         Sharded version of LlamaForCausalLM: the model is split into layer shards to reduce GPU memory usage.
         During the forward pass, the inputs are processed layer by layer, and the GPU memory is freed after each layer.
-        To avoid loading the layers multiple times, we could save all the intermediate activations in RAM, but
-        as Kaggle accelerators have more GPU memory than CPU, we simply batch the inputs and keep them on the GPU.
+        To avoid loading the layers multiple times, we could save all the intermediate activations in RAM.

         Parameters
         ----------
-        checkpoint_path : str or Path
-            path to the checkpoint
+        model_local_path_or_repo_id : str or Path
+            path to the local model checkpoint or huggingface repo id
         device : str, optional
             device, by default "cuda:0"
         dtype : torch.dtype, optional
             dtype, by default torch.float16
         max_seq_len : int, optional
             max seq length, by default 512
+        layer_shards_saving_path : str, optional
+            optional path to save the layer shard files; by default they are saved under the model's local cache, in a subdirectory named splitted_model
         """
         # Save parameters
-        self.checkpoint_path = find_or_create_local_splitted_path(model_local_path_or_repo_id)
+        self.model_local_path, self.checkpoint_path = find_or_create_local_splitted_path(model_local_path_or_repo_id, layer_shards_saving_path)
         self.running_device = device
         self.device = torch.device(self.running_device)
         self.running_dtype = dtype
         self.dtype = self.running_dtype

         # Create model
-        self.config = AutoConfig.from_pretrained(self.checkpoint_path.parent)
-        self.generation_config = GenerationConfig.from_pretrained(self.checkpoint_path.parent)
+        self.config = AutoConfig.from_pretrained(self.model_local_path)
+        self.generation_config = GenerationConfig.from_pretrained(self.model_local_path)
         #print(f"using generation_config: {self.generation_config}")
-        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint_path.parent)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_local_path)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "right"

         self.init_model()
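Finally, a short sketch of what the two attributes set above now point to (repo id and path are placeholders, not taken from this commit):

```python
model = AirLLMLlama2("garage-bAInd/Platypus2-70B-instruct",
                     layer_shards_saving_path="/mnt/big_disk")  # hypothetical path

# config, generation_config and the tokenizer are read from the original model files:
print(model.model_local_path)
# per-layer safetensors shards are loaded one at a time from here during inference:
print(model.checkpoint_path)   # e.g. /mnt/big_disk/splitted_model
```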