fix compression

2026-03-07 22:33:47 +00:00 · 2023-12-20 10:55:57 -06:00
parent 7948044c9f
commit ed58786755
2 changed files with 13 additions and 7 deletions
--- a/air_llm/airllm/airllm_base.py
+++ b/air_llm/airllm/airllm_base.py
@@ -143,6 +143,11 @@ class AirLLMBaseModel(GenerationMixin):

        # model weights prefetch cuda stream
        self.prefetching = prefetching
+
+        if self.compression is not None:
+            self.prefetching = False
+            print(f"not support prefetching for compression for now. loading with no prepetching mode.")
+
        if prefetching:
            self.stream = torch.cuda.Stream()
        else:
@@ -265,13 +270,14 @@ class AirLLMBaseModel(GenerationMixin):
            state_dict = load_layer_output

        # pin memory:
-        t = time.time()
-        for k in state_dict.keys():
-            state_dict[k].pin_memory()
+        if self.prefetching:
+            t = time.time()
+            for k in state_dict.keys():
+                state_dict[k].pin_memory()

-        elapsed_time = time.time() - t
-        if self.profiling_mode:
-            self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)
+            elapsed_time = time.time() - t
+            if self.profiling_mode:
+                self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)

        return state_dict

--- a/air_llm/setup.py
+++ b/air_llm/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:

 setuptools.setup(
    name="airllm",
-    version="2.6",
+    version="2.6.1",
    author="Gavin Li",
    author_email="gavinli@animaai.cloud",
    description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",