mirror of
https://github.com/0xSojalSec/airllm.git
synced 2026-03-07 22:33:47 +00:00
fix compression
This commit is contained in:
@@ -143,6 +143,11 @@ class AirLLMBaseModel(GenerationMixin):
|
||||
|
||||
# model weights prefetch cuda stream
|
||||
self.prefetching = prefetching
|
||||
|
||||
if self.compression is not None:
|
||||
self.prefetching = False
|
||||
print(f"not support prefetching for compression for now. loading with no prepetching mode.")
|
||||
|
||||
if prefetching:
|
||||
self.stream = torch.cuda.Stream()
|
||||
else:
|
||||
@@ -265,13 +270,14 @@ class AirLLMBaseModel(GenerationMixin):
|
||||
state_dict = load_layer_output
|
||||
|
||||
# pin memory:
|
||||
t = time.time()
|
||||
for k in state_dict.keys():
|
||||
state_dict[k].pin_memory()
|
||||
if self.prefetching:
|
||||
t = time.time()
|
||||
for k in state_dict.keys():
|
||||
state_dict[k].pin_memory()
|
||||
|
||||
elapsed_time = time.time() - t
|
||||
if self.profiling_mode:
|
||||
self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)
|
||||
elapsed_time = time.time() - t
|
||||
if self.profiling_mode:
|
||||
self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
|
||||
|
||||
setuptools.setup(
|
||||
name="airllm",
|
||||
version="2.6",
|
||||
version="2.6.1",
|
||||
author="Gavin Li",
|
||||
author_email="gavinli@animaai.cloud",
|
||||
description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",
|
||||
|
||||
Reference in New Issue
Block a user