fix compression

This commit is contained in:
Yu Li
2023-12-20 10:55:57 -06:00
parent 7948044c9f
commit ed58786755
2 changed files with 13 additions and 7 deletions

View File

@@ -143,6 +143,11 @@ class AirLLMBaseModel(GenerationMixin):
# model weights prefetch cuda stream
self.prefetching = prefetching
if self.compression is not None:
self.prefetching = False
print(f"not support prefetching for compression for now. loading with no prepetching mode.")
if prefetching:
self.stream = torch.cuda.Stream()
else:
@@ -265,13 +270,14 @@ class AirLLMBaseModel(GenerationMixin):
state_dict = load_layer_output
# pin memory:
t = time.time()
for k in state_dict.keys():
state_dict[k].pin_memory()
if self.prefetching:
t = time.time()
for k in state_dict.keys():
state_dict[k].pin_memory()
elapsed_time = time.time() - t
if self.profiling_mode:
self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)
elapsed_time = time.time() - t
if self.profiling_mode:
self.profiler.add_profiling_time('pin_memory_to_trigger_load', elapsed_time)
return state_dict

View File

@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
setuptools.setup(
name="airllm",
version="2.6",
version="2.6.1",
author="Gavin Li",
author_email="gavinli@animaai.cloud",
description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",