From 58e38734314e80e32d4bd07d1a1e41fce99a87f9 Mon Sep 17 00:00:00 2001
From: Yu Li <lyo.gavin@gmail.com>
Date: Sun, 3 Dec 2023 18:21:23 -0600
Subject: [PATCH] Comment out debug prints in AirLLMQWen; bump version to 2.3.1

---
 air_llm/airllm/airllm_qwen.py | 8 ++++----
 air_llm/setup.py              | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/air_llm/airllm/airllm_qwen.py b/air_llm/airllm/airllm_qwen.py
index 51cd037..189c811 100644
--- a/air_llm/airllm/airllm_qwen.py
+++ b/air_llm/airllm/airllm_qwen.py
@@ -205,7 +205,7 @@ class AirLLMQWen(GenerationMixin):
             output_hidden_states: Optional[bool] = None,
             return_dict: Optional[bool] = None,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-        print(f"input_ids shape: {input_ids.shape}")
+        #print(f"input_ids shape: {input_ids.shape}")
 
         global total_disk_loading_time, total_gpu_loading_time, total_compression_overhead_time
 
@@ -222,7 +222,7 @@ class AirLLMQWen(GenerationMixin):
 
         batch = [input_ids_unit.to(self.running_device).unsqueeze(0) for input_ids_unit in input_ids]
         n_seq = len(batch[0])
-        print(f"batch[0] shape:{batch[0].shape}")
+        #print(f"batch[0] shape:{batch[0].shape}")
         #batch_eos = [(input_ids_unit != self.tokenizer.pad_token_id).sum(0) - 1 for input_ids_unit in input_ids]
 
         # Create attention mask for the largest input, and position ids to use KV cache
@@ -242,7 +242,7 @@ class AirLLMQWen(GenerationMixin):
 
             for i, (layer_name, layer) in tqdm(enumerate(zip(self.layer_names, self.layers)), desc=self.running_device,
                                                total=len(self.layers)):
-                print(f"layer:{i} {layer_name}")
+                #print(f"layer:{i} {layer_name}")
 
                 load_layer_to_cpu_output = self.load_layer_to_cpu(layer_name, self.profiling_mode)
                 # profile
@@ -382,7 +382,7 @@ class AirLLMQWen(GenerationMixin):
             for i in range(len(kv_cache_list)):
                 # print(f"{i} - {kv_cache_list[i][0].shape}")
                 kv_cache_list[i] = (torch.cat(kv_cache_list[i][0], 0), torch.cat(kv_cache_list[i][1], 0))
-            print(f"returning kvcache size: {kv_cache_list[0][0].shape}")
+            #print(f"returning kvcache size: {kv_cache_list[0][0].shape}")
 
         if output_attentions:
             all_self_attns = all_self_attns[0:-2]
diff --git a/air_llm/setup.py b/air_llm/setup.py
index 2ff2be0..f8f1ab7 100644
--- a/air_llm/setup.py
+++ b/air_llm/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
 
 setuptools.setup(
     name="airllm",
-    version="2.3.0",
+    version="2.3.1",
     author="Gavin Li",
     author_email="gavinli@animaai.cloud",
     description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",