diff --git a/air_llm/airllm/airllm.py b/air_llm/airllm/airllm.py
index 685f21e..de39943 100644
--- a/air_llm/airllm/airllm.py
+++ b/air_llm/airllm/airllm.py
@@ -130,9 +130,9 @@ class AirLLMLlama2(GenerationMixin):
 
     def load_layer_to_cpu(self, layer_name, profiling=False):
 
-        t = time.process_time()
+        t = time.time()
         load_layer_output = load_layer(self.checkpoint_path, layer_name, profiling)
-        elapsed_time = time.process_time() - t
+        elapsed_time = time.time() - t
 
         if profiling:
             state_dict, compression_time = load_layer_output
@@ -216,6 +216,7 @@ class AirLLMLlama2(GenerationMixin):
             total_gpu_loading_time = []
             total_compression_overhead_time = []
             forward_start = time.process_time()
+            forward_start_wall = time.time()
 
         # Reboot the model to make sure buffers are loaded and memory is clean
         del self.model
@@ -253,11 +254,11 @@ class AirLLMLlama2(GenerationMixin):
                 else:
                     state_dict = load_layer_to_cpu_output
 
-                t = time.process_time()
+                t = time.time()
                 self.move_layer_to_device(state_dict)
                 if self.profiling_mode:
                     torch.cuda.synchronize()
-                elapsed_time = time.process_time() - t
+                elapsed_time = time.time() - t
                 # profile
                 if self.profiling_mode:
                     total_gpu_loading_time.append(elapsed_time)
@@ -366,6 +367,7 @@ class AirLLMLlama2(GenerationMixin):
                                      tuple(all_self_attns) if all_self_attns is not None else None] if v is not None)
         if self.profiling_mode:
             forward_elapsed_time = time.process_time() - forward_start
+            forward_elapsed_time_wall = time.time() - forward_start_wall
             if self.compression:
                 print(f"total disk loading time: {sum(total_disk_loading_time):.04f}")
                 print(f"total gpu loading time: {sum(total_gpu_loading_time):.04f}")
@@ -376,7 +378,8 @@ class AirLLMLlama2(GenerationMixin):
                 #print(f"total disk loading time: {sum(total_disk_loading_time):.04f}")
                 #print(f"total gpu loading time: {sum(total_gpu_loading_time):.04f}")
 
-            print(f"total infer time(including all above plus gpu compute): {forward_elapsed_time:.04f}")
+            print(f"total infer process time(including all above plus gpu compute): {forward_elapsed_time:.04f}")
+            print(f"total infer wall time(including all above plus gpu compute): {forward_elapsed_time_wall:.04f}")
 
             total_disk_loading_time = []
             total_gpu_loading_time = []
diff --git a/air_llm/setup.py b/air_llm/setup.py
index d1c439a..7243cde 100644
--- a/air_llm/setup.py
+++ b/air_llm/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r") as fh:
 
 setuptools.setup(
     name="airllm",
-    version="2.4.3",
+    version="2.4.4",
     author="Gavin Li",
     author_email="gavinli@animaai.cloud",
     description="AirLLM allows single 4GB GPU card to run 70B large language models without quantization, distillation or pruning.",