Update huggingface.py

Added quantization support using bitsandbytes
2026-03-06 22:03:39 +00:00 · 2023-10-05 11:03:51 +05:30
parent 6c3ed5e533
commit 5b4e517d9d
1 changed file with 17 additions and 5 deletions
--- a/application/llm/huggingface.py
+++ b/application/llm/huggingface.py
@@ -2,13 +2,25 @@ from application.llm.base import BaseLLM

 class HuggingFaceLLM(BaseLLM):

-    def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B'):
+    def __init__(self, api_key, llm_name='Arc53/DocsGPT-7B',q=False):
        global hf
-
+        
        from langchain.llms import HuggingFacePipeline
-        from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-        tokenizer = AutoTokenizer.from_pretrained(llm_name)
-        model = AutoModelForCausalLM.from_pretrained(llm_name)
+        if q:
+            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
+            tokenizer = AutoTokenizer.from_pretrained(llm_name)
+            bnb_config = BitsAndBytesConfig(
+                            load_in_4bit=True,
+                            bnb_4bit_use_double_quant=True,
+                            bnb_4bit_quant_type="nf4",
+                            bnb_4bit_compute_dtype=torch.bfloat16
+                        )
+            model = AutoModelForCausalLM.from_pretrained(llm_name,quantization_config=bnb_config)
+        else:
+            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+            tokenizer = AutoTokenizer.from_pretrained(llm_name)
+            model = AutoModelForCausalLM.from_pretrained(llm_name)
+        
        pipe = pipeline(
            "text-generation", model=model,
            tokenizer=tokenizer, max_new_tokens=2000,