diff --git a/air_llm/examples/run_llama3.1_405B.ipynb b/air_llm/examples/run_llama3.1_405B.ipynb
new file mode 100644
index 0000000..f4d8c6b
--- /dev/null
+++ b/air_llm/examples/run_llama3.1_405B.ipynb
@@ -0,0 +1,71 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "bfd29d17-9756-464f-b692-41ff20f41148",
+   "metadata": {},
+   "source": [
+    "# Run Llama 3.1 405B Instruct with AirLLM\n",
+    "\n",
+    "Load the 4-bit quantized 405B model with `airllm` and generate a short completion. Requires a CUDA-capable GPU."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e07099a-03b9-49da-b0f8-c473b7f449eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from airllm import AutoModel\n",
+    "\n",
+    "MAX_LENGTH = 128\n",
+    "# could use hugging face model repo id:\n",
+    "model = AutoModel.from_pretrained(\"unsloth/Meta-Llama-3.1-405B-Instruct-bnb-4bit\")\n",
+    "\n",
+    "input_text = [\n",
+    "    'What is the capital of United States?',\n",
+    "]\n",
+    "\n",
+    "input_tokens = model.tokenizer(input_text,\n",
+    "                               return_tensors=\"pt\",\n",
+    "                               return_attention_mask=False,\n",
+    "                               truncation=True,\n",
+    "                               max_length=MAX_LENGTH,\n",
+    "                               padding=False)\n",
+    "\n",
+    "generation_output = model.generate(\n",
+    "    input_tokens['input_ids'].cuda(),\n",
+    "    max_new_tokens=10,\n",
+    "    use_cache=True,\n",
+    "    return_dict_in_generate=True)\n",
+    "\n",
+    "# skip_special_tokens drops markers like <|begin_of_text|> from the printed answer\n",
+    "output = model.tokenizer.decode(generation_output.sequences[0], skip_special_tokens=True)\n",
+    "\n",
+    "print(output)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}