Files
airllm/examples/inferrence.ipynb
2023-06-12 22:25:34 +00:00

7152 lines
378 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "8fd0667c-90fc-47e4-94bf-545d529c2b86",
"metadata": {},
"source": [
"# Make sure all dependencies are installed"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3300dc45-8915-476a-bc82-ea2e40ef0786",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting transformers@ git+https://github.com/huggingface/transformers.git\n",
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-64cefzsp/transformers_5f0472cc878d47368a5465679921455b\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-install-64cefzsp/transformers_5f0472cc878d47368a5465679921455b\n",
" Resolved https://github.com/huggingface/transformers.git to commit 70c79940957fb25b54bd1b106935c756b90345eb\n",
" Installing build dependencies ... \u001b[?25ldone\n",
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting peft@ git+https://github.com/huggingface/peft.git\n",
" Cloning https://github.com/huggingface/peft.git to /tmp/pip-install-64cefzsp/peft_3420f28ed7984da896c14ac8c804539f\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-install-64cefzsp/peft_3420f28ed7984da896c14ac8c804539f\n",
" Resolved https://github.com/huggingface/peft.git to commit 189a6b8e357ecda05ccde13999e4c35759596a67\n",
" Installing build dependencies ... \u001b[?25ldone\n",
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting accelerate@ git+https://github.com/huggingface/accelerate.git\n",
" Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-install-64cefzsp/accelerate_854acdbdc77b46b3ab84ac7a585aaf47\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-install-64cefzsp/accelerate_854acdbdc77b46b3ab84ac7a585aaf47\n",
" Resolved https://github.com/huggingface/accelerate.git to commit 543c59af224e3ea273633732319916b0698234ab\n",
" Installing build dependencies ... \u001b[?25ldone\n",
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting bitsandbytes==0.39.0\n",
" Downloading bitsandbytes-0.39.0-py3-none-any.whl (92.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.2/92.2 MB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting einops==0.6.1\n",
" Downloading einops-0.6.1-py3-none-any.whl (42 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.2/42.2 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting evaluate==0.4.0\n",
" Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting scikit-learn==1.2.2\n",
" Downloading scikit_learn-1.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.8/9.8 MB\u001b[0m \u001b[31m76.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting sentencepiece==0.1.99\n",
" Downloading sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m28.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
"\u001b[?25hCollecting wandb==0.15.3\n",
" Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m66.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting huggingface-hub>=0.7.0\n",
" Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m16.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting multiprocess\n",
" Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.0/132.0 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /home/ubuntu/.local/lib/python3.8/site-packages (from evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (2.28.2)\n",
"Collecting dill\n",
" Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting fsspec[http]>=2021.05.0\n",
" Downloading fsspec-2023.6.0-py3-none-any.whl (163 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.8/163.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /home/ubuntu/.local/lib/python3.8/site-packages (from evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (1.5.3)\n",
"Collecting responses<0.19\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
"Requirement already satisfied: packaging in /home/ubuntu/.local/lib/python3.8/site-packages (from evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (23.0)\n",
"Collecting datasets>=2.0.0\n",
" Downloading datasets-2.12.0-py3-none-any.whl (474 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m474.6/474.6 kB\u001b[0m \u001b[31m21.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /home/ubuntu/.local/lib/python3.8/site-packages (from evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (1.23.5)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /home/ubuntu/.local/lib/python3.8/site-packages (from evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (4.64.1)\n",
"Collecting xxhash\n",
" Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting threadpoolctl>=2.0.0\n",
" Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)\n",
"Requirement already satisfied: joblib>=1.1.1 in /home/ubuntu/.local/lib/python3.8/site-packages (from scikit-learn==1.2.2->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 7)) (1.2.0)\n",
"Requirement already satisfied: scipy>=1.3.2 in /home/ubuntu/.local/lib/python3.8/site-packages (from scikit-learn==1.2.2->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 7)) (1.9.3)\n",
"Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/lib/python3/dist-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (7.0)\n",
"Collecting GitPython!=3.1.29,>=1.0.0\n",
" Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m184.3/184.3 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: psutil>=5.0.0 in /usr/lib/python3/dist-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (5.5.1)\n",
"Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (45.2.0)\n",
"Requirement already satisfied: PyYAML in /usr/lib/python3/dist-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (5.3.1)\n",
"Collecting pathtools\n",
" Downloading pathtools-0.1.2.tar.gz (11 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting docker-pycreds>=0.4.0\n",
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
"Collecting setproctitle\n",
" Downloading setproctitle-1.3.2-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31 kB)\n",
"Requirement already satisfied: appdirs>=1.4.3 in /usr/lib/python3/dist-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (1.4.3)\n",
"Collecting sentry-sdk>=1.0.0\n",
" Downloading sentry_sdk-1.25.1-py2.py3-none-any.whl (206 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m206.7/206.7 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: typing-extensions in /home/ubuntu/.local/lib/python3.8/site-packages (from wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (4.5.0)\n",
"Collecting protobuf!=4.21.0,<5,>=3.12.0\n",
" Downloading protobuf-4.23.2-cp37-abi3-manylinux2014_x86_64.whl (304 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m19.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
" Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m86.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting safetensors>=0.3.1\n",
" Downloading safetensors-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m46.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting regex!=2019.12.17\n",
" Downloading regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m772.3/772.3 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: filelock in /usr/lib/python3/dist-packages (from transformers@ git+https://github.com/huggingface/transformers.git->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 2)) (3.0.12)\n",
"Requirement already satisfied: torch>=1.13.0 in /usr/lib/python3/dist-packages (from peft@ git+https://github.com/huggingface/peft.git->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 3)) (1.13.1)\n",
"Collecting pyarrow>=8.0.0\n",
" Downloading pyarrow-12.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (39.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.0/39.0 MB\u001b[0m \u001b[31m34.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting aiohttp\n",
" Downloading aiohttp-3.8.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: six>=1.4.0 in /usr/lib/python3/dist-packages (from docker-pycreds>=0.4.0->wandb==0.15.3->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 9)) (1.14.0)\n",
"Collecting gitdb<5,>=4.0.1\n",
" Downloading gitdb-4.0.10-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /home/ubuntu/.local/lib/python3.8/site-packages (from requests>=2.19.0->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (3.1.0)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/lib/python3/dist-packages (from requests>=2.19.0->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (1.25.8)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.19.0->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (2.8)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests>=2.19.0->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (2019.11.28)\n",
"Collecting urllib3<1.27,>=1.21.1\n",
" Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.1/143.1 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pytz>=2020.1 in /home/ubuntu/.local/lib/python3.8/site-packages (from pandas->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (2023.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.1 in /home/ubuntu/.local/lib/python3.8/site-packages (from pandas->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (2.8.2)\n",
"Collecting async-timeout<5.0,>=4.0.0a3\n",
" Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
"Collecting multidict<7.0,>=4.5\n",
" Downloading multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (121 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.3/121.3 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting aiosignal>=1.1.2\n",
" Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
"Collecting frozenlist>=1.1.1\n",
" Downloading frozenlist-1.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m161.3/161.3 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting yarl<2.0,>=1.0\n",
" Downloading yarl-1.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (266 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m266.9/266.9 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/lib/python3/dist-packages (from aiohttp->datasets>=2.0.0->evaluate==0.4.0->-r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true (line 6)) (19.3.0)\n",
"Collecting smmap<6,>=3.0.1\n",
" Downloading smmap-5.0.0-py3-none-any.whl (24 kB)\n",
"Building wheels for collected packages: transformers, peft, accelerate, pathtools\n",
" Building wheel for transformers (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for transformers: filename=transformers-4.31.0.dev0-py3-none-any.whl size=7169418 sha256=2ab57f4a80e84be409b8ac089caa1f8dda6061345a3ffde96ce8172d359176b5\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-1pw8o8hi/wheels/05/0a/97/64ae47c27ba95fae2cb5838e7b4b7247a34d4a8ba5f7092de2\n",
" Building wheel for peft (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for peft: filename=peft-0.4.0.dev0-py3-none-any.whl size=59308 sha256=690762039e60ad980188cea200591721fada0df97e18780532b25447e297355f\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-1pw8o8hi/wheels/95/fe/57/a484616f9bd99820cb946c7c3d2b1b492423b504356b0797dd\n",
" Building wheel for accelerate (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for accelerate: filename=accelerate-0.21.0.dev0-py3-none-any.whl size=228522 sha256=8525391652f2fa9403af3611633084eb5b68f1f21ff305ac7610c27d6043d461\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-1pw8o8hi/wheels/0e/3c/a4/a965507f9d132376a5e3c337ed615278a9afc049743353bd6b\n",
" Building wheel for pathtools (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for pathtools: filename=pathtools-0.1.2-py3-none-any.whl size=8784 sha256=1231e0a83f5bcd2facc1743c27caf735373b4bf959918840fe6f81b9ce2deefc\n",
" Stored in directory: /home/ubuntu/.cache/pip/wheels/4c/8e/7e/72fbc243e1aeecae64a96875432e70d4e92f3d2d18123be004\n",
"Successfully built transformers peft accelerate pathtools\n",
"Installing collected packages: tokenizers, sentencepiece, safetensors, pathtools, bitsandbytes, xxhash, urllib3, threadpoolctl, smmap, setproctitle, regex, pyarrow, protobuf, multidict, fsspec, frozenlist, einops, docker-pycreds, dill, async-timeout, accelerate, yarl, sentry-sdk, scikit-learn, multiprocess, gitdb, aiosignal, responses, huggingface-hub, GitPython, aiohttp, wandb, transformers, peft, datasets, evaluate\n",
"Successfully installed GitPython-3.1.31 accelerate-0.21.0.dev0 aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 bitsandbytes-0.39.0 datasets-2.12.0 dill-0.3.6 docker-pycreds-0.4.0 einops-0.6.1 evaluate-0.4.0 frozenlist-1.3.3 fsspec-2023.6.0 gitdb-4.0.10 huggingface-hub-0.15.1 multidict-6.0.4 multiprocess-0.70.14 pathtools-0.1.2 peft-0.4.0.dev0 protobuf-4.23.2 pyarrow-12.0.0 regex-2023.6.3 responses-0.18.0 safetensors-0.3.1 scikit-learn-1.2.2 sentencepiece-0.1.99 sentry-sdk-1.25.1 setproctitle-1.3.2 smmap-5.0.0 threadpoolctl-3.1.0 tokenizers-0.13.3 transformers-4.31.0.dev0 urllib3-1.26.16 wandb-0.15.3 xxhash-3.2.0 yarl-1.9.2\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip install -r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true\n"
]
},
{
"cell_type": "markdown",
"id": "9a342541-244f-45d2-a077-9c641628cd38",
"metadata": {},
"source": [
"# Import libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1692b132-9280-49ba-8c73-fe71f733f2a5",
"metadata": {},
"outputs": [],
"source": [
"# Workaround for the protobuf error quoted below; this cell applies workaround 2 (pure-Python implementation):\n",
"\n",
"#TypeError: Descriptors cannot not be created directly.\n",
"#If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.\n",
"#If you cannot immediately regenerate your protos, some other possible workarounds are:\n",
"# 1. Downgrade the protobuf package to 3.20.x or lower.\n",
"# 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).\n",
"\n",
"#More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates\n",
"\n",
"\n",
"import os\n",
"os.environ[\"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION\"] = \"python\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "96673ee5-4481-4070-8fae-6310c4138431",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please run\n",
"\n",
"python -m bitsandbytes\n",
"\n",
" and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n",
"bin /home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
"CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
"ERROR: /usr/bin/python3: undefined symbol: cudaRuntimeGetVersion\n",
"CUDA SETUP: libcudart.so path is None\n",
"CUDA SETUP: Is seems that your cuda installation is not in your path. See https://github.com/TimDettmers/bitsandbytes/issues/85 for more information.\n",
"CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 9.0\n",
"CUDA SETUP: Detected CUDA version 00\n",
"CUDA SETUP: Loading binary /home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
" warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n",
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/run/lambda-jupyter-lab.pid')}\n",
" warn(msg)\n",
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//ipykernel.pylab.backend_inline')}\n",
" warn(msg)\n",
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/cuda/lib64')}\n",
" warn(msg)\n",
"/home/ubuntu/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!\n",
" warn(msg)\n"
]
}
],
"source": [
"from peft import PeftModel\n",
"from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer\n",
"import torch"
]
},
{
"cell_type": "markdown",
"id": "76804d19-8a46-4550-91f2-61f671e312ff",
"metadata": {},
"source": [
"# Create the tokenizer and model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c81b9ff6-c627-4a8a-9161-7e177dbc4930",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e0907f2e60d24e53bb0adfb39e07c55a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b89c2ccff9d5467ab47c1f180ec868f0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)cial_tokens_map.json: 0%| | 0.00/289 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0bf37509fe72499eb074879a5b5badbb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)okenizer_config.json: 0%| | 0.00/715 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"base_model = \"timdettmers/guanaco-33b-merged\"\n",
"tokenizer = LlamaTokenizer.from_pretrained(base_model)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e4b4d6c8-dfeb-46e2-b6ed-d9ef525d44d1",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "67a7af9075af4e6b9d0a964df0a8deec",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/555 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b96b2b6b2d8045409538c939d637a89f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)model.bin.index.json: 0%| | 0.00/50.1k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5daad442b5a143bc913693413f65edfb",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading shards: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4cd9e08369964aa7bead2d818d0bce5e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00001-of-00007.bin: 0%| | 0.00/9.82G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "24fabe9e4d4b4380ab7ce25f1b73b9a9",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00002-of-00007.bin: 0%| | 0.00/9.96G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4b312fc6194e46dcbdbd23b2050267aa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00003-of-00007.bin: 0%| | 0.00/9.90G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9d4583b23c8e4f78aa87420f022815dc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00004-of-00007.bin: 0%| | 0.00/9.87G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "99249522f6c54846b449e35fc1babf7c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00005-of-00007.bin: 0%| | 0.00/9.87G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3ed218039f6e4dbfba2926125b1a9cea",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00006-of-00007.bin: 0%| | 0.00/9.96G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "96e3c8ce1e6140ee9eab8062385671bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)l-00007-of-00007.bin: 0%| | 0.00/5.69G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6965927d7c7b4127b942a31327986f5c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "80f7ed64ad794248a7674bd955c1596f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)neration_config.json: 0%| | 0.00/137 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# base model\n",
"model = LlamaForCausalLM.from_pretrained(\n",
" base_model,\n",
" torch_dtype=torch.float16,\n",
" device_map=\"auto\",\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "249c9ef2-ce90-4ce3-8b8c-2685839d837d",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bb1ccc9f54854c768a9c5aa2299378bc",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)/adapter_config.json: 0%| | 0.00/505 [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ee6648891b734f7cb016b37668019cad",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading adapter_model.bin: 0%| | 0.00/1.95G [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"PeftModelForCausalLM(\n",
" (base_model): LoraModel(\n",
" (model): LlamaForCausalLM(\n",
" (model): LlamaModel(\n",
" (embed_tokens): Embedding(32000, 6656, padding_idx=0)\n",
" (layers): ModuleList(\n",
" (0): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (1): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (2): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (3): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (4): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (5): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (6): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (7): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (8): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (9): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (10): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (11): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (12): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (13): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (14): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (15): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (16): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (17): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (18): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (19): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (20): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (21): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (22): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (23): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (24): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (25): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (26): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (27): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (28): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (29): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (30): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (31): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (32): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (33): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (34): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (35): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (36): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (37): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (38): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (39): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (40): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (41): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (42): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (43): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (44): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (45): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (46): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (47): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (48): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (49): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (50): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (51): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (52): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (53): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (54): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (55): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (56): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (57): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (58): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" (59): LlamaDecoderLayer(\n",
" (self_attn): LlamaAttention(\n",
" (q_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (k_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (v_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (o_proj): Linear(\n",
" in_features=6656, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (rotary_emb): LlamaRotaryEmbedding()\n",
" )\n",
" (mlp): LlamaMLP(\n",
" (gate_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (down_proj): Linear(\n",
" in_features=17920, out_features=6656, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=17920, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=6656, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (up_proj): Linear(\n",
" in_features=6656, out_features=17920, bias=False\n",
" (lora_dropout): ModuleDict(\n",
" (default): Identity()\n",
" )\n",
" (lora_A): ModuleDict(\n",
" (default): Linear(in_features=6656, out_features=64, bias=False)\n",
" )\n",
" (lora_B): ModuleDict(\n",
" (default): Linear(in_features=64, out_features=17920, bias=False)\n",
" )\n",
" (lora_embedding_A): ParameterDict()\n",
" (lora_embedding_B): ParameterDict()\n",
" )\n",
" (act_fn): SiLUActivation()\n",
" )\n",
" (input_layernorm): LlamaRMSNorm()\n",
" (post_attention_layernorm): LlamaRMSNorm()\n",
" )\n",
" )\n",
" (norm): LlamaRMSNorm()\n",
" )\n",
" (lm_head): Linear(in_features=6656, out_features=32000, bias=False)\n",
" )\n",
" )\n",
")"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Wrap the already-loaded base model with the LoRA (PEFT) adapter weights.\n",
"adapter_model = \"lyogavin/Anima33B\"\n",
"\n",
"model = PeftModel.from_pretrained(\n",
"    model,\n",
"    adapter_model,\n",
"    # torch_dtype=torch.float16,  # optional dtype override, left disabled\n",
")\n",
"\n",
"# Switch to evaluation mode. The trailing `;` stops the notebook from echoing the\n",
"# module repr returned by `eval()`, which is thousands of lines long for this model.\n",
"model.eval();"
]
},
{
"cell_type": "markdown",
"id": "73afb823-9b6f-4ebc-a097-d5606ab2095d",
"metadata": {},
"source": [
"# generate"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1e14bee3-61f0-4bd1-b7b5-b9a4a543322c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/ubuntu/.local/lib/python3.8/site-packages/transformers/generation/utils.py:1452: UserWarning: You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on cpu, whereas the model is on cuda. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('cuda') before running `.generate()`.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"中国的首都是哪里?\n",
"中国的首都是北京。\n",
"北京位于中国北部,是中国历史悠\n"
]
}
],
"source": [
"# Prompt (Chinese): \"Where is the capital of China?\"\n",
"prompt = \"中国的首都是哪里?\"\n",
"\n",
"# Tokenize and move the tensors onto the model's device. Leaving `input_ids` on the\n",
"# CPU while the model is on CUDA makes `generate()` emit a device-mismatch UserWarning.\n",
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"\n",
"# Generate up to 30 new tokens, then decode the first sequence of the batch.\n",
"generate_ids = model.generate(**inputs, max_new_tokens=30)\n",
"print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])\n",
"\n",
"# output: '中国的首都是哪里?\\n中国的首都是北京。\\n北京位于中国北部,是中国历史悠'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5bd05a7-243d-4264-9e15-5d4ae32f5ed4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}