code open source, update README

This commit is contained in:
Yu Li
2023-06-12 15:07:29 -05:00
parent a7935feb6b
commit b3ca4eb56c
5 changed files with 990 additions and 0 deletions

View File

@@ -36,6 +36,18 @@
* LoRA r=64, alpha=16 ([QLoRA](https://arxiv.org/abs/2305.14314) Appendix B.2)
* source_max_len=512, target_max_len=512:需要保证大部分的training sample没有truncate,能完整的把信息训练到模型中。根据脚本(test_cn_dataset_lenghts.py)中的估计,512大概可以覆盖大部分的样本长度。
#### 如何训练
使用以下步骤可以重现Anima 33B模型
# 1. install dependencies
pip install -r requirements.txt
# 2. run the training script
cd training
./run_Amina_training.sh
## 验证评估
#### Elo rating tournament结论
@@ -53,6 +65,52 @@
* **评估方法**: 为了平衡成本,我们主要采用GPT4进行评估。如[QLoRA](https://arxiv.org/abs/2305.14314)论证,单纯GPT4打分进行模型的对比,随机波动性较大。这与我们的观察一致。因此采用了[QLoRA](https://arxiv.org/abs/2305.14314)推荐的、现在比较普遍采用的Elo Rating tournament评测方法。
* **超参选择**: 出于成本考虑,我们选择300轮随机评估,随机选择模型PK的先后顺序以抵消先后顺序的影响,随机种子为42。Elo rating的实现代码和其他超参参照[Vicuna的Elo代码](https://raw.githubusercontent.com/lm-sys/FastChat/833d65032a715240a3978f4a8f08e7a496c83cb1/fastchat/serve/monitor/elo_analysis.py): K=32, init rating=1000。
# 如何Inference
首先保证依赖都已经安装:
pip install -r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true
可以参考:[inferrence.ipynb]
或者使用如下代码:
# imports
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
import torch

# base model (must be defined before the tokenizer, which is loaded from it)
base_model = "timdettmers/guanaco-33b-merged"

# create tokenizer
tokenizer = LlamaTokenizer.from_pretrained(base_model)

model = LlamaForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# LORA PEFT adapters
adapter_model ="/home/ubuntu/cloudfs/saved_models/qlora_cn/output_1686031465/checkpoint-10000/adapter_model"
model = PeftModel.from_pretrained(
    model,
    adapter_model,
    #torch_dtype=torch.float16,
)
model.eval()

# prompt
prompt = "中国的首都是哪里?"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
# output: '中国的首都是哪里?\n中国的首都是北京。\n北京位于中国北部是中国历史悠'
## Who We Are?
此工作来自于[艾写科技](https://aicompose.cn/about)。我们团队来自于硅谷,有多年中、美大厂的一线AI工作经验。

9
requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
bitsandbytes==0.39.0
transformers @ git+https://github.com/huggingface/transformers.git
peft @ git+https://github.com/huggingface/peft.git
accelerate @ git+https://github.com/huggingface/accelerate.git
einops==0.6.1
evaluate==0.4.0
scikit-learn==1.2.2
sentencepiece==0.1.99
wandb==0.15.3

View File

@@ -0,0 +1,26 @@
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
# Estimate how many tokens the prompts (sources) and answers (targets) of the
# Chinese training set occupy, and print selected quantiles of both lengths.
model_id = "timdettmers/guanaco-33b-merged"
tokenizer = AutoTokenizer.from_pretrained(model_id)
ds = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")

# Alpaca-style prompt with an input section; filled from each record's fields.
source_template = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
)


def _token_lengths(x):
    """Return the tokenized length of the rendered prompt and of the output."""
    source_ids = tokenizer.encode(source_template.format(**x))
    target_ids = tokenizer.encode(x['output'])
    return {'source_length': len(source_ids), 'target_length': len(target_ids)}


ds = ds.map(_token_lengths)
df = ds["train"].to_pandas()

for qt in [0.8, 0.85, 0.9, 0.95, 0.98]:
    source_quantile = df['source_length'].quantile(qt)
    target_quantile = df['target_length'].quantile(qt)
    print(f"source len @qt{qt}: {source_quantile}")
    print(f"target len @qt{qt}: {target_quantile}")

847
training/qlora.py Normal file
View File

@@ -0,0 +1,847 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import bitsandbytes as bnb
import pandas as pd
import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
set_seed,
Seq2SeqTrainer,
BitsAndBytesConfig,
LlamaTokenizer
)
from datasets import load_dataset, Dataset
import evaluate
from peft import (
prepare_model_for_kbit_training,
LoraConfig,
get_peft_model,
PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
# Allow TF32 for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

# Log records are written both to a file in the working directory and to stdout.
logging_file_path = "./qlora_logs.log"
handlers = [logging.FileHandler(logging_file_path), logging.StreamHandler(sys.stdout)]
logging.basicConfig(
    handlers=handlers,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Label value used for positions that must not contribute to the loss.
IGNORE_INDEX = -100
# Pad token added to tokenizers that do not define one.
DEFAULT_PAD_TOKEN = "[PAD]"
@dataclass
class ModelArguments:
    """Command-line options that select the base model to load."""

    # Model identifier or path handed to `from_pretrained`.
    model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-12b")
    # Forwarded as `trust_remote_code` when the model is loaded.
    trust_remote_code: Optional[bool] = field(
        metadata={"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."},
        default=False,
    )
@dataclass
class DataArguments:
    """Command-line options describing the dataset and sequence-length limits."""

    # Number of examples split off for validation when no eval split exists.
    eval_dataset_size: int = field(
        metadata={"help": "Size of validation dataset."},
        default=1024,
    )
    # Optional caps on the number of train / eval examples.
    max_train_samples: Optional[int] = field(
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this value if set."
        },
        default=None,
    )
    max_eval_samples: Optional[int] = field(
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set."
        },
        default=None,
    )
    # Token limits applied separately to the prompt and to the response.
    source_max_len: int = field(
        metadata={"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
        default=1024,
    )
    target_max_len: int = field(
        metadata={"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
        default=256,
    )
    # Dataset name (or local path) and the record format used to interpret it.
    dataset: str = field(
        metadata={"help": "Which dataset to finetune on. See datamodule for options."},
        default='alpaca',
    )
    dataset_format: Optional[str] = field(
        metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"},
        default=None,
    )
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    """Training options: Seq2SeqTrainingArguments plus QLoRA/MMLU settings and changed defaults."""

    cache_dir: Optional[str] = field(default=None)
    train_on_source: Optional[bool] = field(
        metadata={"help": "Whether to train on the input in addition to the target text."},
        default=False,
    )

    # --- MMLU evaluation ---
    mmlu_split: Optional[str] = field(
        metadata={"help": "The MMLU split to run on"},
        default='eval',
    )
    mmlu_dataset: Optional[str] = field(
        metadata={"help": "MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot."},
        default='mmlu-fs',
    )
    do_mmlu_eval: Optional[bool] = field(
        metadata={"help": "Whether to run the MMLU evaluation."},
        default=False,
    )
    max_mmlu_samples: Optional[int] = field(
        metadata={"help": "If set, only evaluates on `max_mmlu_samples` of the MMMLU dataset."},
        default=None,
    )
    mmlu_source_max_len: int = field(
        metadata={"help": "Maximum source sequence length for mmlu."},
        default=2048,
    )

    # --- Fine-tuning mode, quantization and LoRA ---
    full_finetune: bool = field(
        metadata={"help": "Finetune the entire model without adapters."},
        default=False,
    )
    adam8bit: bool = field(
        metadata={"help": "Use 8-bit adam."},
        default=False,
    )
    double_quant: bool = field(
        metadata={"help": "Compress the quantization statistics through double quantization."},
        default=True,
    )
    quant_type: str = field(
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."},
        default="nf4",
    )
    bits: int = field(
        metadata={"help": "How many bits to use."},
        default=4,
    )
    lora_r: int = field(
        metadata={"help": "Lora R dimension."},
        default=64,
    )
    lora_alpha: float = field(
        metadata={"help": " Lora alpha."},
        default=16,
    )
    lora_dropout: float = field(
        metadata={"help":"Lora dropout."},
        default=0.0,
    )
    max_memory_MB: int = field(
        metadata={"help": "Free memory per gpu."},
        default=80000,
    )

    # --- Declared with project-specific defaults ---
    report_to: str = field(
        metadata={"help": "To use wandb or something else for reporting."},
        default='none',
    )
    output_dir: str = field(
        metadata={"help": 'The output dir for logs and checkpoints'},
        default='./output',
    )
    optim: str = field(
        metadata={"help": 'The optimizer to be used'},
        default='paged_adamw_32bit',
    )
    per_device_train_batch_size: int = field(
        metadata={"help": 'The training batch size per GPU. Increase for better speed.'},
        default=1,
    )
    gradient_accumulation_steps: int = field(
        metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'},
        default=16,
    )
    max_steps: int = field(
        metadata={"help": 'How many optimizer update steps to take'},
        default=10000,
    )
    # Use lora dropout instead for regularization if needed.
    weight_decay: float = field(
        metadata={"help": 'The L2 weight decay rate of AdamW'},
        default=0.0,
    )
    learning_rate: float = field(
        metadata={"help": 'The learnign rate'},
        default=0.0002,
    )
    remove_unused_columns: bool = field(
        metadata={"help": 'Removed unused columns. Needed to make this codebase work.'},
        default=False,
    )
    max_grad_norm: float = field(
        metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'},
        default=0.3,
    )
    gradient_checkpointing: bool = field(
        metadata={"help": 'Use gradient checkpointing. You want to use this.'},
        default=True,
    )
    do_train: bool = field(
        metadata={"help": 'To train or not to train, that is the question?'},
        default=True,
    )
    lr_scheduler_type: str = field(
        metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'},
        default='constant',
    )
    warmup_ratio: float = field(
        metadata={"help": 'Fraction of steps to do a warmup for'},
        default=0.03,
    )
    logging_steps: int = field(
        metadata={"help": 'The frequency of update steps after which to log the loss'},
        default=10,
    )
    group_by_length: bool = field(
        metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'},
        default=True,
    )
    save_strategy: str = field(
        metadata={"help": 'When to save checkpoints'},
        default='steps',
    )
    save_steps: int = field(
        metadata={"help": 'How often to save a model'},
        default=250,
    )
    save_total_limit: int = field(
        metadata={"help": 'How many checkpoints to save before the oldest is overwritten'},
        default=40,
    )

    # --- Project-specific switches ---
    sample_generate: bool = field(
        metadata={"help": 'If do sample generation on evaluation.'},
        default=False,
    )
    debug_mode: bool = field(
        metadata={"help": 'debug mode sample 200 train/eval samples for validation'},
        default=False,
    )
@dataclass
class GenerationArguments:
    """Generation hyperparameters; `train` passes them to `transformers.GenerationConfig`.

    For more hyperparameters check:
    https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    """

    # Length arguments
    max_new_tokens: Optional[int] = field(
        metadata={
            "help": "Maximum number of new tokens to be generated in evaluation or prediction loopsif predict_with_generate is set."
        },
        default=256,
    )
    min_new_tokens : Optional[int] = field(
        metadata={"help": "Minimum number of new tokens to generate."},
        default=None,
    )

    # Generation strategy
    do_sample: Optional[bool] = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    penalty_alpha: Optional[float] = field(default=None)
    use_cache: Optional[bool] = field(default=True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default=1.0)
    top_k: Optional[int] = field(default=50)
    top_p: Optional[float] = field(default=1.0)
    typical_p: Optional[float] = field(default=1.0)
    diversity_penalty: Optional[float] = field(default=0.0)
    repetition_penalty: Optional[float] = field(default=1.0)
    length_penalty: Optional[float] = field(default=1.0)
    no_repeat_ngram_size: Optional[int] = field(default=0)
def find_all_linear_names(args, model):
    """Return the distinct leaf names of the model's linear layers, excluding `lm_head`.

    The linear class searched for depends on `args.bits`: the bitsandbytes
    4-bit or 8-bit layer for 4 / 8, otherwise `torch.nn.Linear`.
    """
    if args.bits == 4:
        cls = bnb.nn.Linear4bit
    elif args.bits == 8:
        cls = bnb.nn.Linear8bitLt
    else:
        cls = torch.nn.Linear

    # Only the last component of the dotted module path is kept.
    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, cls)
    }
    # needed for 16-bit
    lora_module_names.discard('lm_head')
    return list(lora_module_names)
class SampleGenerateCallback(transformers.TrainerCallback):
    """Trainer callback that logs model completions for a few fixed prompts on every evaluation."""

    def on_evaluate(self, args, state, control, **kwargs):
        """Generate up to 70 new tokens for each sample prompt and log the decoded result."""
        logger.info("on_evaluate in SampleGenerateCallback...")
        sample_inputs = [
            '用一句话描述地球为什么是独一无二的。',
            '中国是否应该推出刺激政策救楼市?',
            '如何更好地融入新工作圈子'
        ]
        if "model" not in kwargs:
            logger.info("model not found in kwargs, skipping")
            return

        # Instruction-only prompt layout.
        prompt_template = (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{sample_input}\n\n### Response: "
        )
        for sample_input in sample_inputs:
            tokenizer = kwargs['tokenizer']
            inputs = prompt_template.format(sample_input=sample_input)
            logger.info(f"sample input: {inputs}")
            model = kwargs['model']
            encoded = tokenizer(inputs, return_tensors="pt")
            input_ids = encoded['input_ids'].to('cuda')
            generation_output = model.generate(input_ids=input_ids, max_new_tokens=70)
            decoded = tokenizer.decode(generation_output[0])
            logger.info(f"sample output: {decoded}")
class SavePeftModelCallback(transformers.TrainerCallback):
    """Trainer callback that saves the model via `save_pretrained` into an `adapter_model` folder."""

    def save_model(self, args, state, kwargs):
        """Save `kwargs["model"]` under the checkpoint folder and delete any `pytorch_model.bin` there."""
        logger.info('Saving PEFT checkpoint...')
        best_checkpoint = state.best_model_checkpoint
        if best_checkpoint is None:
            checkpoint_folder = os.path.join(
                args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
            )
        else:
            checkpoint_folder = os.path.join(best_checkpoint, "adapter_model")

        kwargs["model"].save_pretrained(os.path.join(checkpoint_folder, "adapter_model"))

        # Remove the full-weights file if one exists in the checkpoint folder.
        full_weights_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(full_weights_path):
            os.remove(full_weights_path)

    def on_save(self, args, state, control, **kwargs):
        """Save the adapter whenever the trainer writes a checkpoint."""
        self.save_model(args, state, kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        """Create/refresh the `completed` marker file in the output dir, then save the adapter."""
        marker_path = join(args.output_dir, 'completed')
        with open(marker_path, 'a'):
            os.utime(marker_path, None)
        self.save_model(args, state, kwargs)
def get_accelerate_model(args, checkpoint_dir):
    """Load the base causal LM (optionally quantized) and attach LoRA adapters.

    Args:
        args: merged namespace of model/data/training arguments; this function
            reads, among others, `bits`, `fp16`, `bf16`, `full_finetune`,
            `gradient_checkpointing`, the `lora_*` values and `max_memory_MB`.
        checkpoint_dir: directory of a previous checkpoint containing an
            `adapter_model` sub-folder, or None to create fresh LoRA modules.

    Returns:
        The prepared model (a PEFT model unless `args.full_finetune` is set).
    """
    # Give every visible GPU the same memory budget.
    n_gpus = torch.cuda.device_count()
    max_memory = f'{args.max_memory_MB}MB'
    max_memory = {i: max_memory for i in range(n_gpus)}

    # Full fine-tuning is only allowed with 16- or 32-bit weights.
    if args.full_finetune: assert args.bits in [16, 32]

    logger.info(f'loading base model {args.model_name_or_path}...')
    compute_dtype = (torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        load_in_4bit=args.bits == 4,
        load_in_8bit=args.bits == 8,
        device_map='auto',
        max_memory=max_memory,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=args.bits == 4,
            load_in_8bit=args.bits == 8,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.double_quant,
            bnb_4bit_quant_type=args.quant_type
        ),
        # Note: with --fp16 the weights are requested in float32, not float16.
        torch_dtype=(torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32)),
        trust_remote_code=args.trust_remote_code,
    )
    # Suggest --bf16 when running 4-bit with fp16 compute on a GPU of compute capability >= 8.
    if compute_dtype == torch.float16 and args.bits == 4:
        major, minor = torch.cuda.get_device_capability()
        if major >= 8:
            logger.info('='*80)
            logger.info('Your GPU supports bfloat16, you can accelerate training with the argument --bf16')
            logger.info('='*80)

    # Flag the model as parallelized.
    # NOTE(review): presumably stops the Trainer from wrapping it in DataParallel — confirm.
    setattr(model, 'model_parallel', True)
    setattr(model, 'is_parallelizable', True)

    model.config.torch_dtype=(torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))

    if not args.full_finetune:
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    if not args.full_finetune:
        if checkpoint_dir is not None:
            # Resume: reload the previously saved adapter weights as trainable.
            logger.info("Loading adapters from checkpoint.")
            model = PeftModel.from_pretrained(model, join(checkpoint_dir, 'adapter_model'), is_trainable=True)
        else:
            # Fresh run: add LoRA to every linear layer name found in the model.
            logger.info(f'adding LoRA modules...')
            modules = find_all_linear_names(args, model)
            config = LoraConfig(
                r=args.lora_r,
                lora_alpha=args.lora_alpha,
                target_modules=modules,
                lora_dropout=args.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            model = get_peft_model(model, config)

    # Per-module dtype adjustments: LoRA layers to bf16 when --bf16 is set, modules
    # whose name contains 'norm' to fp32, and fp32 lm_head/embed_tokens to bf16 when --bf16 is set.
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if args.bf16:
                module = module.to(torch.bfloat16)
        if 'norm' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'embed_tokens' in name:
            if hasattr(module, 'weight'):
                if args.bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)
    return model
def print_trainable_parameters(args, model):
    """
    Log how many parameters are trainable, how many exist in total, and the ratio in percent.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    # The trainable count is halved in 4-bit mode.
    if args.bits == 4:
        trainable_params /= 2
    percentage = 100 * trainable_params / all_param
    logger.info(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable: {percentage}"
    )
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens, resize the model's embeddings, and initialise the new rows.

    Each new input/output embedding row is set to the mean of the rows that
    precede the new tokens.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if not num_new_tokens > 0:
        return

    for embedding in (model.get_input_embeddings(), model.get_output_embeddings()):
        weights = embedding.weight.data
        # The mean is taken over the old rows only; the new rows are the last ones.
        weights[-num_new_tokens:] = weights[:-num_new_tokens].mean(dim=0, keepdim=True)
@dataclass
class DataCollatorForCausalLM(object):
    """Collate `{input, output}` examples into padded causal-LM batches."""

    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Tokenize, concatenate and pad a batch.

        Returns a dict with `input_ids` and `attention_mask`, plus `labels`
        unless `predict_with_generate` is set. Source positions in the labels
        are IGNORE_INDEX unless `train_on_source` is set.
        """
        bos_token = self.tokenizer.bos_token
        eos_token = self.tokenizer.eos_token
        # BOS is prepended to the prompt and EOS appended to the response as
        # text, so the tokenizer is told not to add special tokens itself.
        sources = [f"{bos_token}{example['input']}" for example in instances]
        targets = [f"{example['output']}{eos_token}" for example in instances]

        source_ids_batch = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )['input_ids']
        target_ids_batch = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )['input_ids']

        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for source_ids, target_ids in zip(source_ids_batch, target_ids_batch):
            if self.predict_with_generate:
                # Generation only needs the prompt.
                input_ids.append(torch.tensor(source_ids))
                continue
            input_ids.append(torch.tensor(source_ids + target_ids))
            if self.train_on_source:
                label_ids = source_ids + target_ids
            else:
                label_ids = [IGNORE_INDEX] * len(source_ids) + list(target_ids)
            labels.append(torch.tensor(label_ids))

        # Apply padding
        pad_token_id = self.tokenizer.pad_token_id
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_token_id)
        data_dict = {
            'input_ids': input_ids,
            'attention_mask': input_ids.ne(pad_token_id),
        }
        if not self.predict_with_generate:
            data_dict['labels'] = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return data_dict
def extract_unnatural_instructions_data(examples, extract_reformulations=False):
    """Flatten a batch of unnatural-instructions records into `input`/`output` lists.

    All entries of `examples['instances']` come first; when
    `extract_reformulations` is true, the entries of every non-None item of
    `examples['reformulations']` follow.
    """
    groups = list(examples['instances'])
    if extract_reformulations:
        groups += [
            reformulations
            for reformulations in examples['reformulations']
            if reformulations is not None
        ]

    flattened = [instance for group in groups for instance in group]
    return {
        'input': [instance['instruction_with_input'] for instance in flattened],
        'output': [instance['output'] for instance in flattened],
    }
# Alpaca prompt layouts: one for records that carry an `input`, one for
# instruction-only records.
ALPACA_PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response: "
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response: "
    ),
}


def extract_alpaca_dataset(example):
    """Render an Alpaca record into its prompt, returned as `{'input': prompt}`.

    The with-input layout is used unless the record's `input` is missing or
    the empty string.
    """
    has_input = example.get("input", "") != ""
    template_key = "prompt_input" if has_input else "prompt_no_input"
    return {'input': ALPACA_PROMPT_DICT[template_key].format(**example)}
def local_dataset(dataset_name):
    """Load a local dataset file and split it 90/10 into train and test.

    Args:
        dataset_name: path to a `.json`, `.jsonl`, `.csv` or `.tsv` file.

    Returns:
        The result of `train_test_split(test_size=0.1)` on the loaded dataset.

    Raises:
        ValueError: if the file extension is not one of the supported ones.
    """
    if dataset_name.endswith(('.json', '.jsonl')):
        # `Dataset.from_json` takes the path as `path_or_paths`; the previous
        # `.jsonl` call passed `filename=`/`format=` instead and raised TypeError.
        full_dataset = Dataset.from_json(path_or_paths=dataset_name)
    elif dataset_name.endswith('.csv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
    elif dataset_name.endswith('.tsv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
    else:
        raise ValueError(f"Unsupported dataset format: {dataset_name}")

    split_dataset = full_dataset.train_test_split(test_size=0.1)
    return split_dataset
def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
    """
    Make dataset and collator for supervised fine-tuning.
    Datasets are expected to have the following columns: { `input`, `output` }

    Available datasets to be selected with `dataset` argument:
        - alpaca, 52002 examples
        - alpaca cleaned, 51942 examples
        - chip2 (OIG), 210289 examples
        - self-instruct, 82612 examples
        - hh-rlhf (Anthropic), 160800 examples
        - longform, 23.7k examples
        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples
        - chinese-vicuna (Chinese-Vicuna/guanaco_belle_merge_v1.0)
        - any existing local path (.json/.jsonl/.csv/.tsv)

    Coming soon:
        - unnatural instructions core, 66010 examples
        - unnatural instructions full, 240670 examples
        - alpaca-gpt4, 52002 examples
        - unnatural-instructions-gpt4, 9000 examples
        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
        - flan (FLAN v2), up to 20M examples available
        - vicuna

    Args:
        tokenizer: tokenizer handed to the data collator.
        args: merged argument namespace; `args.dataset_format` is set to
            "alpaca" when a local file is loaded without an explicit format.

    Returns:
        Dict with `train_dataset`, `eval_dataset`, `predict_dataset` (each
        None when the matching `do_*` flag is off) and `data_collator`.

    Raises:
        NotImplementedError: unknown dataset name, or the unreleased 'vicuna'.
        ValueError: a local dataset path exists but could not be loaded.
    """
    def load_data(dataset_name):
        """Load a dataset by registered name, or from a local path."""
        if dataset_name == 'alpaca':
            return load_dataset("tatsu-lab/alpaca")
        elif dataset_name == 'alpaca-clean':
            return load_dataset("yahma/alpaca-cleaned")
        elif dataset_name == 'chip2':
            return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
        elif dataset_name == 'self-instruct':
            return load_dataset("yizhongw/self_instruct", name='self_instruct')
        elif dataset_name == 'hh-rlhf':
            return load_dataset("Anthropic/hh-rlhf")
        elif dataset_name == 'longform':
            return load_dataset("akoksal/LongForm")
        elif dataset_name == 'oasst1':
            return load_dataset("timdettmers/openassistant-guanaco")
        elif dataset_name == 'vicuna':
            raise NotImplementedError("Vicuna data was not released.")
        elif dataset_name == 'chinese-vicuna':
            return load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
        else:
            if os.path.exists(dataset_name):
                try:
                    # Local files default to the alpaca record layout.
                    args.dataset_format = args.dataset_format if args.dataset_format else "alpaca"
                    full_dataset = local_dataset(dataset_name)
                    return full_dataset
                except Exception as err:
                    # Keep the original failure as the cause, and do not
                    # intercept KeyboardInterrupt/SystemExit.
                    raise ValueError(f"Error loading dataset from {dataset_name}") from err
            else:
                raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")

    def format_dataset(dataset, dataset_format):
        """Convert a dataset to `input`/`output` columns and drop all others."""
        if (
            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
        ):
            dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
            dataset = dataset.map(lambda x: {
                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
                'output': x['text'].split('\n<bot>: ')[1],
            })
        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
            for old, new in [["prompt", "input"], ["completion", "output"]]:
                dataset = dataset.rename_column(old, new)
        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['chosen']
            })
        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['text'],
            })
        # Remove unused columns.
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
        )
        return dataset

    # Load dataset.
    dataset = load_data(args.dataset)
    if args.debug_mode:
        # Debug runs keep only the first 200 training examples.
        dataset['train'] = dataset['train'].filter(lambda x,i: i < 200, with_indices=True)
        #dataset['eval'] = dataset['eval'].filter(lambda x,i: i < 200, with_indices=True)
    # `args.dataset_format` is read after load_data, which may have set it.
    dataset = format_dataset(dataset, args.dataset_format)

    # Split train/eval, reduce size
    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            logger.info('Splitting train dataset in train and validation according to `eval_dataset_size`')
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            # Character count of prompt + response, stored as `length`.
            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator
    )
def get_last_checkpoint(checkpoint_dir):
    """Find the newest `checkpoint-<step>` directory inside `checkpoint_dir`.

    Args:
        checkpoint_dir: output directory of a (possibly earlier) training run.

    Returns:
        `(path, completed)`: `path` is the checkpoint directory with the
        highest step, or None when there is nothing to resume from;
        `completed` is True when a `completed` marker exists in
        `checkpoint_dir`.
    """
    if not isdir(checkpoint_dir):
        return None, False  # first training

    is_completed = exists(join(checkpoint_dir, 'completed'))
    if is_completed:
        return None, True  # already finished

    prefix = 'checkpoint-'
    max_step = 0
    latest_name = None
    for filename in os.listdir(checkpoint_dir):
        if not filename.startswith(prefix):
            continue
        step_text = filename[len(prefix):]
        # Skip directories without a numeric step suffix (e.g. `checkpoint-tmp`)
        # instead of letting int() raise ValueError.
        if not step_text.isdecimal() or not isdir(join(checkpoint_dir, filename)):
            continue
        step = int(step_text)
        if step > max_step:
            max_step, latest_name = step, filename

    if max_step == 0:
        return None, is_completed  # training started, but no checkpoint

    checkpoint_dir = join(checkpoint_dir, latest_name)
    logger.info(f"Found a previous checkpoint at: {checkpoint_dir}")
    return checkpoint_dir, is_completed  # checkpoint found!
def train():
    """Run the fine-tuning pipeline end to end.

    Parses the command line into the four argument dataclasses, loads the model
    (resuming adapters from the latest checkpoint in `output_dir` when one
    exists), prepares tokenizer and data, registers callbacks, then runs
    training / evaluation / prediction according to the `do_*` flags and writes
    the collected metrics to `<output_dir>/metrics.json`.
    """
    hfparser = transformers.HfArgumentParser((
        ModelArguments, DataArguments, TrainingArguments, GenerationArguments
    ))
    model_args, data_args, training_args, generation_args, extra_args = \
        hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
    # Merged view of all arguments used throughout this file.
    args = argparse.Namespace(
        **vars(model_args), **vars(data_args), **vars(training_args)
    )
    logger.info(f"args: {args}")

    checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
    if completed_training:
        # Only reported; the run continues regardless.
        logger.info('Detected that training was already completed!')

    model = get_accelerate_model(args, checkpoint_dir)

    model.config.use_cache = False
    print_trainable_parameters(args, model)
    logger.info('loaded model')
    set_seed(args.seed)

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        padding_side="right",
        use_fast=False, # Fast tokenizer giving issues.
        tokenizer_type='llama' if 'llama' in args.model_name_or_path else None, # Needed for HF name change
    )
    if tokenizer._pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    if 'llama' in args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer):
        # LLaMA tokenizer may not have correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these are present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
        logger.info('Adding special tokens.')
        tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
        })

    data_module = make_data_module(tokenizer=tokenizer, args=args)
    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        **{k:v for k,v in data_module.items() if k != 'predict_dataset'},
    )

    # Callbacks
    if not args.full_finetune:
        trainer.add_callback(SavePeftModelCallback)
    if args.sample_generate:
        trainer.add_callback(SampleGenerateCallback)
    if args.do_mmlu_eval:
        if args.mmlu_dataset == 'mmlu-zs':
            mmlu_dataset = load_dataset("json", data_files={
                'eval': 'data/mmlu/zero_shot_mmlu_val.json',
                'test': 'data/mmlu/zero_shot_mmlu_test.json',
            })
            mmlu_dataset = mmlu_dataset.remove_columns('subject')
        # MMLU Five-shot (Eval/Test only)
        elif args.mmlu_dataset == 'mmlu' or args.mmlu_dataset == 'mmlu-fs':
            mmlu_dataset = load_dataset("json", data_files={
                'eval': 'data/mmlu/five_shot_mmlu_val.json',
                'test': 'data/mmlu/five_shot_mmlu_test.json',
            })
            # mmlu_dataset = mmlu_dataset.remove_columns('subject')
        mmlu_dataset = mmlu_dataset[args.mmlu_split]
        if args.max_mmlu_samples is not None:
            mmlu_dataset = mmlu_dataset.select(range(args.max_mmlu_samples))
        # First token id of each answer letter.
        abcd_idx = [
            tokenizer("A", add_special_tokens=False).input_ids[0],
            tokenizer("B", add_special_tokens=False).input_ids[0],
            tokenizer("C", add_special_tokens=False).input_ids[0],
            tokenizer("D", add_special_tokens=False).input_ids[0],
        ]
        accuracy = evaluate.load("accuracy")

        class MMLUEvalCallback(transformers.TrainerCallback):
            """Compute MMLU loss and per-subject accuracy on every evaluation."""

            def on_evaluate(self, args, state, control, model, **kwargs):
                data_loader = trainer.get_eval_dataloader(mmlu_dataset)
                # Temporarily raise the collator's source length for MMLU prompts.
                source_max_len = trainer.data_collator.source_max_len
                trainer.data_collator.source_max_len = args.mmlu_source_max_len
                trainer.model.eval()
                preds, refs = [], []
                loss_mmlu = 0
                for batch in tqdm(data_loader, total=len(data_loader)):
                    (loss, logits, labels) = trainer.prediction_step(trainer.model,batch,prediction_loss_only=False,)
                    # There are two tokens, the output, and eos token.
                    for i, logit in enumerate(logits):
                        label_non_zero_id = (batch['labels'][i] != -100).nonzero()[0][0]
                        # The logits one position earlier predict the answer token.
                        logit_abcd = logit[label_non_zero_id-1][abcd_idx]
                        preds.append(torch.argmax(logit_abcd).item())
                    labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:,0]
                    refs += [abcd_idx.index(label) for label in labels.tolist()]
                    loss_mmlu += loss.item()
                # Extract results by subject.
                results = {'mmlu_loss':loss_mmlu/len(data_loader)}
                subject = mmlu_dataset['subject']
                subjects = {s:{'refs':[], 'preds':[]} for s in set(subject)}
                for s,p,r in zip(subject, preds, refs):
                    subjects[s]['preds'].append(p)
                    subjects[s]['refs'].append(r)
                subject_scores = []
                for subject in subjects:
                    subject_score = accuracy.compute(
                        references=subjects[subject]['refs'],
                        predictions=subjects[subject]['preds']
                    )['accuracy']
                    results[f'mmlu_{args.mmlu_split}_accuracy_{subject}'] = subject_score
                    subject_scores.append(subject_score)
                results[f'mmlu_{args.mmlu_split}_accuracy'] = np.mean(subject_scores)
                trainer.log(results)
                trainer.data_collator.source_max_len = source_max_len

        trainer.add_callback(MMLUEvalCallback)

    # Verifying the datatypes.
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes: dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = sum(dtypes.values())
    for k, v in dtypes.items():
        # Logger methods take a format string first; passing the dtype as the
        # message with extra args made formatting fail, so nothing was logged.
        logger.info("%s %s %s", k, v, v/total)

    all_metrics = {"run_name": args.run_name}
    # Training
    if args.do_train:
        logger.info("*** Train ***")
        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        all_metrics.update(metrics)
    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval")
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        all_metrics.update(metrics)
    # Prediction
    if args.do_predict:
        logger.info("*** Predict ***")
        prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'],metric_key_prefix="predict")
        prediction_metrics = prediction_output.metrics
        predictions = prediction_output.predictions
        # Replace ignored positions with the pad id so decoding works.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        predictions = tokenizer.batch_decode(
            predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
            for i, example in enumerate(data_module['predict_dataset']):
                example['prediction_with_input'] = predictions[i].strip()
                example['prediction'] = predictions[i].replace(example['input'], '').strip()
                fout.write(json.dumps(example) + '\n')
        logger.info(prediction_metrics)
        trainer.log_metrics("predict", prediction_metrics)
        trainer.save_metrics("predict", prediction_metrics)
        all_metrics.update(prediction_metrics)

    if (args.do_train or args.do_eval or args.do_predict):
        with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
            fout.write(json.dumps(all_metrics))
# Script entry point: run the training pipeline when this file is executed directly.
if __name__ == "__main__":
    train()

50
training/run_Amina_training.sh Executable file
View File

@@ -0,0 +1,50 @@
#!/bin/bash
# Launches QLoRA fine-tuning of guanaco-33b-merged on the Chinese-Vicuna dataset.
# Each run writes to its own output directory named after the start timestamp.
set -x -e

run_id=$(date +%s)
# Fixed: this line previously echoed the undefined variable $run_ts.
echo "RUN ID: $run_id"

echo "START TIME: $(date)"


ROOT_DIR_BASE=/home/ubuntu/cloudfs/saved_models/qlora_cn
OUTPUT_PATH=$ROOT_DIR_BASE/output_$run_id

mkdir -p "$OUTPUT_PATH"


# based on test in ./test_cn_dataset_lenghts.py :

#source len @qt0.8: 188.0
#target len @qt0.8: 222.0
#source len @qt0.85: 228.0
#target len @qt0.85: 267.0
#source len @qt0.9: 297.0
#target len @qt0.9: 342.0
#source len @qt0.95: 396.0
#target len @qt0.95: 491.0
#source len @qt0.98: 515.0
#target len @qt0.98: 670.2800000000279

# Inline comments use empty backtick command substitutions (`# ...`) so they
# can sit between the continued lines of a single command.
python qlora.py --dataset="chinese-vicuna" \
    --dataset_format="alpaca-clean" `#alpaca-clean has similar format to chinese training dataset` \
    --learning_rate 0.0001 `# QLoRA paper appendix B Table 9 `\
    --per_device_train_batch_size 1 `# fix for fitting mem `\
    --gradient_accumulation_steps 16 `# QLoRA paper appendix B Table 9  `\
    --max_steps 10000 `# QLoRA paper appendix B Table 9, follow paper setting even though cn data is 690k much bigger than OASST1 9k, batch size considering accum`\
    --model_name_or_path "timdettmers/guanaco-33b-merged" \
    --source_max_len 512  `# default setting in code, cn model 2048 too long  `\
    --target_max_len 512 `# follow QLoRA paper appendix B Table 9 `\
    --eval_dataset_size 1 `# mainly for testing, no need to be big` \
    --do_eval \
    --evaluation_strategy "steps" \
    --eval_steps 200 `# 10 for debug mode only, 200 for training`  \
    --output_dir "$OUTPUT_PATH" \
    --report_to 'wandb' \
    --sample_generate `# test sample generation every once a while`  \
    --save_steps 200 `# 20 for debug mode only, 200 for training`

#    --debug_mode `# only set when it's debug mode` \