From b3ca4eb56c208aaf181116d6f16d4d6137381029 Mon Sep 17 00:00:00 2001
From: Yu Li
Date: Mon, 12 Jun 2023 15:07:29 -0500
Subject: [PATCH] open source the code, update README

---
 README.md                          |  58 ++
 requirements.txt                   |   9 +
 scripts/test_cn_dataset_lenghts.py |  26 +
 training/qlora.py                  | 847 +++++++++++++++++++++++++++++
 training/run_Amina_training.sh     |  50 ++
 5 files changed, 990 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scripts/test_cn_dataset_lenghts.py
 create mode 100644 training/qlora.py
 create mode 100755 training/run_Amina_training.sh

diff --git a/README.md b/README.md
index 9bef48c..739bb02 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,18 @@
 * LoRA r=64, alpha=16 ([QLoRA](https://arxiv.org/abs/2305.14314) Appendix B.2)
 * source_max_len=512, target_max_len=512: most training samples need to fit without truncation so that their full content is trained into the model. Based on the estimate from the script (scripts/test_cn_dataset_lenghts.py), 512 covers the length of most samples.
 
+#### How to train
+
+The Anima 33B model can be reproduced with the following steps:
+
+    # 1. install dependencies
+    pip install -r requirements.txt
+    # 2. run the training script
+    cd training
+    ./run_Amina_training.sh
+
+
 ## Evaluation
 
 #### Elo rating tournament results
@@ -53,6 +65,52 @@
 * **Evaluation method**: To balance cost, we mainly use GPT-4 for evaluation. As [QLoRA](https://arxiv.org/abs/2305.14314) argues, comparing models by raw GPT-4 scores alone has high random variance, which matches our observations. We therefore adopt the Elo rating tournament method recommended by [QLoRA](https://arxiv.org/abs/2305.14314) and now in common use.
 * **Hyperparameter choices**: For cost reasons we run 300 rounds of randomized evaluation, randomizing the order in which the two models in each match are presented to cancel out ordering effects, with random seed 42. The Elo rating implementation and the other hyperparameters follow [Vicuna's Elo code](https://raw.githubusercontent.com/lm-sys/FastChat/833d65032a715240a3978f4a8f08e7a496c83cb1/fastchat/serve/monitor/elo_analysis.py): K=32, init rating=1000.
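+
+For reference, here is a minimal sketch of the Elo update used in such a tournament (the standard Elo formulas with the hyperparameters above; `expected_score` and `update_ratings` are illustrative names, not the Vicuna implementation):
+
+    # minimal Elo update sketch, assuming the standard Elo formulas
+    K = 32
+    INIT_RATING = 1000
+
+    def expected_score(r_a, r_b):
+        # probability that A beats B under the Elo model
+        return 1 / (1 + 10 ** ((r_b - r_a) / 400))
+
+    def update_ratings(r_a, r_b, score_a):
+        # score_a: 1 if A wins, 0.5 for a tie, 0 if A loses
+        e_a = expected_score(r_a, r_b)
+        r_a_new = r_a + K * (score_a - e_a)
+        r_b_new = r_b + K * ((1 - score_a) - (1 - e_a))
+        return r_a_new, r_b_new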
+
+## How to run inference
+
+First make sure all the dependencies are installed:
+
+    pip install -r https://github.com/lyogavin/Anima/blob/main/requirements.txt?raw=true
+
+You can refer to [inferrence.ipynb].
+
+Or use the following code:
+
+    # imports
+    from peft import PeftModel
+    from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+    import torch
+
+    # base model
+    base_model = "timdettmers/guanaco-33b-merged"
+
+    # create tokenizer (base_model must be defined before it is used here)
+    tokenizer = LlamaTokenizer.from_pretrained(base_model)
+
+    model = LlamaForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.float16,
+        device_map="auto",
+    )
+
+    # LORA PEFT adapters
+    adapter_model = "/home/ubuntu/cloudfs/saved_models/qlora_cn/output_1686031465/checkpoint-10000/adapter_model"
+
+    model = PeftModel.from_pretrained(
+        model,
+        adapter_model,
+        #torch_dtype=torch.float16,
+    )
+    model.eval()
+
+    # prompt
+    prompt = "中国的首都是哪里？"
+    inputs = tokenizer(prompt, return_tensors="pt")
+
+    # Generate
+    generate_ids = model.generate(**inputs, max_new_tokens=30)
+    print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+
+    # output: '中国的首都是哪里？\n中国的首都是北京。\n北京位于中国北部，是中国历史悠'
+
 ## Who We Are?
 
 This work comes from [艾写科技](https://aicompose.cn/about). Our team came from Silicon Valley, with many years of hands-on AI experience at major Chinese and US tech companies.

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d822ab7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,9 @@
+bitsandbytes==0.39.0
+transformers @ git+https://github.com/huggingface/transformers.git
+peft @ git+https://github.com/huggingface/peft.git
+accelerate @ git+https://github.com/huggingface/accelerate.git
+einops==0.6.1
+evaluate==0.4.0
+scikit-learn==1.2.2
+sentencepiece==0.1.99
+wandb==0.15.3
diff --git a/scripts/test_cn_dataset_lenghts.py b/scripts/test_cn_dataset_lenghts.py
new file mode 100644
index 0000000..b66af41
--- /dev/null
+++ b/scripts/test_cn_dataset_lenghts.py
@@ -0,0 +1,26 @@
+from transformers import AutoTokenizer
+
+from datasets import load_dataset, Dataset
+
+
+model_id = "timdettmers/guanaco-33b-merged"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ds = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+
+
+source_template = "Below is an instruction that describes a task, paired with an input that provides further context. " \
+                  "Write a response that appropriately completes the request.\n\n" \
+                  "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
+
+ds = ds.map(lambda x: {'source_length': len(tokenizer.encode(source_template.format(**x))),
+                       'target_length': len(tokenizer.encode(x['output']))})
+
+
+df = ds["train"].to_pandas()
+
+
+for qt in [0.8, 0.85, 0.9, 0.95, 0.98]:
+
+    print(f"source len @qt{qt}: {df['source_length'].quantile(qt)}")
+    print(f"target len @qt{qt}: {df['target_length'].quantile(qt)}")
\ No newline at end of file
diff --git a/training/qlora.py b/training/qlora.py
new file mode 100644
index 0000000..04ebb34
--- /dev/null
+++ b/training/qlora.py
@@ -0,0 +1,847 @@
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
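+
+# Overview: this script fine-tunes a causal LM with QLoRA. It loads the base
+# model quantized to 4-bit NF4, attaches LoRA adapters to all of its linear
+# layers (see find_all_linear_names/get_accelerate_model below), and trains
+# with a Seq2SeqTrainer; it is based on the QLoRA reference implementation
+# (https://github.com/artidoro/qlora).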
+
+from collections import defaultdict
+import copy
+import json
+import os
+from os.path import exists, join, isdir
+from dataclasses import dataclass, field
+import sys
+from typing import Optional, Dict, Sequence
+import numpy as np
+from tqdm import tqdm
+import logging
+import bitsandbytes as bnb
+import pandas as pd
+
+import torch
+import transformers
+from torch.nn.utils.rnn import pad_sequence
+import argparse
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    set_seed,
+    Seq2SeqTrainer,
+    BitsAndBytesConfig,
+    LlamaTokenizer
+)
+from datasets import load_dataset, Dataset
+import evaluate
+
+from peft import (
+    prepare_model_for_kbit_training,
+    LoraConfig,
+    get_peft_model,
+    PeftModel
+)
+from peft.tuners.lora import LoraLayer
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+
+
+torch.backends.cuda.matmul.allow_tf32 = True
+
+
+logging_file_path = "./qlora_logs.log"
+
+handlers = [
+    logging.FileHandler(logging_file_path),
+    logging.StreamHandler(sys.stdout)
+]
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=handlers
+)
+
+logger = logging.getLogger(__name__)
+
+IGNORE_INDEX = -100
+DEFAULT_PAD_TOKEN = "[PAD]"
+
+@dataclass
+class ModelArguments:
+    model_name_or_path: Optional[str] = field(
+        default="EleutherAI/pythia-12b"
+    )
+    trust_remote_code: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."}
+    )
+
+@dataclass
+class DataArguments:
+    eval_dataset_size: int = field(
+        default=1024, metadata={"help": "Size of validation dataset."}
+    )
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+                    "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+                    "value if set."
+        },
+    )
+    source_max_len: int = field(
+        default=1024,
+        metadata={"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
+    )
+    target_max_len: int = field(
+        default=256,
+        metadata={"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
+    )
+    dataset: str = field(
+        default='alpaca',
+        metadata={"help": "Which dataset to finetune on. See datamodule for options."}
+    )
+    dataset_format: Optional[str] = field(
+        default=None,
+        metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"}
+    )
+
+@dataclass
+class TrainingArguments(transformers.Seq2SeqTrainingArguments):
+    cache_dir: Optional[str] = field(
+        default=None
+    )
+    train_on_source: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to train on the input in addition to the target text."}
+    )
+    mmlu_split: Optional[str] = field(
+        default='eval',
+        metadata={"help": "The MMLU split to run on"}
+    )
+    mmlu_dataset: Optional[str] = field(
+        default='mmlu-fs',
+        metadata={"help": "MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot."}
+    )
+    do_mmlu_eval: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Whether to run the MMLU evaluation."}
+    )
+    max_mmlu_samples: Optional[int] = field(
+        default=None,
+        metadata={"help": "If set, only evaluates on `max_mmlu_samples` of the MMLU dataset."}
+    )
+    mmlu_source_max_len: int = field(
+        default=2048,
+        metadata={"help": "Maximum source sequence length for mmlu."}
+    )
+    full_finetune: bool = field(
+        default=False,
+        metadata={"help": "Finetune the entire model without adapters."}
+    )
+    adam8bit: bool = field(
+        default=False,
+        metadata={"help": "Use 8-bit adam."}
+    )
+    double_quant: bool = field(
+        default=True,
+        metadata={"help": "Compress the quantization statistics through double quantization."}
+    )
+    quant_type: str = field(
+        default="nf4",
+        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
+    )
+    bits: int = field(
+        default=4,
+        metadata={"help": "How many bits to use."}
+    )
+    lora_r: int = field(
+        default=64,
+        metadata={"help": "Lora R dimension."}
+    )
+    lora_alpha: float = field(
+        default=16,
+        metadata={"help": "Lora alpha."}
+    )
+    lora_dropout: float = field(
+        default=0.0,
+        metadata={"help": "Lora dropout."}
+    )
+    max_memory_MB: int = field(
+        default=80000,
+        metadata={"help": "Free memory per gpu."}
+    )
+    report_to: str = field(
+        default='none',
+        metadata={"help": "To use wandb or something else for reporting."}
+    )
+    output_dir: str = field(default='./output', metadata={"help": 'The output dir for logs and checkpoints'})
+    optim: str = field(default='paged_adamw_32bit', metadata={"help": 'The optimizer to be used'})
+    per_device_train_batch_size: int = field(default=1, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
+    gradient_accumulation_steps: int = field(default=16, metadata={"help": 'How many gradients to accumulate before performing an optimizer step'})
+    max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
+    weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'})  # use lora dropout instead for regularization if needed
+    learning_rate: float = field(default=0.0002, metadata={"help": 'The learning rate'})
+    remove_unused_columns: bool = field(default=False, metadata={"help": 'Remove unused columns. Needed to make this codebase work.'})
+    max_grad_norm: float = field(default=0.3, metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
+    gradient_checkpointing: bool = field(default=True, metadata={"help": 'Use gradient checkpointing. You want to use this.'})
+    do_train: bool = field(default=True, metadata={"help": 'To train or not to train, that is the question?'})
+    lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
+    warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
+    logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
+    group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
+    save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
+    save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
+    save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
+    sample_generate: bool = field(default=False, metadata={"help": 'Whether to do sample generation on evaluation.'})
+    debug_mode: bool = field(default=False, metadata={"help": 'Debug mode: sample 200 train/eval examples for validation'})
+
+@dataclass
+class GenerationArguments:
+    # For more hyperparameters check:
+    # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+    # Length arguments
+    max_new_tokens: Optional[int] = field(
+        default=256,
+        metadata={"help": "Maximum number of new tokens to be generated in evaluation or prediction loops "
+                          "if predict_with_generate is set."}
+    )
+    min_new_tokens: Optional[int] = field(
+        default=None,
+        metadata={"help": "Minimum number of new tokens to generate."}
+    )
+
+    # Generation strategy
+    do_sample: Optional[bool] = field(default=False)
+    num_beams: Optional[int] = field(default=1)
+    num_beam_groups: Optional[int] = field(default=1)
+    penalty_alpha: Optional[float] = field(default=None)
+    use_cache: Optional[bool] = field(default=True)
+
+    # Hyperparameters for logit manipulation
+    temperature: Optional[float] = field(default=1.0)
+    top_k: Optional[int] = field(default=50)
+    top_p: Optional[float] = field(default=1.0)
+    typical_p: Optional[float] = field(default=1.0)
+    diversity_penalty: Optional[float] = field(default=0.0)
+    repetition_penalty: Optional[float] = field(default=1.0)
+    length_penalty: Optional[float] = field(default=1.0)
+    no_repeat_ngram_size: Optional[int] = field(default=0)
+
+def find_all_linear_names(args, model):
+    cls = bnb.nn.Linear4bit if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
+    lora_module_names = set()
+    for name, module in model.named_modules():
+        if isinstance(module, cls):
+            names = name.split('.')
+            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+    if 'lm_head' in lora_module_names: # needed for 16-bit
+        lora_module_names.remove('lm_head')
+    return list(lora_module_names)
+
+
+class SampleGenerateCallback(transformers.TrainerCallback):
+    "A callback that prints sample generations of the model during training"
+
+    def on_evaluate(self, args, state, control, **kwargs):
+        logger.info("on_evaluate in SampleGenerateCallback...")
+        sample_inputs = [
+            '用一句话描述地球为什么是独一无二的。',
+            '中国是否应该推出刺激政策救楼市？',
+            '如何更好地融入新工作圈子'
+        ]
+        if "model" in kwargs:
+            for sample_input in sample_inputs:
+                tokenizer = kwargs['tokenizer']
+                inputs = "Below is an instruction that describes a task. " \
+                         "Write a response that appropriately completes the request.\n\n" \
+                         "### Instruction:\n{sample_input}\n\n### Response: ".format(sample_input=sample_input)
+                logger.info(f"sample input: {inputs}")
+                model = kwargs['model']
+                input_ids = tokenizer(inputs, return_tensors="pt")['input_ids']
+                input_ids = input_ids.to('cuda')
+                generation_output = model.generate(
+                    input_ids=input_ids,
+                    max_new_tokens=70,
+                )
+                #print(generation_output)
+                logger.info(f"sample output: {tokenizer.decode(generation_output[0])}")
+
+        else:
+            logger.info("model not found in kwargs, skipping")
+
+
+
+class SavePeftModelCallback(transformers.TrainerCallback):
+    def save_model(self, args, state, kwargs):
+        logger.info('Saving PEFT checkpoint...')
+        if state.best_model_checkpoint is not None:
+            checkpoint_folder = os.path.join(state.best_model_checkpoint, "adapter_model")
+        else:
+            checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")
+
+        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
+        kwargs["model"].save_pretrained(peft_model_path)
+
+        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
+        if os.path.exists(pytorch_model_path):
+            os.remove(pytorch_model_path)
+
+    def on_save(self, args, state, control, **kwargs):
+        self.save_model(args, state, kwargs)
+        return control
+
+    def on_train_end(self, args, state, control, **kwargs):
+        def touch(fname, times=None):
+            with open(fname, 'a'):
+                os.utime(fname, times)
+
+        touch(join(args.output_dir, 'completed'))
+        self.save_model(args, state, kwargs)
+
+def get_accelerate_model(args, checkpoint_dir):
+
+    n_gpus = torch.cuda.device_count()
+    max_memory = f'{args.max_memory_MB}MB'
+    max_memory = {i: max_memory for i in range(n_gpus)}
+
+    if args.full_finetune: assert args.bits in [16, 32]
+
+    logger.info(f'loading base model {args.model_name_or_path}...')
+    compute_dtype = (torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model_name_or_path,
+        cache_dir=args.cache_dir,
+        load_in_4bit=args.bits == 4,
+        load_in_8bit=args.bits == 8,
+        device_map='auto',
+        max_memory=max_memory,
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=args.bits == 4,
+            load_in_8bit=args.bits == 8,
+            llm_int8_threshold=6.0,
+            llm_int8_has_fp16_weight=False,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=args.double_quant,
+            bnb_4bit_quant_type=args.quant_type
+        ),
+        torch_dtype=(torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32)),
+        trust_remote_code=args.trust_remote_code,
+    )
+    if compute_dtype == torch.float16 and args.bits == 4:
+        major, minor = torch.cuda.get_device_capability()
+        if major >= 8:
+            logger.info('='*80)
+            logger.info('Your GPU supports bfloat16, you can accelerate training with the argument --bf16')
+            logger.info('='*80)
+
+    setattr(model, 'model_parallel', True)
+    setattr(model, 'is_parallelizable', True)
+
+    model.config.torch_dtype = (torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
+
+    if not args.full_finetune:
+        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)
+    if args.gradient_checkpointing:
+        model.gradient_checkpointing_enable()
+
+    if not args.full_finetune:
+        if checkpoint_dir is not None:
+            logger.info("Loading adapters from checkpoint.")
+            model = PeftModel.from_pretrained(model, join(checkpoint_dir, 'adapter_model'), is_trainable=True)
+        else:
+            logger.info('adding LoRA modules...')
+            modules = find_all_linear_names(args, model)
+            config = LoraConfig(
+                r=args.lora_r,
+                lora_alpha=args.lora_alpha,
+                target_modules=modules,
+                lora_dropout=args.lora_dropout,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+            model = get_peft_model(model, config)
+
+    for name, module in model.named_modules():
+        if isinstance(module, LoraLayer):
+            if args.bf16:
+                module = module.to(torch.bfloat16)
+        if 'norm' in name:
+            module = module.to(torch.float32)
+        if 'lm_head' in name or 'embed_tokens' in name:
+            if hasattr(module, 'weight'):
+                if args.bf16 and module.weight.dtype == torch.float32:
+                    module = module.to(torch.bfloat16)
+    return model
+
+def print_trainable_parameters(args, model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    if args.bits == 4: trainable_params /= 2
+    logger.info(
+        f"trainable params: {trainable_params} || "
+        f"all params: {all_param} || "
+        f"trainable: {100 * trainable_params / all_param}"
+    )
+
+def smart_tokenizer_and_embedding_resize(
+    special_tokens_dict: Dict,
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+):
+    """Resize tokenizer and embedding.
+
+    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    """
+    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    model.resize_token_embeddings(len(tokenizer))
+
+    if num_new_tokens > 0:
+        input_embeddings = model.get_input_embeddings().weight.data
+        output_embeddings = model.get_output_embeddings().weight.data
+
+        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+        input_embeddings[-num_new_tokens:] = input_embeddings_avg
+        output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
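+# Note on the collator below: for each example it concatenates the tokenized
+# source and target into a single input_ids sequence and, unless
+# --train_on_source is set, masks the source positions in the labels with
+# IGNORE_INDEX (-100), so the loss is computed only on the target tokens.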
+@dataclass
+class DataCollatorForCausalLM(object):
+    tokenizer: transformers.PreTrainedTokenizer
+    source_max_len: int
+    target_max_len: int
+    train_on_source: bool
+    predict_with_generate: bool
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        # Extract elements
+        sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
+        targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
+        # Tokenize
+        tokenized_sources_with_prompt = self.tokenizer(
+            sources,
+            max_length=self.source_max_len,
+            truncation=True,
+            add_special_tokens=False,
+        )
+        tokenized_targets = self.tokenizer(
+            targets,
+            max_length=self.target_max_len,
+            truncation=True,
+            add_special_tokens=False,
+        )
+        # Build the input and labels for causal LM
+        input_ids = []
+        labels = []
+        for tokenized_source, tokenized_target in zip(
+            tokenized_sources_with_prompt['input_ids'],
+            tokenized_targets['input_ids']
+        ):
+            if not self.predict_with_generate:
+                input_ids.append(torch.tensor(tokenized_source + tokenized_target))
+                if not self.train_on_source:
+                    labels.append(
+                        torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
+                    )
+                else:
+                    labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
+            else:
+                input_ids.append(torch.tensor(tokenized_source))
+        # Apply padding
+        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)
+        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
+        data_dict = {
+            'input_ids': input_ids,
+            'attention_mask': input_ids.ne(self.tokenizer.pad_token_id),
+        }
+        if labels is not None:
+            data_dict['labels'] = labels
+        return data_dict
+
+def extract_unnatural_instructions_data(examples, extract_reformulations=False):
+    out = {
+        'input': [],
+        'output': [],
+    }
+    for example_instances in examples['instances']:
+        for instance in example_instances:
+            out['input'].append(instance['instruction_with_input'])
+            out['output'].append(instance['output'])
+    if extract_reformulations:
+        for example_reformulations in examples['reformulations']:
+            if example_reformulations is not None:
+                for instance in example_reformulations:
+                    out['input'].append(instance['instruction_with_input'])
+                    out['output'].append(instance['output'])
+    return out
+
+ALPACA_PROMPT_DICT = {
+    "prompt_input": (
+        "Below is an instruction that describes a task, paired with an input that provides further context. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
+    ),
+    "prompt_no_input": (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        "### Instruction:\n{instruction}\n\n### Response: "
+    ),
+}
+
+def extract_alpaca_dataset(example):
+    if example.get("input", "") != "":
+        prompt_format = ALPACA_PROMPT_DICT["prompt_input"]
+    else:
+        prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"]
+    return {'input': prompt_format.format(**example)}
+
+def local_dataset(dataset_name):
+    if dataset_name.endswith('.json'):
+        full_dataset = Dataset.from_json(path_or_paths=dataset_name)
+    elif dataset_name.endswith('.jsonl'):
+        full_dataset = Dataset.from_json(filename=dataset_name, format='jsonlines')
+    elif dataset_name.endswith('.csv'):
+        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
+    elif dataset_name.endswith('.tsv'):
+        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
+    else:
+        raise ValueError(f"Unsupported dataset format: {dataset_name}")
+
+    split_dataset = full_dataset.train_test_split(test_size=0.1)
+    return split_dataset
+
+def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
+    """
+    Make dataset and collator for supervised fine-tuning.
+    Datasets are expected to have the following columns: { `input`, `output` }
+
+    Available datasets to be selected with `dataset` argument:
+        - alpaca, 52002 examples
+        - alpaca cleaned, 51942 examples
+        - chip2 (OIG), 210289 examples
+        - self-instruct, 82612 examples
+        - hh-rlhf (Anthropic), 160800 examples
+        - longform, 23.7k examples
+        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples
+
+    Coming soon:
+        - unnatural instructions core, 66010 examples
+        - unnatural instructions full, 240670 examples
+        - alpaca-gpt4, 52002 examples
+        - unnatural-instructions-gpt4, 9000 examples
+        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
+        - flan (FLAN v2), up to 20M examples available
+        - vicuna
+
+    """
+    def load_data(dataset_name):
+        if dataset_name == 'alpaca':
+            return load_dataset("tatsu-lab/alpaca")
+        elif dataset_name == 'alpaca-clean':
+            return load_dataset("yahma/alpaca-cleaned")
+        elif dataset_name == 'chip2':
+            return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
+        elif dataset_name == 'self-instruct':
+            return load_dataset("yizhongw/self_instruct", name='self_instruct')
+        elif dataset_name == 'hh-rlhf':
+            return load_dataset("Anthropic/hh-rlhf")
+        elif dataset_name == 'longform':
+            return load_dataset("akoksal/LongForm")
+        elif dataset_name == 'oasst1':
+            return load_dataset("timdettmers/openassistant-guanaco")
+        elif dataset_name == 'vicuna':
+            raise NotImplementedError("Vicuna data was not released.")
+        elif dataset_name == 'chinese-vicuna':
+            return load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+        else:
+            if os.path.exists(dataset_name):
+                try:
+                    args.dataset_format = args.dataset_format if args.dataset_format else "alpaca"
+                    full_dataset = local_dataset(dataset_name)
+                    return full_dataset
+                except:
+                    raise ValueError(f"Error loading dataset from {dataset_name}")
+            else:
+                raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")
+
+    def format_dataset(dataset, dataset_format):
+        if (
+            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
+            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
+        ):
+            dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
+        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
+            dataset = dataset.map(lambda x: {
+                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
+                'output': x['text'].split('\n<bot>: ')[1],
+            })
+        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
+            for old, new in [["prompt", "input"], ["completion", "output"]]:
+                dataset = dataset.rename_column(old, new)
+        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
+            dataset = dataset.map(lambda x: {
+                'input': '',
+                'output': x['chosen']
+            })
+        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
+            dataset = dataset.map(lambda x: {
+                'input': '',
+                'output': x['text'],
+            })
+        # Remove unused columns.
+        dataset = dataset.remove_columns(
+            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
+        )
+        return dataset
+
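+    # The steps below: load the raw dataset, optionally subsample it in debug
+    # mode, then normalize every record to {'input': ..., 'output': ...} via
+    # format_dataset (e.g. alpaca-style instructions are rendered through
+    # ALPACA_PROMPT_DICT and all other columns are dropped).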
+    # Load dataset.
+    dataset = load_data(args.dataset)
+    if args.debug_mode:
+        dataset['train'] = dataset['train'].filter(lambda x, i: i < 200, with_indices=True)
+        #dataset['eval'] = dataset['eval'].filter(lambda x, i: i < 200, with_indices=True)
+    dataset = format_dataset(dataset, args.dataset_format)
+
+    # Split train/eval, reduce size
+    if args.do_eval or args.do_predict:
+        if 'eval' in dataset:
+            eval_dataset = dataset['eval']
+        else:
+            logger.info('Splitting train dataset in train and validation according to `eval_dataset_size`')
+            dataset = dataset["train"].train_test_split(
+                test_size=args.eval_dataset_size, shuffle=True, seed=42
+            )
+            eval_dataset = dataset['test']
+        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
+            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
+        if args.group_by_length:
+            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
+    if args.do_train:
+        train_dataset = dataset['train']
+        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
+            train_dataset = train_dataset.select(range(args.max_train_samples))
+        if args.group_by_length:
+            train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
+
+    data_collator = DataCollatorForCausalLM(
+        tokenizer=tokenizer,
+        source_max_len=args.source_max_len,
+        target_max_len=args.target_max_len,
+        train_on_source=args.train_on_source,
+        predict_with_generate=args.predict_with_generate,
+    )
+    return dict(
+        train_dataset=train_dataset if args.do_train else None,
+        eval_dataset=eval_dataset if args.do_eval else None,
+        predict_dataset=eval_dataset if args.do_predict else None,
+        data_collator=data_collator
+    )
+
+def get_last_checkpoint(checkpoint_dir):
+    if isdir(checkpoint_dir):
+        is_completed = exists(join(checkpoint_dir, 'completed'))
+        if is_completed: return None, True # already finished
+        max_step = 0
+        for filename in os.listdir(checkpoint_dir):
+            if isdir(join(checkpoint_dir, filename)) and filename.startswith('checkpoint'):
+                max_step = max(max_step, int(filename.replace('checkpoint-', '')))
+        if max_step == 0: return None, is_completed # training started, but no checkpoint
+        checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
+        logger.info(f"Found a previous checkpoint at: {checkpoint_dir}")
+        return checkpoint_dir, is_completed # checkpoint found!
+    return None, False # first training
+
+def train():
+    hfparser = transformers.HfArgumentParser((
+        ModelArguments, DataArguments, TrainingArguments, GenerationArguments
+    ))
+    model_args, data_args, training_args, generation_args, extra_args = \
+        hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
+    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
+    args = argparse.Namespace(
+        **vars(model_args), **vars(data_args), **vars(training_args)
+    )
+
+    logger.info(f"args: {args}")
+
+    checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
+    if completed_training:
+        logger.info('Detected that training was already completed!')
+
+    model = get_accelerate_model(args, checkpoint_dir)
+
+    model.config.use_cache = False
+    print_trainable_parameters(args, model)
+    logger.info('loaded model')
+    set_seed(args.seed)
+
+    # Tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_name_or_path,
+        cache_dir=args.cache_dir,
+        padding_side="right",
+        use_fast=False, # Fast tokenizer giving issues.
+        tokenizer_type='llama' if 'llama' in args.model_name_or_path else None, # Needed for HF name change
+    )
+    if tokenizer._pad_token is None:
+        smart_tokenizer_and_embedding_resize(
+            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
+            tokenizer=tokenizer,
+            model=model,
+        )
+    if 'llama' in args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer):
+        # LLaMA tokenizer may not have correct special tokens set.
+        # Check and add them if missing to prevent them from being parsed into different tokens.
+        # Note that these are present in the vocabulary.
+        # Note also that `model.config.pad_token_id` is 0 which corresponds to the `<unk>` token.
+        logger.info('Adding special tokens.')
+        tokenizer.add_special_tokens({
+            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
+            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
+            "unk_token": tokenizer.convert_ids_to_tokens(
+                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
+            ),
+        })
+    data_module = make_data_module(tokenizer=tokenizer, args=args)
+    trainer = Seq2SeqTrainer(
+        model=model,
+        tokenizer=tokenizer,
+        args=training_args,
+        **{k: v for k, v in data_module.items() if k != 'predict_dataset'},
+    )
+
+    # Callbacks
+    if not args.full_finetune:
+        trainer.add_callback(SavePeftModelCallback)
+    if args.sample_generate:
+        trainer.add_callback(SampleGenerateCallback)
+    if args.do_mmlu_eval:
+        if args.mmlu_dataset == 'mmlu-zs':
+            mmlu_dataset = load_dataset("json", data_files={
+                'eval': 'data/mmlu/zero_shot_mmlu_val.json',
+                'test': 'data/mmlu/zero_shot_mmlu_test.json',
+            })
+            mmlu_dataset = mmlu_dataset.remove_columns('subject')
+        # MMLU Five-shot (Eval/Test only)
+        elif args.mmlu_dataset == 'mmlu' or args.mmlu_dataset == 'mmlu-fs':
+            mmlu_dataset = load_dataset("json", data_files={
+                'eval': 'data/mmlu/five_shot_mmlu_val.json',
+                'test': 'data/mmlu/five_shot_mmlu_test.json',
+            })
+            # mmlu_dataset = mmlu_dataset.remove_columns('subject')
+        mmlu_dataset = mmlu_dataset[args.mmlu_split]
+        if args.max_mmlu_samples is not None:
+            mmlu_dataset = mmlu_dataset.select(range(args.max_mmlu_samples))
+        abcd_idx = [
+            tokenizer("A", add_special_tokens=False).input_ids[0],
+            tokenizer("B", add_special_tokens=False).input_ids[0],
+            tokenizer("C", add_special_tokens=False).input_ids[0],
+            tokenizer("D", add_special_tokens=False).input_ids[0],
+        ]
+        accuracy = evaluate.load("accuracy")
+        class MMLUEvalCallback(transformers.TrainerCallback):
+            def on_evaluate(self, args, state, control, model, **kwargs):
+                data_loader = trainer.get_eval_dataloader(mmlu_dataset)
+                source_max_len = trainer.data_collator.source_max_len
+                trainer.data_collator.source_max_len = args.mmlu_source_max_len
+                trainer.model.eval()
+                preds, refs = [], []
+                loss_mmlu = 0
+                for batch in tqdm(data_loader, total=len(data_loader)):
+                    (loss, logits, labels) = trainer.prediction_step(trainer.model, batch, prediction_loss_only=False)
+                    # There are two tokens, the output, and eos token.
+                    for i, logit in enumerate(logits):
+                        label_non_zero_id = (batch['labels'][i] != -100).nonzero()[0][0]
+                        logit_abcd = logit[label_non_zero_id-1][abcd_idx]
+                        preds.append(torch.argmax(logit_abcd).item())
+                    labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0]
+                    refs += [abcd_idx.index(label) for label in labels.tolist()]
+                    loss_mmlu += loss.item()
+                # Extract results by subject.
+                results = {'mmlu_loss': loss_mmlu/len(data_loader)}
+                subject = mmlu_dataset['subject']
+                subjects = {s: {'refs': [], 'preds': []} for s in set(subject)}
+                for s, p, r in zip(subject, preds, refs):
+                    subjects[s]['preds'].append(p)
+                    subjects[s]['refs'].append(r)
+                subject_scores = []
+                for subject in subjects:
+                    subject_score = accuracy.compute(
+                        references=subjects[subject]['refs'],
+                        predictions=subjects[subject]['preds']
+                    )['accuracy']
+                    results[f'mmlu_{args.mmlu_split}_accuracy_{subject}'] = subject_score
+                    subject_scores.append(subject_score)
+                results[f'mmlu_{args.mmlu_split}_accuracy'] = np.mean(subject_scores)
+                trainer.log(results)
+                trainer.data_collator.source_max_len = source_max_len
+
+        trainer.add_callback(MMLUEvalCallback)
+
+    # Verifying the datatypes.
+    dtypes = {}
+    for _, p in model.named_parameters():
+        dtype = p.dtype
+        if dtype not in dtypes: dtypes[dtype] = 0
+        dtypes[dtype] += p.numel()
+    total = 0
+    for k, v in dtypes.items(): total += v
+    for k, v in dtypes.items():
+        logger.info(f"{k} {v} {v/total}")
+
+    all_metrics = {"run_name": args.run_name}
+    # Training
+    if args.do_train:
+        logger.info("*** Train ***")
+        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
+        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
+        train_result = trainer.train()
+        metrics = train_result.metrics
+        trainer.log_metrics("train", metrics)
+        trainer.save_metrics("train", metrics)
+        trainer.save_state()
+        all_metrics.update(metrics)
+    # Evaluation
+    if args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate(metric_key_prefix="eval")
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+        all_metrics.update(metrics)
+    # Prediction
+    if args.do_predict:
+        logger.info("*** Predict ***")
+        prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'], metric_key_prefix="predict")
+        prediction_metrics = prediction_output.metrics
+        predictions = prediction_output.predictions
+        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
+        predictions = tokenizer.batch_decode(
+            predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+        )
+        with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
+            for i, example in enumerate(data_module['predict_dataset']):
+                example['prediction_with_input'] = predictions[i].strip()
+                example['prediction'] = predictions[i].replace(example['input'], '').strip()
+                fout.write(json.dumps(example) + '\n')
+        logger.info(prediction_metrics)
+        trainer.log_metrics("predict", prediction_metrics)
+        trainer.save_metrics("predict", prediction_metrics)
+        all_metrics.update(prediction_metrics)
+
+    if (args.do_train or args.do_eval or args.do_predict):
+        with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
+            fout.write(json.dumps(all_metrics))
+
+if __name__ == "__main__":
+    train()
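+
+# Example invocation, assuming the Anima setup (see
+# training/run_Amina_training.sh for the full flag set used for the 33B run):
+#
+#   python qlora.py --dataset="chinese-vicuna" \
+#       --model_name_or_path "timdettmers/guanaco-33b-merged" \
+#       --source_max_len 512 --target_max_len 512 \
+#       --learning_rate 0.0001 --max_steps 10000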
diff --git a/training/run_Amina_training.sh b/training/run_Amina_training.sh
new file mode 100755
index 0000000..e876a69
--- /dev/null
+++ b/training/run_Amina_training.sh
@@ -0,0 +1,50 @@
+
+
+set -x -e
+
+run_id=$(date +%s)
+echo "RUN ID: $run_id"
+
+echo "START TIME: $(date)"
+
+
+ROOT_DIR_BASE=/home/ubuntu/cloudfs/saved_models/qlora_cn
+OUTPUT_PATH=$ROOT_DIR_BASE/output_$run_id
+
+mkdir -p $OUTPUT_PATH
+
+
+
+# based on test in ../scripts/test_cn_dataset_lenghts.py :
+
+#source len @qt0.8: 188.0
+#target len @qt0.8: 222.0
+#source len @qt0.85: 228.0
+#target len @qt0.85: 267.0
+#source len @qt0.9: 297.0
+#target len @qt0.9: 342.0
+#source len @qt0.95: 396.0
+#target len @qt0.95: 491.0
+#source len @qt0.98: 515.0
+#target len @qt0.98: 670.2800000000279
+
+
+python qlora.py --dataset="chinese-vicuna" \
+    --dataset_format="alpaca-clean" `#alpaca-clean has a similar format to the chinese training dataset` \
+    --learning_rate 0.0001 `# QLoRA paper appendix B Table 9 `\
+    --per_device_train_batch_size 1 `# fix for fitting mem `\
+    --gradient_accumulation_steps 16 `# QLoRA paper appendix B Table 9 `\
+    --max_steps 10000 `# QLoRA paper appendix B Table 9, follow paper setting even though cn data is 690k much bigger than OASST1 9k, batch size considering accum`\
+    --model_name_or_path "timdettmers/guanaco-33b-merged" \
+    --source_max_len 512 `# default setting in code, cn model 2048 too long `\
+    --target_max_len 512 `# follow QLoRA paper appendix B Table 9 `\
+    --eval_dataset_size 1 `# mainly for testing, no need to be big` \
+    --do_eval \
+    --evaluation_strategy "steps" \
+    --eval_steps 200 `# 10 for debug mode only, 200 for training` \
+    --output_dir $OUTPUT_PATH \
+    --report_to 'wandb' \
+    --sample_generate `# test sample generation every once in a while` \
+    --save_steps 200 `# 20 for debug mode only, 200 for training`
+
+# --debug_mode `# only set when it's debug mode` \
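+
+# note: --report_to 'wandb' assumes you are logged in to Weights & Biases;
+# run `wandb login` once beforehand, or change --report_to to 'none'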