mirror of
https://github.com/0xSojalSec/airllm.git
synced 2026-03-07 22:33:47 +00:00
924 lines
39 KiB
Python
924 lines
39 KiB
Python
# This source code is licensed under the MIT license found in the
|
|
# LICENSE file in the root directory of this source tree.
|
|
|
|
from collections import defaultdict
|
|
import copy
|
|
import json
|
|
import os
|
|
from os.path import exists, join, isdir
|
|
from dataclasses import dataclass, field
|
|
import sys
|
|
from typing import Optional, Dict, Sequence
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
import logging
|
|
import bitsandbytes as bnb
|
|
import pandas as pd
|
|
|
|
import torch
|
|
import transformers
|
|
from torch.nn.utils.rnn import pad_sequence
|
|
import argparse
|
|
from transformers import (
|
|
AutoTokenizer,
|
|
AutoModelForCausalLM,
|
|
set_seed,
|
|
Seq2SeqTrainer,
|
|
BitsAndBytesConfig,
|
|
LlamaTokenizer,
|
|
EvalPrediction
|
|
|
|
)
|
|
from datasets import load_dataset, Dataset
|
|
import evaluate
|
|
|
|
from peft import (
|
|
prepare_model_for_kbit_training,
|
|
LoraConfig,
|
|
get_peft_model,
|
|
PeftModel
|
|
)
|
|
from peft.tuners.lora import LoraLayer
|
|
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
|
|
from typing import Optional, Dict, List, Union, Tuple, Any
|
|
import torch.nn.functional as F
|
|
|
|
# Permit TF32 matmul kernels in the CUDA backend.
torch.backends.cuda.matmul.allow_tf32 = True

# Every log record is written both to this file and to stdout.
logging_file_path = "./qlora_dpo_logs.log"

handlers = [
    logging.FileHandler(logging_file_path),
    logging.StreamHandler(sys.stdout),
]

logging.basicConfig(
    handlers=handlers,
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

logger = logging.getLogger(__name__)

# Label value conventionally skipped by loss functions.
IGNORE_INDEX = -100
# Pad token that is registered when the tokenizer does not define one.
DEFAULT_PAD_TOKEN = "[PAD]"
|
|
|
|
@dataclass
class ModelArguments:
    """Command-line options that select the base model to fine-tune."""

    # Hub id or local path handed to `AutoModelForCausalLM.from_pretrained`.
    model_name_or_path: Optional[str] = field(default="EleutherAI/pythia-12b")

    # Forwarded as `trust_remote_code` when the models are loaded.
    trust_remote_code: Optional[bool] = field(
        default=False,
        metadata={
            "help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."
        },
    )
|
|
|
|
@dataclass
class DataArguments:
    """Command-line options describing the dataset and its size limits."""

    # --- evaluation / truncation sizes ---
    eval_dataset_size: int = field(
        default=1024,
        metadata={"help": "Size of validation dataset."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
                    "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                    "value if set."
        },
    )

    # --- tokenization length limits ---
    source_max_len: int = field(
        default=1024,
        metadata={
            "help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    target_max_len: int = field(
        default=256,
        metadata={
            "help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )

    # --- dataset selection ---
    dataset: str = field(
        default='hh-rlhf',
        metadata={"help": "Which dataset to finetune on. See datamodule for options."},
    )
    dataset_format: Optional[str] = field(
        default='hh-rlhf',
        metadata={"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"},
    )
|
|
|
|
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    """Seq2SeqTrainingArguments extended with QLoRA and DPO specific options.

    Fields that already exist on the parent class are redeclared here only to
    change their default value.
    """

    cache_dir: Optional[str] = field(default=None)
    train_on_source: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to train on the input in addition to the target text."},
    )

    # --- MMLU evaluation ---
    mmlu_split: Optional[str] = field(
        default='eval',
        metadata={"help": "The MMLU split to run on"},
    )
    mmlu_dataset: Optional[str] = field(
        default='mmlu-fs',
        metadata={"help": "MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot."},
    )
    do_mmlu_eval: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to run the MMLU evaluation."},
    )
    max_mmlu_samples: Optional[int] = field(
        default=None,
        metadata={"help": "If set, only evaluates on `max_mmlu_samples` of the MMMLU dataset."},
    )
    mmlu_source_max_len: int = field(
        default=2048,
        metadata={"help": "Maximum source sequence length for mmlu."},
    )

    # --- quantization / adapters ---
    full_finetune: bool = field(
        default=False,
        metadata={"help": "Finetune the entire model without adapters."},
    )
    adam8bit: bool = field(
        default=False,
        metadata={"help": "Use 8-bit adam."},
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."},
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."},
    )
    bits: int = field(
        default=4,
        metadata={"help": "How many bits to use."},
    )
    lora_r: int = field(
        default=64,
        metadata={"help": "Lora R dimension."},
    )
    lora_alpha: float = field(
        default=16,
        metadata={"help": " Lora alpha."},
    )
    lora_dropout: float = field(
        default=0.0,
        metadata={"help": "Lora dropout."},
    )
    max_memory_MB: int = field(
        default=80000,
        metadata={"help": "Free memory per gpu."},
    )

    # --- trainer defaults ---
    report_to: str = field(
        default='none',
        metadata={"help": "To use wandb or something else for reporting."},
    )
    output_dir: str = field(
        default='./output',
        metadata={"help": 'The output dir for logs and checkpoints'},
    )
    optim: str = field(
        default='paged_adamw_32bit',
        metadata={"help": 'The optimizer to be used'},
    )
    per_device_train_batch_size: int = field(
        default=1,
        metadata={"help": 'The training batch size per GPU. Increase for better speed.'},
    )
    gradient_accumulation_steps: int = field(
        default=16,
        metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'},
    )
    max_steps: int = field(
        default=10000,
        metadata={"help": 'How many optimizer update steps to take'},
    )
    # Use lora dropout instead for regularization if needed.
    weight_decay: float = field(
        default=0.0,
        metadata={"help": 'The L2 weight decay rate of AdamW'},
    )
    learning_rate: float = field(
        default=0.0002,
        metadata={"help": 'The learnign rate'},
    )
    remove_unused_columns: bool = field(
        default=False,
        metadata={"help": 'Removed unused columns. Needed to make this codebase work.'},
    )
    max_grad_norm: float = field(
        default=0.3,
        metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'},
    )
    gradient_checkpointing: bool = field(
        default=True,
        metadata={"help": 'Use gradient checkpointing. You want to use this.'},
    )
    do_train: bool = field(
        default=True,
        metadata={"help": 'To train or not to train, that is the question?'},
    )
    lr_scheduler_type: str = field(
        default='constant',
        metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'},
    )
    warmup_ratio: float = field(
        default=0.03,
        metadata={"help": 'Fraction of steps to do a warmup for'},
    )
    logging_steps: int = field(
        default=10,
        metadata={"help": 'The frequency of update steps after which to log the loss'},
    )
    group_by_length: bool = field(
        default=True,
        metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'},
    )
    save_strategy: str = field(
        default='steps',
        metadata={"help": 'When to save checkpoints'},
    )
    save_steps: int = field(
        default=250,
        metadata={"help": 'How often to save a model'},
    )
    save_total_limit: int = field(
        default=40,
        metadata={"help": 'How many checkpoints to save before the oldest is overwritten'},
    )

    # --- script-specific switches ---
    sample_generate: bool = field(
        default=False,
        metadata={"help": 'If do sample generation on evaluation.'},
    )
    debug_mode: bool = field(
        default=False,
        metadata={"help": 'debug mode sample 200 train/eval samples for validation'},
    )

    # --- DPO ---
    reference_model: str = field(
        default="timdettmers/qlora-hh-rlhf-7b",
        metadata={"help": 'pretrained reference sft model name or path'},
    )
    reference_free: bool = field(
        default=False,
        metadata={"help": 'If True, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal probability to all responses.'},
    )
    beta: float = field(
        default=0.1,
        metadata={"help": 'Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0.'},
    )
|
|
|
|
@dataclass
class GenerationArguments:
    """Generation hyperparameters; `train` unpacks them into a `GenerationConfig`.

    For more hyperparameters check:
    https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    """

    # Length arguments
    max_new_tokens: Optional[int] = field(
        default=256,
        metadata={
            # The trailing space matters: the two literals are concatenated,
            # and without it the help text read "...loopsif predict...".
            "help": "Maximum number of new tokens to be generated in evaluation or prediction loops "
                    "if predict_with_generate is set."
        },
    )
    min_new_tokens: Optional[int] = field(
        default=None,
        metadata={"help": "Minimum number of new tokens to generate."},
    )

    # Generation strategy
    do_sample: Optional[bool] = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    penalty_alpha: Optional[float] = field(default=None)
    use_cache: Optional[bool] = field(default=True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default=1.0)
    top_k: Optional[int] = field(default=50)
    top_p: Optional[float] = field(default=1.0)
    typical_p: Optional[float] = field(default=1.0)
    diversity_penalty: Optional[float] = field(default=0.0)
    repetition_penalty: Optional[float] = field(default=1.0)
    length_penalty: Optional[float] = field(default=1.0)
    no_repeat_ngram_size: Optional[int] = field(default=0)
|
|
|
|
def find_all_linear_names(args, model):
    """Collect the leaf names of every linear layer that LoRA should target.

    Args:
        args: Namespace with a `bits` attribute. 4 selects `bnb.nn.Linear4bit`,
            8 selects `bnb.nn.Linear8bitLt`, anything else `torch.nn.Linear`.
        model: Module whose `named_modules()` is scanned.

    Returns:
        Sorted list of unique leaf module names (the last component of the
        dotted path), with `lm_head` excluded. Sorting keeps the result
        deterministic across runs.
    """
    # Resolve the class lazily so bitsandbytes is only touched when needed.
    if args.bits == 4:
        linear_cls = bnb.nn.Linear4bit
    elif args.bits == 8:
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    lora_module_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # needed for 16-bit: the output head is a plain Linear but must not be adapted
    lora_module_names.discard('lm_head')
    return sorted(lora_module_names)
|
|
|
|
|
|
class SampleGenerateCallback(transformers.TrainerCallback):
    "A callback that prints a sample generations of the model in the process of training"

    def on_evaluate(self, args, state, control, **kwargs):
        """Generate and log completions for a few fixed prompts after evaluation.

        Reads `model` and the tokenizer from `kwargs`. If either is missing
        the callback logs a message and does nothing.
        """
        logger.info("on_evaluate in SampleGenerateCallback...")
        sample_inputs = [
            '如果一头大象站在一张脆弱的椅子上,椅子会破裂吗?',
            '什么是机器学习?它有哪些应用场景?',
            '如果细菌对抗生素产生了耐药性,那么为什么它们不能对所有抗生素都免疫?'
        ]

        model = kwargs.get('model')
        if model is None:
            logger.info(f"model not found in kwargs, skipping")
            return

        # NOTE(review): newer transformers releases appear to pass the tokenizer
        # under `processing_class` instead of `tokenizer` -- accept either.
        tokenizer = kwargs.get('tokenizer', kwargs.get('processing_class'))
        if tokenizer is None:
            logger.info("tokenizer not found in kwargs, skipping")
            return

        # Put the prompt on the same device as the model's first parameter
        # rather than assuming a CUDA device is present.
        device = next(model.parameters()).device

        for sample_input in sample_inputs:
            inputs = "Below is an instruction that describes a task. " \
                     "Write a response that appropriately completes the request.\n\n" \
                     "### Instruction:\n{sample_input}\n\n### Response: ".format(sample_input=sample_input)
            logger.info(f"sample input: {inputs}")

            input_ids = tokenizer(inputs, return_tensors="pt")['input_ids'].to(device)
            # Sampling is inference only; no autograd graph is needed.
            with torch.no_grad():
                generation_output = model.generate(
                    input_ids=input_ids,
                    max_new_tokens=370,
                )
            logger.info(f"sample output: {tokenizer.decode(generation_output[0])}")
|
|
|
|
|
|
|
|
class SavePeftModelCallback(transformers.TrainerCallback):
    """Trainer callback that stores only the PEFT adapter for each checkpoint."""

    def save_model(self, args, state, kwargs):
        """Write the adapter to `<checkpoint>/adapter_model` and drop the full weights.

        Args:
            args: Training arguments; `output_dir` is read.
            state: Trainer state; `best_model_checkpoint` and `global_step` are read.
            kwargs: Callback keyword arguments; must contain `model`.
        """
        logger.info('Saving PEFT checkpoint...')
        if state.best_model_checkpoint is not None:
            # Use the checkpoint directory itself. Appending "adapter_model"
            # here as well nested the adapter one level too deep
            # (<ckpt>/adapter_model/adapter_model), where the loader in this
            # file does not look for it.
            checkpoint_folder = state.best_model_checkpoint
        else:
            checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
        kwargs["model"].save_pretrained(peft_model_path)

        # Remove the full-weight file if present in the checkpoint folder;
        # only the adapter is kept.
        pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin")
        if os.path.exists(pytorch_model_path):
            os.remove(pytorch_model_path)

    def on_save(self, args, state, control, **kwargs):
        """Save the adapter whenever the trainer writes a checkpoint."""
        self.save_model(args, state, kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        """Create the `completed` marker file and save the final adapter."""
        def touch(fname, times=None):
            # Create the file if needed and update its timestamps.
            with open(fname, 'a'):
                os.utime(fname, times)

        touch(join(args.output_dir, 'completed'))
        self.save_model(args, state, kwargs)
|
|
|
|
def get_reference_model(args, checkpoint_dir):
    """Load the reference model that the DPO loss compares the policy against.

    Args:
        args: Namespace providing `reference_model`, `cache_dir`, `bits`,
            `fp16`, `bf16`, `double_quant`, `quant_type`, `max_memory_MB`,
            `full_finetune` and `trust_remote_code`.
        checkpoint_dir: Not used by this function.

    Returns:
        The loaded causal LM, in eval mode with all parameters frozen.

    Raises:
        ValueError: If `full_finetune` is requested with `bits` not in {16, 32}.
    """
    n_gpus = torch.cuda.device_count()
    max_memory = {i: f'{args.max_memory_MB}MB' for i in range(n_gpus)}

    # Explicit exception instead of `assert`, which is stripped under -O.
    if args.full_finetune and args.bits not in [16, 32]:
        raise ValueError(f"full_finetune requires bits to be 16 or 32, got {args.bits}")

    logger.info(f'loading reference model {args.reference_model}...')
    compute_dtype = (torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
    torch_dtype = (torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))

    # Quantization is described only through `quantization_config`. Also
    # passing `load_in_4bit` / `load_in_8bit` as direct kwargs is rejected by
    # recent transformers releases. No config is passed for 16/32-bit loads.
    quantization_config = None
    if args.bits in (4, 8):
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=args.bits == 4,
            load_in_8bit=args.bits == 8,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.double_quant,
            bnb_4bit_quant_type=args.quant_type
        )

    model = AutoModelForCausalLM.from_pretrained(
        args.reference_model,
        cache_dir=args.cache_dir,
        device_map='auto',
        max_memory=max_memory,
        quantization_config=quantization_config,
        torch_dtype=torch_dtype,
        trust_remote_code=args.trust_remote_code,
    )
    if compute_dtype == torch.float16 and args.bits == 4:
        major, minor = torch.cuda.get_device_capability()
        if major >= 8:
            logger.info('='*80)
            logger.info('Your GPU supports bfloat16, you can accelerate training with the argument --bf16')
            logger.info('='*80)

    setattr(model, 'model_parallel', True)
    setattr(model, 'is_parallelizable', True)

    model.config.torch_dtype = torch_dtype

    # The reference model is only ever evaluated, so freeze it explicitly.
    model.eval()
    for param in model.parameters():
        param.requires_grad_(False)
    return model
|
|
|
|
|
|
def get_accelerate_model(args, checkpoint_dir):
    """Load the policy model and, unless fully fine-tuning, attach LoRA adapters.

    Args:
        args: Namespace providing `model_name_or_path`, `cache_dir`, `bits`,
            `fp16`, `bf16`, `double_quant`, `quant_type`, `max_memory_MB`,
            `full_finetune`, `gradient_checkpointing`, `trust_remote_code`
            and the `lora_*` settings.
        checkpoint_dir: Directory of a previous checkpoint, or None. When
            given, adapters are loaded from `<checkpoint_dir>/adapter_model`
            instead of being created fresh.

    Returns:
        The model ready for training.

    Raises:
        ValueError: If `full_finetune` is requested with `bits` not in {16, 32}.
    """
    n_gpus = torch.cuda.device_count()
    max_memory = {i: f'{args.max_memory_MB}MB' for i in range(n_gpus)}

    # Explicit exception instead of `assert`, which is stripped under -O.
    if args.full_finetune and args.bits not in [16, 32]:
        raise ValueError(f"full_finetune requires bits to be 16 or 32, got {args.bits}")

    logger.info(f'loading base model {args.model_name_or_path}...')
    compute_dtype = (torch.float16 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))
    torch_dtype = (torch.float32 if args.fp16 else (torch.bfloat16 if args.bf16 else torch.float32))

    # Quantization is described only through `quantization_config`. Also
    # passing `load_in_4bit` / `load_in_8bit` as direct kwargs is rejected by
    # recent transformers releases. No config is passed for 16/32-bit loads.
    quantization_config = None
    if args.bits in (4, 8):
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=args.bits == 4,
            load_in_8bit=args.bits == 8,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.double_quant,
            bnb_4bit_quant_type=args.quant_type
        )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        device_map='auto',
        max_memory=max_memory,
        quantization_config=quantization_config,
        torch_dtype=torch_dtype,
        trust_remote_code=args.trust_remote_code,
    )
    if compute_dtype == torch.float16 and args.bits == 4:
        major, minor = torch.cuda.get_device_capability()
        if major >= 8:
            logger.info('='*80)
            logger.info('Your GPU supports bfloat16, you can accelerate training with the argument --bf16')
            logger.info('='*80)

    setattr(model, 'model_parallel', True)
    setattr(model, 'is_parallelizable', True)

    model.config.torch_dtype = torch_dtype

    if not args.full_finetune:
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=args.gradient_checkpointing)
    if args.gradient_checkpointing:
        model.gradient_checkpointing_enable()

    if not args.full_finetune:
        if checkpoint_dir is not None:
            logger.info("Loading adapters from checkpoint.")
            model = PeftModel.from_pretrained(model, join(checkpoint_dir, 'adapter_model'), is_trainable=True)
        else:
            logger.info(f'adding LoRA modules...')
            modules = find_all_linear_names(args, model)
            config = LoraConfig(
                r=args.lora_r,
                lora_alpha=args.lora_alpha,
                target_modules=modules,
                lora_dropout=args.lora_dropout,
                bias="none",
                task_type="CAUSAL_LM",
            )
            model = get_peft_model(model, config)

    # Per-module dtype fix-ups: LoRA layers follow bf16, norm layers are
    # cast to fp32, and fp32 head/embedding weights follow bf16.
    for name, module in model.named_modules():
        if isinstance(module, LoraLayer):
            if args.bf16:
                module = module.to(torch.bfloat16)
        if 'norm' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'embed_tokens' in name:
            if hasattr(module, 'weight'):
                if args.bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16)
    return model
|
|
|
|
def print_trainable_parameters(args, model):
    """
    Log how many parameters are trainable, in absolute terms and as a percentage.

    The trainable count is halved when `args.bits == 4`.
    """
    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, needs_grad in counts if needs_grad)
    if args.bits == 4:
        trainable_params /= 2
    percentage = 100 * trainable_params / all_param
    logger.info(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable: {percentage}"
    )
|
|
|
|
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Add special tokens and grow the model's embeddings to match.

    Rows created for the new tokens are set to the mean of the existing
    rows, in both the input and the output embedding matrices.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if added <= 0:
        return

    # Fetch both matrices and compute both means before writing anything.
    matrices = [
        model.get_input_embeddings().weight.data,
        model.get_output_embeddings().weight.data,
    ]
    means = [matrix[:-added].mean(dim=0, keepdim=True) for matrix in matrices]
    for matrix, mean_row in zip(matrices, means):
        matrix[-added:] = mean_row
|
|
|
|
@dataclass
class DataCollatorForCausalLM(object):
    """Collate `chosen` / `rejected` text pairs into padded id tensors.

    Only `tokenizer`, `source_max_len` and `target_max_len` are read by
    `__call__`; the remaining fields are stored but not used here.
    """

    tokenizer: transformers.PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Tokenize and pad a batch.

        Chosen and rejected sequences are padded together, so both halves of
        the batch share one sequence length. Attention masks are derived from
        the pad token id.
        """
        tok = self.tokenizer
        pair_count = len(instances)

        # Wrap each text in explicit BOS/EOS; the tokenizer adds none itself.
        chosen_texts = [f"{tok.bos_token}{item['chosen']}{tok.eos_token}" for item in instances]
        rejected_texts = [f"{tok.bos_token}{item['rejected']}{tok.eos_token}" for item in instances]

        # NOTE(review): chosen is truncated to source_max_len while rejected is
        # truncated to target_max_len -- confirm this asymmetry is intended.
        chosen_ids = tok(
            chosen_texts,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )['input_ids']
        rejected_ids = tok(
            rejected_texts,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )['input_ids']

        # Chosen rows first, rejected rows second; split again after padding.
        sequences = [torch.tensor(ids) for ids in chosen_ids]
        sequences += [torch.tensor(ids) for ids in rejected_ids]
        padded = pad_sequence(sequences, batch_first=True, padding_value=tok.pad_token_id)

        chosen_batch = padded[:pair_count]
        rejected_batch = padded[pair_count:]
        return {
            'chosen_input_ids': chosen_batch,
            'chosen_attention_mask': chosen_batch.ne(tok.pad_token_id),
            'rejected_input_ids': rejected_batch,
            'rejected_attention_mask': rejected_batch.ne(tok.pad_token_id),
            'return_loss': True,
        }
|
|
|
|
def extract_unnatural_instructions_data(examples, extract_reformulations=False):
    """Flatten a batch of examples into parallel `input` / `output` lists.

    Args:
        examples: Mapping with an `instances` list of instance lists and,
            when `extract_reformulations` is set, a `reformulations` list
            whose entries may be None.
        extract_reformulations: Also append the reformulated instances,
            after all regular instances.

    Returns:
        Dict with keys `input` and `output` holding the flattened values.
    """
    groups = list(examples['instances'])
    if extract_reformulations:
        groups.extend(
            group for group in examples['reformulations'] if group is not None
        )

    inputs, outputs = [], []
    for group in groups:
        for item in group:
            inputs.append(item['instruction_with_input'])
            outputs.append(item['output'])
    return {'input': inputs, 'output': outputs}
|
|
|
|
# Alpaca-style prompt templates: one for examples that carry an extra input
# field, one for instruction-only examples.
ALPACA_PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: "
    ),
)
|
|
|
|
def extract_alpaca_dataset(example):
    """Render one example into its Alpaca prompt.

    The template with an input section is used whenever the example's
    `input` value differs from the empty string; otherwise the
    instruction-only template is used.

    Returns:
        Dict with a single `input` key holding the formatted prompt.
    """
    has_input = example.get("input", "") != ""
    template_key = "prompt_input" if has_input else "prompt_no_input"
    return {'input': ALPACA_PROMPT_DICT[template_key].format(**example)}
|
|
|
|
def local_dataset(dataset_name):
    """Load a local dataset file and split it into train and test parts.

    Args:
        dataset_name: Path ending in `.json`, `.jsonl`, `.csv` or `.tsv`.

    Returns:
        The result of `train_test_split(test_size=0.1)` on the loaded data.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    if dataset_name.endswith(('.json', '.jsonl')):
        # `Dataset.from_json` requires `path_or_paths`. The previous
        # `filename=` keyword in the .jsonl branch left that argument unset
        # and raised TypeError.
        full_dataset = Dataset.from_json(path_or_paths=dataset_name)
    elif dataset_name.endswith('.csv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
    elif dataset_name.endswith('.tsv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
    else:
        raise ValueError(f"Unsupported dataset format: {dataset_name}")

    split_dataset = full_dataset.train_test_split(test_size=0.1)
    return split_dataset
|
|
|
|
def make_data_module(tokenizer: transformers.PreTrainedTokenizer, args) -> Dict:
    """
    Make the datasets and collator for DPO preference training.
    Datasets are expected to have the following columns: { `chosen`, `rejected` }

    Args:
        tokenizer: Tokenizer handed to the data collator.
        args: Namespace providing the dataset options (`dataset`,
            `dataset_format`, `debug_mode`, `max_*_samples`, ...) and the
            `do_train` / `do_eval` / `do_predict` switches.

    Returns:
        Dict with keys `train_dataset`, `eval_dataset`, `predict_dataset`
        (each None when the corresponding switch is off) and `data_collator`.
    """
    def load_data(dataset_name):
        # 'hh-rlhf' is a shortcut for the hub dataset; anything else is
        # tried as a local file first and as a hub dataset name otherwise.
        if dataset_name == 'hh-rlhf':
            return load_dataset("Anthropic/hh-rlhf")

        if os.path.exists(dataset_name):
            try:
                args.dataset_format = args.dataset_format if args.dataset_format else "hh-rlhf"
                return local_dataset(dataset_name)
            except Exception as err:
                # Chain the cause so the real failure stays visible, and do
                # not trap KeyboardInterrupt/SystemExit as a bare except did.
                raise ValueError(f"Error loading dataset from {dataset_name}") from err

        try:
            return load_dataset(dataset_name)
        except Exception as err:
            raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.") from err

    def format_dataset(dataset, dataset_format):
        if dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'rejected': x['rejected'],
                'chosen': x['chosen']
            })

        # Remove unused columns.
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['rejected', 'chosen']]
        )
        return dataset

    # Load dataset.
    dataset = load_data(args.dataset)
    if args.debug_mode:
        dataset['train'] = dataset['train'].filter(lambda x, i: i < 200, with_indices=True)
        # Not every dataset ships a test split.
        if 'test' in dataset:
            dataset['test'] = dataset['test'].filter(lambda x, i: i < 50, with_indices=True)
    dataset = format_dataset(dataset, args.dataset_format)

    # Split train/eval, reduce size
    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        elif 'test' in dataset:
            eval_dataset = dataset['test']
        else:
            logger.info('Splitting train dataset in train and validation according to `eval_dataset_size`')
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            # Character length of both texts, stored in a `length` column.
            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['chosen']) + len(x['rejected'])})

        logger.info(f"eval dataset: {eval_dataset}")

    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = train_dataset.map(lambda x: {'length': len(x['chosen']) + len(x['rejected'])})

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator
    )
|
|
|
|
def get_last_checkpoint(checkpoint_dir):
    """Find the most recent `checkpoint-<step>` directory of a previous run.

    Args:
        checkpoint_dir: Output directory of the training run.

    Returns:
        Tuple `(path, completed)`. `path` is the checkpoint directory with
        the highest step number, or None when there is none. `completed` is
        True when a `completed` marker exists in `checkpoint_dir`; in that
        case `path` is always None.
    """
    if not isdir(checkpoint_dir):
        return None, False  # first training

    if exists(join(checkpoint_dir, 'completed')):
        return None, True  # already finished

    prefix = 'checkpoint-'
    max_step = 0
    for filename in os.listdir(checkpoint_dir):
        step_text = filename[len(prefix):]
        # Only `checkpoint-<digits>` directories count. Names such as
        # `checkpoint-final` or `checkpoints` used to crash int().
        if not (filename.startswith(prefix) and step_text.isdecimal()):
            continue
        if isdir(join(checkpoint_dir, filename)):
            max_step = max(max_step, int(step_text))

    if max_step == 0:
        return None, False  # training started, but no checkpoint

    checkpoint_dir = join(checkpoint_dir, f'checkpoint-{max_step}')
    logger.info(f"Found a previous checkpoint at: {checkpoint_dir}")
    return checkpoint_dir, False  # checkpoint found!
|
|
|
|
def _get_batch_logps(logits: torch.FloatTensor, labels: torch.LongTensor, average_log_prob: bool = False,
|
|
tokenizer: transformers.PreTrainedTokenizer = None) -> torch.FloatTensor:
|
|
"""Compute the log probabilities of the given labels under the given logits.
|
|
|
|
Args:
|
|
logits: Logits of the model (unnormalized). Shape: (batch_size, sequence_length, vocab_size)
|
|
labels: Labels for which to compute the log probabilities. Label tokens with a value of -100 are ignored. Shape: (batch_size, sequence_length)
|
|
average_log_prob: If True, return the average log probability per (non-masked) token. Otherwise, return the sum of the log probabilities of the (non-masked) tokens.
|
|
|
|
Returns:
|
|
A tensor of shape (batch_size,) containing the average/sum log probabilities of the given labels under the given logits.
|
|
"""
|
|
assert logits.shape[:-1] == labels.shape
|
|
|
|
labels = labels[:, 1:].clone()
|
|
logits = logits[:, :-1, :]
|
|
loss_mask = (labels != tokenizer.pad_token_id)
|
|
|
|
# dummy token; we'll ignore the losses on these tokens later
|
|
labels[labels == tokenizer.pad_token_id] = 0
|
|
|
|
per_token_logps = torch.gather(logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)).squeeze(2)
|
|
|
|
if average_log_prob:
|
|
return (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
|
|
else:
|
|
return (per_token_logps * loss_mask).sum(-1)
|
|
|
|
def dpo_loss(policy_chosen_logps: torch.FloatTensor,
             policy_rejected_logps: torch.FloatTensor,
             reference_chosen_logps: torch.FloatTensor,
             reference_rejected_logps: torch.FloatTensor,
             beta: float,
             reference_free: bool = False) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
    """Compute the DPO loss for a batch of policy and reference model log probabilities.

    Args:
        policy_chosen_logps: Policy log probabilities of the chosen responses. Shape: (batch_size,)
        policy_rejected_logps: Policy log probabilities of the rejected responses. Shape: (batch_size,)
        reference_chosen_logps: Reference log probabilities of the chosen responses. Shape: (batch_size,)
        reference_rejected_logps: Reference log probabilities of the rejected responses. Shape: (batch_size,)
        beta: Temperature of the DPO loss, typically in the range 0.1 to 0.5.
        reference_free: If True, the reference log-ratio is replaced by 0 in
            the loss. The rewards still use the provided reference values.

    Returns:
        Tuple `(losses, chosen_rewards, rejected_rewards)`, one entry per
        example. The rewards are detached from the graph.
    """
    try:
        policy_margin = policy_chosen_logps - policy_rejected_logps
        # Computed even in reference-free mode, then overridden below.
        reference_margin = reference_chosen_logps - reference_rejected_logps
        if reference_free:
            reference_margin = 0

        scaled_logits = beta * (policy_margin - reference_margin)
        losses = -F.logsigmoid(scaled_logits)

        chosen_rewards = beta * (policy_chosen_logps - reference_chosen_logps).detach()
        rejected_rewards = beta * (policy_rejected_logps - reference_rejected_logps).detach()
        return losses, chosen_rewards, rejected_rewards
    except Exception as e:
        import traceback

        # Record the failure in the log file before propagating it.
        logger.info(f"error: {e}")
        logger.info(traceback.format_exc())
        raise e
|
|
|
|
|
|
class DPOSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss is the DPO loss against a reference model."""

    def __init__(self, reference_model: torch.nn.Module,
                 beta: float,
                 reference_free: bool = False,
                 *argv, **kargv):
        """Store the DPO settings; all other arguments go to Seq2SeqTrainer.

        Args:
            reference_model: Model providing the reference log probabilities.
            beta: Temperature passed to `dpo_loss`.
            reference_free: Passed to `dpo_loss`.
        """
        super().__init__(*argv, **kargv)
        self.reference_model = reference_model
        self.beta = beta
        self.reference_free = reference_free
        # The batches contain no label columns.
        self.label_names = []

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the mean DPO loss for one batch of chosen/rejected pairs.

        Args:
            model: The policy model being trained.
            inputs: Batch with `chosen_input_ids`, `chosen_attention_mask`,
                `rejected_input_ids` and `rejected_attention_mask`.
            return_outputs: Also return the mean chosen/rejected rewards.
            num_items_in_batch: Accepted because newer Trainer versions pass
                it; not used by the DPO loss.

        Returns:
            The mean loss, or `(mean loss, rewards dict)` when
            `return_outputs` is True.
        """
        self.reference_model.eval()

        # NOTE(review): newer transformers releases appear to expose the
        # tokenizer as `processing_class`; fall back to `tokenizer` otherwise.
        tokenizer = getattr(self, 'processing_class', None)
        if tokenizer is None:
            tokenizer = self.tokenizer

        chosen_ids, chosen_mask = inputs['chosen_input_ids'], inputs['chosen_attention_mask']
        rejected_ids, rejected_mask = inputs['rejected_input_ids'], inputs['rejected_attention_mask']

        # The reference model only supplies constants, so skip autograd.
        with torch.no_grad():
            reference_chosen_logits = self.reference_model(input_ids=chosen_ids, attention_mask=chosen_mask).logits
            reference_rejected_logits = self.reference_model(input_ids=rejected_ids, attention_mask=rejected_mask).logits

        policy_chosen_logits = model(input_ids=chosen_ids, attention_mask=chosen_mask).logits
        policy_rejected_logits = model(input_ids=rejected_ids, attention_mask=rejected_mask).logits

        policy_chosen_logps = _get_batch_logps(policy_chosen_logits, chosen_ids, average_log_prob=False, tokenizer=tokenizer)
        policy_rejected_logps = _get_batch_logps(policy_rejected_logits, rejected_ids, average_log_prob=False, tokenizer=tokenizer)
        reference_chosen_logps = _get_batch_logps(reference_chosen_logits, chosen_ids, average_log_prob=False, tokenizer=tokenizer)
        reference_rejected_logps = _get_batch_logps(reference_rejected_logits, rejected_ids, average_log_prob=False, tokenizer=tokenizer)

        losses, chosen_rewards, rejected_rewards = dpo_loss(
            policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps,
            beta=self.beta, reference_free=self.reference_free)

        output_dict = {'chosen_rewards': chosen_rewards.mean(),
                       'rejected_rewards': rejected_rewards.mean()
                       }

        return (losses.mean(), output_dict) if return_outputs else losses.mean()
|
|
|
|
def compute_metrics(ep: EvalPrediction):
    """Average the first two prediction arrays and report them as rewards.

    Returns:
        Dict with `chosen_rewards` (mean of `ep.predictions[0]`) and
        `rejected_rewards` (mean of `ep.predictions[1]`).
    """
    chosen = ep.predictions[0]
    rejected = ep.predictions[1]
    return {'chosen_rewards': chosen.mean(), 'rejected_rewards': rejected.mean()}
|
|
|
|
def train():
    """Entry point: parse CLI arguments, build models/tokenizer/data, then
    run the requested train / eval / predict phases with the DPO trainer.

    Metrics from every phase that ran are merged and written to
    ``metrics.json`` in ``args.output_dir``; predictions (if requested) are
    written to ``predictions.jsonl`` in the same directory.
    """
    hfparser = transformers.HfArgumentParser((
        ModelArguments, DataArguments, TrainingArguments, GenerationArguments
    ))
    # The trailing element holds unrecognised CLI strings; it is not used.
    model_args, data_args, training_args, generation_args, _extra_args = \
        hfparser.parse_args_into_dataclasses(return_remaining_strings=True)
    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
    # Flatten the argument groups into a single namespace for the helpers.
    args = argparse.Namespace(
        **vars(model_args), **vars(data_args), **vars(training_args)
    )

    logger.info(f"args: {args}")

    checkpoint_dir, completed_training = get_last_checkpoint(args.output_dir)
    if completed_training:
        # Informational only: the run continues regardless.
        logger.info('Detected that training was already completed!')

    model = get_accelerate_model(args, checkpoint_dir)

    reference_model = get_reference_model(args, checkpoint_dir)
    logger.info(f"reference_model: {reference_model}")

    model.config.use_cache = False
    print_trainable_parameters(args, model)
    logger.info('loaded model')
    set_seed(args.seed)

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        cache_dir=args.cache_dir,
        padding_side="right",
        use_fast=False,  # Fast tokenizer giving issues.
        tokenizer_type='llama' if 'llama' in args.model_name_or_path else None,  # Needed for HF name change
    )
    if tokenizer._pad_token is None:
        # Add the pad token and resize the embeddings of both the policy and
        # the reference model with the same helper.
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=reference_model,
        )
    if 'llama' in args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer):
        # LLaMA tokenizer may not have correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these are present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
        logger.info('Adding special tokens.')
        tokenizer.add_special_tokens({
            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
            "unk_token": tokenizer.convert_ids_to_tokens(
                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
            ),
        })
    data_module = make_data_module(tokenizer=tokenizer, args=args)
    # No label columns are declared for the trainer.
    training_args.label_names = []
    trainer = DPOSeq2SeqTrainer(
        reference_model=reference_model,
        reference_free=args.reference_free,
        beta=args.beta,
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        compute_metrics=compute_metrics,
        # The prediction split is handed to `trainer.predict` explicitly below.
        **{k: v for k, v in data_module.items() if k != 'predict_dataset'},
    )

    logger.info(f"trainer label names: {trainer.label_names}")
    logger.info(f"trainer can_return_loss: {trainer.can_return_loss}")

    # Callbacks
    if not args.full_finetune:
        trainer.add_callback(SavePeftModelCallback)
    if args.sample_generate:
        trainer.add_callback(SampleGenerateCallback)

    # Verifying the datatypes: count parameters per dtype and log each share.
    dtypes = {}
    for _, p in model.named_parameters():
        dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
    total = sum(dtypes.values())
    for k, v in dtypes.items():
        # Explicit format string: passing the dtype itself as the message
        # made the logging module fail while formatting the record.
        logger.info("%s %s %s", k, v, v / total)

    all_metrics = {"run_name": args.run_name}
    # Training
    if args.do_train:
        logger.info("*** Train ***")
        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        all_metrics.update(metrics)
    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval")
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        all_metrics.update(metrics)
    # Prediction
    if args.do_predict:
        logger.info("*** Predict ***")
        prediction_output = trainer.predict(test_dataset=data_module['predict_dataset'], metric_key_prefix="predict")
        prediction_metrics = prediction_output.metrics
        predictions = prediction_output.predictions
        # Replace the -100 filler value with the pad id so decoding succeeds.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        predictions = tokenizer.batch_decode(
            predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        with open(os.path.join(args.output_dir, 'predictions.jsonl'), 'w') as fout:
            for i, example in enumerate(data_module['predict_dataset']):
                example['prediction_with_input'] = predictions[i].strip()
                # Remove the input text from the decoded prediction.
                example['prediction'] = predictions[i].replace(example['input'], '').strip()
                fout.write(json.dumps(example) + '\n')
        logger.info(prediction_metrics)
        trainer.log_metrics("predict", prediction_metrics)
        trainer.save_metrics("predict", prediction_metrics)
        all_metrics.update(prediction_metrics)

    if (args.do_train or args.do_eval or args.do_predict):
        with open(os.path.join(args.output_dir, "metrics.json"), "w") as fout:
            fout.write(json.dumps(all_metrics))
|
|
|
|
# Script entry point: run the full pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    train()
|