Refactor finetune so some of it can be used to check data and its tokenization
pseudotensor committed Apr 27, 2023
1 parent 31eef24 commit 0b74d7f
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 53 deletions.
55 changes: 54 additions & 1 deletion create_data.py
@@ -1463,4 +1463,57 @@ def create_personality_data():
    print(len(rows))
    with open("h2ogpt-personality.json", "w") as f:
        f.write(json.dumps(rows, indent=2))
    return rows


def test_check_stats_data():
    import os
    import numpy as np
    import pandas as pd
    filename = 'h2ogpt-oig-oasst1-instruct-cleaned-v2.json'
    df = pd.read_json(filename)

    # get character-count stats
    df['char_count'] = df['input'].apply(lambda x: len(x))
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 10))
    plt.hist(df['char_count'], bins=100)
    chars_avg = np.mean(df['char_count'])
    chars_median = np.median(df['char_count'])
    plt.title("char_count avg: %s median: %s" % (chars_avg, chars_median))
    plt.savefig('chars_hist.png')
    plt.close()

    # get tokenization stats for a random subsample (the 10% train split below)
    from finetune import get_loaders, get_tokenizer, generate_and_tokenize_prompt
    from functools import partial

    llama_type = True
    tokenizer_base_model = base_model = 'decapoda-research/llama-7b-hf'
    model_loader, tokenizer_loader = get_loaders(llama_type=llama_type, model_name=base_model, reward_type=False)
    local_files_only = False
    resume_download = True
    use_auth_token = False
    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)
    prompt_type = 'plain'  # trained with data already in human-bot form
    train_on_inputs = True
    add_eos_token = True
    cutoff_len = 512  # can choose 2048
    generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
                                               train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
                                               cutoff_len=cutoff_len, tokenizer=tokenizer)
    from datasets import load_dataset
    data = load_dataset("json", data_files={"train": filename})
    val_set_size = 0.90  # test_size of 0.90 leaves 10% of rows as the train split sampled here
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    train_data = train_val["train"]
    train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count())

    df_tokens = pd.DataFrame([len(x) for x in train_data['input_ids']], columns=['token_count'])

    plt.figure(figsize=(10, 10))
    plt.hist(df_tokens['token_count'], bins=100)
    token_avg = np.mean(df_tokens['token_count'])
    token_median = np.median(df_tokens['token_count'])
    plt.title("token_count with cutoff=%s avg: %s median: %s" % (cutoff_len, token_avg, token_median))
    plt.savefig('token_hist_%s.png' % cutoff_len)
    plt.close()
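
The new check can be run directly as a plain function; a minimal sketch, assuming the repository root is on the import path and the dataset JSON is in the working directory:

    from create_data import test_check_stats_data
    test_check_stats_data()  # writes chars_hist.png and token_hist_512.png
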
121 changes: 69 additions & 52 deletions finetune.py
@@ -1,6 +1,7 @@
import os
import sys
import time
from functools import partial
from typing import List, Union
from enum import Enum
import fire
@@ -255,56 +256,7 @@ def train(
        model.is_parallelizable = True
        model.model_parallel = True

    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
                                                 local_files_only=local_files_only,
                                                 resume_download=resume_download,
                                                 use_auth_token=use_auth_token)

    tokenizer.pad_token_id = 0  # different from the eos token
    # when generating, we will use the logits of right-most token to predict the next token
    # so the padding should be on the left,
    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
    tokenizer.padding_side = "left"  # Allow batched inference

    def tokenize(prompt, add_eos_token=True):
        # there's probably a way to do this with the tokenizer settings
        # but again, gotta move fast
        result = tokenizer(
            prompt,
            truncation=True,
            max_length=cutoff_len,
            padding=False,
            return_tensors=None,
        )
        if (
                result["input_ids"][-1] != tokenizer.eos_token_id
                and len(result["input_ids"]) < cutoff_len
                and add_eos_token
        ):
            result["input_ids"].append(tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()

        return result

    def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
        full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
        tokenized_full_prompt = tokenize(full_prompt)
        if not train_on_inputs:
            user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
            tokenized_user_prompt = tokenize(user_prompt, add_eos_token=add_eos)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])
            if add_eos:
                user_prompt_len -= 1

            # ignore_index=-100 ensures torch/tf don't include padding token id in CrossEntropyLoss
            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # could be sped up, probably
        return tokenized_full_prompt
    tokenizer = get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token)

    if train_8bit:
        from peft import (
@@ -489,10 +441,14 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):

    assert train_data is not None

    generate_and_tokenize_prompt_fun = partial(generate_and_tokenize_prompt, prompt_type=prompt_type,
                                               train_on_inputs=train_on_inputs, add_eos_token=add_eos_token,
                                               cutoff_len=cutoff_len, tokenizer=tokenizer)

    # shuffle and tokenize data
    if train_data_mix_in:
        train_data = concatenate_datasets([train_data, train_data_mix_in])
    train_data = train_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
    train_data = train_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
    train_set_size = len(train_data)

    if valid_data and valid_data_mix_in:
@@ -501,7 +457,7 @@ def generate_and_tokenize_prompt(data_point, add_eos=add_eos_token):
        valid_data = valid_data_mix_in

    if valid_data:
        valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt, num_proc=os.cpu_count() // torch.cuda.device_count())
        valid_data = valid_data.shuffle().map(generate_and_tokenize_prompt_fun, num_proc=os.cpu_count() // torch.cuda.device_count())
        val_set_size = len(valid_data)
    else:
        val_set_size = 0
@@ -702,6 +658,67 @@ def get_loaders(llama_type, model_name, reward_type):
    return model_loader, tokenizer_loader


def get_tokenizer(tokenizer_loader, tokenizer_base_model, local_files_only, resume_download, use_auth_token):
    tokenizer = tokenizer_loader.from_pretrained(tokenizer_base_model,
                                                 local_files_only=local_files_only,
                                                 resume_download=resume_download,
                                                 use_auth_token=use_auth_token)

    tokenizer.pad_token_id = 0  # different from the eos token
    # when generating, we will use the logits of right-most token to predict the next token
    # so the padding should be on the left,
    # e.g. see: https://huggingface.co/transformers/v4.11.3/model_doc/t5.html#inference
    tokenizer.padding_side = "left"  # Allow batched inference

    return tokenizer


def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True):
    # there's probably a way to do this with the tokenizer settings
    # but again, gotta move fast
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    if (
            result["input_ids"][-1] != tokenizer.eos_token_id
            and len(result["input_ids"]) < cutoff_len
            and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result


def generate_and_tokenize_prompt(data_point, prompt_type=None, train_on_inputs=False, add_eos_token=False,
                                 cutoff_len=None, tokenizer=None):
    assert prompt_type is not None
    assert cutoff_len is not None
    assert tokenizer is not None
    full_prompt, _, _ = generate_prompt(data_point, prompt_type, False, False)
    tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
    if not train_on_inputs:
        user_prompt, _, _ = generate_prompt({**data_point, "output": ""}, prompt_type, False, False)
        tokenized_user_prompt = tokenize(user_prompt, tokenizer, cutoff_len, add_eos_token=add_eos_token)
        user_prompt_len = len(tokenized_user_prompt["input_ids"])
        if add_eos_token:
            user_prompt_len -= 1

        # labels set to -100 are skipped by CrossEntropyLoss (ignore_index=-100),
        # so no loss is computed on the user-prompt tokens
        tokenized_full_prompt["labels"] = [
            -100
        ] * user_prompt_len + tokenized_full_prompt["labels"][
            user_prompt_len:
        ]  # could be sped up, probably
    return tokenized_full_prompt


def get_prompt(prompt_type, chat, context, reduced):
    if prompt_type in [-1, "-1", "plain"]:
        promptA = promptB = PreInstruct = PreInput = PreResponse = ''
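
For reference on the label masking in generate_and_tokenize_prompt: -100 is the default ignore_index of CrossEntropyLoss in PyTorch, so positions labeled -100 (the user-prompt tokens when train_on_inputs is False) contribute nothing to the loss. A minimal sketch of that behavior, with illustrative values only:

    import torch
    import torch.nn.functional as F

    logits = torch.randn(5, 32000)                  # 5 token positions, LLaMA-sized vocab
    labels = torch.tensor([-100, -100, 42, 7, 13])  # first two positions masked out
    loss = F.cross_entropy(logits, labels)          # averaged over the 3 unmasked positions only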
