Skip to content

Commit

Permalink
fix: s2s decoder only notebook (#811)
Browse files Browse the repository at this point in the history
Fix the decoder-only notebook
  • Loading branch information
bogdan-galileo authored Dec 4, 2023
1 parent 699fb19 commit 1dbcaf4
Showing 1 changed file with 13 additions and 70 deletions.
83 changes: 13 additions & 70 deletions docs/notebooks/Seq2Seq_Decoder_Only_Dq_Fake_Data.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,14 @@
"metadata": {},
"outputs": [],
"source": [
"dataset_size = 100\n",
"dataset_size = 10\n",
"\n",
"ds = load_dataset(\"billsum\")\n",
"ds = ds.remove_columns('text')\n",
"# Add ids\n",
"ds = ds.map(create_formatted_prompt, with_indices=True)\n",
"ds_train = Dataset.from_dict(ds['train'][:100])\n",
"ds_val = Dataset.from_dict(ds['test'][:100])\n",
"ds_train = Dataset.from_dict(ds['train'][:dataset_size])\n",
"ds_val = Dataset.from_dict(ds['test'][:dataset_size])\n",
"ds_train"
]
},
Expand All @@ -109,7 +109,7 @@
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM\n",
"from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM, PreTrainedTokenizerFast\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"facebook/opt-125m\", use_fast=True)\n",
"model = AutoModelForCausalLM.from_pretrained(\"facebook/opt-125m\")"
Expand Down Expand Up @@ -140,60 +140,6 @@
"ds_train[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e63464a5",
"metadata": {},
"outputs": [],
"source": [
"batch = ds_train[:10]\n",
"model_inputs = {\n",
" 'input_ids': batch['input_ids'],\n",
" 'attention_mask': batch['attention_mask'],\n",
" #'labels': batch['input_ids'].copy()\n",
"}\n",
"model_inputs = tokenizer.pad(model_inputs, padding=True, return_tensors='pt')\n",
"model_inputs['labels'] = model_inputs['input_ids'].clone()\n",
"model_outputs = model(**model_inputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f52621a",
"metadata": {},
"outputs": [],
"source": [
"model_outputs.logits.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c9cb17b",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"\n",
"fake_sample_ids = torch.tensor([[2, 337, 245, 103, 63, 839, 215, 239]])\n",
"fake_attention = torch.ones(1, 8)\n",
"fake_labels = fake_sample_ids.clone()\n",
"model_outputs = model(input_ids=fake_sample_ids, attention_mask=fake_attention, labels=fake_labels)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07256adb",
"metadata": {},
"outputs": [],
"source": [
"model_outputs.logits.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -206,6 +152,7 @@
"os.environ[\"GALILEO_USERNAME\"]=\"\"\n",
"os.environ[\"GALILEO_PASSWORD\"]=\"\"\n",
"\n",
"\n",
"import dataquality as dq\n",
"from dataquality.integrations.seq2seq.core import watch\n",
"dq.configure()"
Expand All @@ -218,19 +165,23 @@
"metadata": {},
"outputs": [],
"source": [
"dq.init(\"seq2seq\", project_name=\"Seq2Seq_DecoderOnly_Real_Model_shifted\")\n",
"dq.init(\"seq2seq\", project_name=\"Seq2Seq_DecoderOnly_Generation\")\n",
"\n",
"temperature = 0.4\n",
"temperature = 0.\n",
"generation_config = GenerationConfig(\n",
" max_new_tokens=15,\n",
" # Whether we use multinomial sampling\n",
" do_sample=temperature >= 1e-5,\n",
" temperature=temperature,\n",
")\n",
"\n",
"response_template = \"###Response:\"\n",
"response_template = tokenizer(response_template, add_special_tokens=False)[\"input_ids\"]\n",
"\n",
"watch(\n",
" model,\n",
" tokenizer,\n",
" \"decoder_only\",\n",
" model,\n",
" generation_config,\n",
" generation_splits=[],\n",
" max_input_tokens=1024,\n",
Expand Down Expand Up @@ -316,16 +267,8 @@
"metadata": {},
"outputs": [],
"source": [
"dq.finish()"
"dq.finish(data_embs_col=\"title\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d356e528",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 1dbcaf4

Please sign in to comment.