diff --git a/docs/notebooks/Seq2Seq_Decoder_Only_Dq_Fake_Data.ipynb b/docs/notebooks/Seq2Seq_Decoder_Only_Dq_Fake_Data.ipynb index 51868c134..c01016ac0 100644 --- a/docs/notebooks/Seq2Seq_Decoder_Only_Dq_Fake_Data.ipynb +++ b/docs/notebooks/Seq2Seq_Decoder_Only_Dq_Fake_Data.ipynb @@ -81,14 +81,14 @@ "metadata": {}, "outputs": [], "source": [ - "dataset_size = 100\n", + "dataset_size = 10\n", "\n", "ds = load_dataset(\"billsum\")\n", "ds = ds.remove_columns('text')\n", "# Add ids\n", "ds = ds.map(create_formatted_prompt, with_indices=True)\n", - "ds_train = Dataset.from_dict(ds['train'][:100])\n", - "ds_val = Dataset.from_dict(ds['test'][:100])\n", + "ds_train = Dataset.from_dict(ds['train'][:dataset_size])\n", + "ds_val = Dataset.from_dict(ds['test'][:dataset_size])\n", "ds_train" ] }, @@ -109,7 +109,7 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM\n", + "from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM, PreTrainedTokenizerFast\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"facebook/opt-125m\", use_fast=True)\n", "model = AutoModelForCausalLM.from_pretrained(\"facebook/opt-125m\")" @@ -140,60 +140,6 @@ "ds_train[0]" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "e63464a5", - "metadata": {}, - "outputs": [], - "source": [ - "batch = ds_train[:10]\n", - "model_inputs = {\n", - " 'input_ids': batch['input_ids'],\n", - " 'attention_mask': batch['attention_mask'],\n", - " #'labels': batch['input_ids'].copy()\n", - "}\n", - "model_inputs = tokenizer.pad(model_inputs, padding=True, return_tensors='pt')\n", - "model_inputs['labels'] = model_inputs['input_ids'].clone()\n", - "model_outputs = model(**model_inputs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f52621a", - "metadata": {}, - "outputs": [], - "source": [ - "model_outputs.logits.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c9cb17b", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "\n", - "fake_sample_ids = torch.tensor([[2, 337, 245, 103, 63, 839, 215, 239]])\n", - "fake_attention = torch.ones(1, 8)\n", - "fake_labels = fake_sample_ids.clone()\n", - "model_outputs = model(input_ids=fake_sample_ids, attention_mask=fake_attention, labels=fake_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "07256adb", - "metadata": {}, - "outputs": [], - "source": [ - "model_outputs.logits.shape" - ] - }, { "cell_type": "code", "execution_count": null, @@ -206,6 +152,7 @@ "os.environ[\"GALILEO_USERNAME\"]=\"\"\n", "os.environ[\"GALILEO_PASSWORD\"]=\"\"\n", "\n", + "\n", "import dataquality as dq\n", "from dataquality.integrations.seq2seq.core import watch\n", "dq.configure()" @@ -218,9 +165,9 @@ "metadata": {}, "outputs": [], "source": [ - "dq.init(\"seq2seq\", project_name=\"Seq2Seq_DecoderOnly_Real_Model_shifted\")\n", + "dq.init(\"seq2seq\", project_name=\"Seq2Seq_DecoderOnly_Generation\")\n", "\n", - "temperature = 0.4\n", + "temperature = 0.\n", "generation_config = GenerationConfig(\n", " max_new_tokens=15,\n", " # Whether we use multinomial sampling\n", @@ -228,9 +175,13 @@ " temperature=temperature,\n", ")\n", "\n", + "response_template = \"###Response:\"\n", + "response_template = tokenizer(response_template, add_special_tokens=False)[\"input_ids\"]\n", + "\n", "watch(\n", - " model,\n", " tokenizer,\n", + " \"decoder_only\",\n", + " model,\n", " generation_config,\n", " generation_splits=[],\n", " max_input_tokens=1024,\n", @@ -316,16 +267,8 @@ "metadata": {}, "outputs": [], "source": [ - "dq.finish()" + "dq.finish(data_embs_col=\"title\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d356e528", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {