From 937c3491daddcfeabde27cd89f213d12f46a3a94 Mon Sep 17 00:00:00 2001 From: rasbt Date: Sat, 23 Mar 2024 06:50:34 -0500 Subject: [PATCH] add colon and semicolon to tokenizer --- ch02/01_main-chapter-code/ch02.ipynb | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ch02/01_main-chapter-code/ch02.ipynb b/ch02/01_main-chapter-code/ch02.ipynb index 065df856..c169128a 100644 --- a/ch02/01_main-chapter-code/ch02.ipynb +++ b/ch02/01_main-chapter-code/ch02.ipynb @@ -37,7 +37,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "torch version: 2.1.0\n", + "torch version: 2.2.1\n", "tiktoken version: 0.5.1\n" ] } @@ -273,7 +273,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "902f0d9c-9828-4c46-ba32-8fe810c3840a", + "id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374", "metadata": {}, "outputs": [ { @@ -287,7 +287,7 @@ "source": [ "text = \"Hello, world. Is this-- a test?\"\n", "\n", - "result = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n", + "result = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n", "result = [item.strip() for item in result if item.strip()]\n", "print(result)" ] @@ -750,7 +750,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f", "metadata": {}, "outputs": [], @@ -766,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "id": "57c3143b-e860-4d3b-a22a-de22b547a6a9", "metadata": {}, "outputs": [ @@ -776,7 +776,7 @@ "1161" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -787,7 +787,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959", "metadata": {}, "outputs": [ @@ -818,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "id": "948861c5-3f30-4712-a234-725f20d26f68", "metadata": {}, "outputs": [], @@ -854,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a", "metadata": {}, "outputs": [ @@ -879,7 +879,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "ddfe7346-398d-4bf8-99f1-5b071244ce95", "metadata": {}, "outputs": [ @@ -904,7 +904,7 @@ " 7]" ] }, - "execution_count": 26, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -915,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b", "metadata": {}, "outputs": [ @@ -925,7 +925,7 @@ "'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'" ] }, - "execution_count": 27, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1876,7 +1876,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" } }, "nbformat": 4,