Skip to content

Commit

Permalink
add colon and semicolon to tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
rasbt committed Mar 23, 2024
1 parent 65053ef commit 937c349
Showing 1 changed file with 14 additions and 14 deletions.
28 changes: 14 additions & 14 deletions ch02/01_main-chapter-code/ch02.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"torch version: 2.1.0\n",
"torch version: 2.2.1\n",
"tiktoken version: 0.5.1\n"
]
}
Expand Down Expand Up @@ -273,7 +273,7 @@
{
"cell_type": "code",
"execution_count": 6,
"id": "902f0d9c-9828-4c46-ba32-8fe810c3840a",
"id": "ed3a9467-04b4-49d9-96c5-b8042bcf8374",
"metadata": {},
"outputs": [
{
Expand All @@ -287,7 +287,7 @@
"source": [
"text = \"Hello, world. Is this-- a test?\"\n",
"\n",
"result = re.split(r'([,.?_!\"()\\']|--|\\s)', text)\n",
"result = re.split(r'([,.:;?_!\"()\\']|--|\\s)', text)\n",
"result = [item.strip() for item in result if item.strip()]\n",
"print(result)"
]
Expand Down Expand Up @@ -750,7 +750,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 17,
"id": "ce9df29c-6c5b-43f1-8c1a-c7f7b79db78f",
"metadata": {},
"outputs": [],
Expand All @@ -766,7 +766,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 18,
"id": "57c3143b-e860-4d3b-a22a-de22b547a6a9",
"metadata": {},
"outputs": [
Expand All @@ -776,7 +776,7 @@
"1161"
]
},
"execution_count": 22,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -787,7 +787,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 19,
"id": "50e51bb1-ae05-4aa8-a9ff-455b65ed1959",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -818,7 +818,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 20,
"id": "948861c5-3f30-4712-a234-725f20d26f68",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -854,7 +854,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 21,
"id": "effcef79-e0a5-4f4a-a43a-31dd94b9250a",
"metadata": {},
"outputs": [
Expand All @@ -879,7 +879,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 22,
"id": "ddfe7346-398d-4bf8-99f1-5b071244ce95",
"metadata": {},
"outputs": [
Expand All @@ -904,7 +904,7 @@
" 7]"
]
},
"execution_count": 26,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -915,7 +915,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 23,
"id": "0c350ff6-2734-4e84-9ec7-d578baa4ae1b",
"metadata": {},
"outputs": [
Expand All @@ -925,7 +925,7 @@
"'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'"
]
},
"execution_count": 27,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1876,7 +1876,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 937c349

Please sign in to comment.