Commit

update README.md
cahya-wirawan committed Aug 17, 2024
1 parent b2336e8 commit 80a4d9e
Showing 3 changed files with 6 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -74,3 +74,4 @@ docs/_build/
 tmp/
 tools/rwkv_tokenizers_bpe.ipynb
 data/wiki-en-tiny.jsonl
+target/
3 changes: 2 additions & 1 deletion README.md
@@ -56,7 +56,8 @@ tokenizer is around 17x faster than the original tokenizer and 9.6x faster than
 
 ![performance-comparison](data/performance-comparison.png)
 
-We compared also the multithreading/batch encoding performance using the [Huggingface Tokenizers comparison script](https://github.com/huggingface/tokenizers/blob/main/bindings/python/benches/test_tiktoken.py):
+We also compared the multithreading/batch encoding performance using a [script](tools/test_tiktoken-huggingface-rwkv.py)
+which is based on the [Huggingface Tokenizers](https://github.com/huggingface/tokenizers) comparison script:
 ![performance-comparison](data/performance-comparison-multithreading.png)
 
 *The simple English Wikipedia dataset can be downloaded as jsonl file from
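For context, a minimal sketch of the batch-encoding comparison the updated README line points to. The tiktoken and Huggingface calls mirror the script changed below; the rwkv-tokenizer import and constructor are assumptions about this repo's Python binding and are left commented out:

```python
# Minimal sketch of the multithreaded batch-encoding comparison, assuming a
# GPT-2 vocabulary for tiktoken and Huggingface. Not the full benchmark script.
import time

import tiktoken
from tokenizers import Tokenizer

documents = ["Hello world! "] * 10_000  # stand-in for the wiki-en jsonl lines
num_bytes = sum(len(d.encode("utf-8")) for d in documents)

enc = tiktoken.get_encoding("gpt2")
hf_enc = Tokenizer.from_pretrained("gpt2")


def throughput(label: str, fn) -> None:
    """Time one batch-encoding call and report bytes per second."""
    start = time.perf_counter_ns()
    fn()
    end = time.perf_counter_ns()
    print(f"{label}\t{num_bytes / (end - start) * 1e9:,.0f} B/s")


# tiktoken parallelizes internally across num_threads worker threads.
throughput("tiktoken", lambda: enc.encode_ordinary_batch(documents, num_threads=8))
# encode_batch_fast skips offset bookkeeping, so it is the fast path.
throughput("huggingface", lambda: hf_enc.encode_batch_fast(documents))
# Assumed binding for this repo's tokenizer -- check the README for the
# actual package and constructor names:
# import pyrwkv_tokenizer
# rwkv_enc = pyrwkv_tokenizer.RWKVTokenizer()
# throughput("rwkv", lambda: rwkv_enc.encode_batch(documents))
```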
6 changes: 3 additions & 3 deletions tools/test_tiktoken-huggingface-rwkv.py
@@ -78,20 +78,20 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     enc.encode_ordinary_batch(documents, num_threads=num_threads)
     end = time.perf_counter_ns()
 
-    readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
+    readable_size, unit = format_byte_size(int(num_bytes / (end - start) * 1e9))
     print(f"tiktoken \t{readable_size}/s")
 
 
     start = time.perf_counter_ns()
     hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
-    readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
+    readable_size, unit = format_byte_size(int(num_bytes / (end - start) * 1e9))
     print(f"huggingface \t{readable_size}/s")
 
     start = time.perf_counter_ns()
     rwkv_enc.encode_batch(documents)
     end = time.perf_counter_ns()
-    readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
+    readable_size, unit = format_byte_size(int(num_bytes / (end - start) * 1e9))
     print(f"rwkv \t\t{readable_size}/s")
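All three changes in this file are the same fix: the bytes-per-second expression is a float, and it is now truncated with `int()` before being handed to `format_byte_size`. A plausible reading, assuming the helper is annotated to take an `int`, is sketched below; the actual `format_byte_size` is defined earlier in the script and may differ:

```python
# Hypothetical format_byte_size consistent with the call sites above; the
# real helper is not shown in this diff.
def format_byte_size(num_bytes: int) -> tuple[str, str]:
    """Render a byte count as a human-readable size string plus its unit."""
    size = float(num_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.2f} {unit}", unit
        size /= 1024
    return f"{size:.2f} TB", "TB"


# num_bytes / (end - start) * 1e9 is a float; int() truncates it so the
# argument matches the int annotation.
readable_size, unit = format_byte_size(int(3.2e9))
print(f"rwkv \t\t{readable_size}/s")  # -> rwkv         2.98 GB/s
```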
