This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

feat: add token count for each chunk (#235)
Because

- to build RAG, it is crucial to know the token count of each chunk
- the token count of the raw text is also useful when analysing it

This commit

- add token count for each chunk
- differentiate between two token counts: the sum of the token counts of the
chunks and the token count of the raw text
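The two counts differ because chunking strategies overlap or drop text: with chunk overlap, the per-chunk sum exceeds the raw count, while a strategy that discards separators can make it smaller (as in the Recursive test case in `chunk_text_test.go`, where `ChunksTokenCount` is 2 against a `TokenCount` of 3). A minimal sketch of the distinction, using a whitespace tokenizer as a stand-in for the component's tiktoken encoder and hand-written chunks in place of the real splitter:

```go
package main

import (
	"fmt"
	"strings"
)

// countTokens is a stand-in tokenizer (whitespace split). The actual
// component encodes text with tiktoken for the configured model name.
func countTokens(s string) int {
	return len(strings.Fields(s))
}

func main() {
	raw := "one two three four"
	// Hand-written overlapping chunks, as a splitter configured with
	// chunk overlap might produce them.
	chunks := []string{"one two three", "three four"}

	chunksTokenCount := 0
	for _, c := range chunks {
		chunksTokenCount += countTokens(c)
	}

	// The raw-text count and the chunk sum differ: the overlapped
	// token "three" is counted once in the raw text but twice across
	// the chunks.
	fmt.Println(countTokens(raw), chunksTokenCount) // 4 5
}
```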
chuang8511 authored Jul 26, 2024
1 parent dcad3f0 commit bb69104
Showing 6 changed files with 75 additions and 34 deletions.
1 change: 1 addition & 0 deletions ai/cohere/v0/README.mdx
@@ -115,6 +115,7 @@ Sort text inputs by semantic relevance to a specified query.
| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Reranked documents | `ranking` | array[string] | Reranked documents |
| Reranked documents relevance (optional) | `relevance` | array[number] | The relevance scores of the reranked documents |
| Usage (optional) | `usage` | object | Search Usage on the Cohere Platform Rerank Models |


11 changes: 5 additions & 6 deletions ai/mistralai/v0/README.mdx
@@ -1,11 +1,11 @@
---
title: "Mistral"
title: "Mistral AI"
lang: "en-US"
draft: false
description: "Learn about how to set up a VDP Mistral component https://github.com/instill-ai/instill-core"
description: "Learn about how to set up a VDP Mistral AI component https://github.com/instill-ai/instill-core"
---

The Mistral component is an AI component that allows users to connect the AI models served on the Mistral Platform.
The Mistral AI component is an AI component that allows users to connect the AI models served on the Mistral AI Platform.
It can carry out the following tasks:

- [Text Generation Chat](#text-generation-chat)
Expand All @@ -21,7 +21,7 @@ It can carry out the following tasks:

## Configuration

The component configuration is defined and maintained [here](https://github.com/instill-ai/component/blob/main/ai/mistral/v0/config/definition.json).
The component configuration is defined and maintained [here](https://github.com/instill-ai/component/blob/main/ai/mistralai/v0/config/definition.json).



Expand All @@ -31,7 +31,7 @@ The component configuration is defined and maintained [here](https://github.com/

| Field | Field ID | Type | Note |
| :--- | :--- | :--- | :--- |
| API Key (required) | `api-key` | string | Fill in your Mistral API key. To find your keys, visit the Mistral platform page. |
| API Key (required) | `api-key` | string | Fill in your Mistral API key. To find your keys, visit the Mistral AI platform page. |



@@ -78,7 +78,6 @@ Turn text into a vector of numbers that capture its meaning, unlocking use cases
| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_TEXT_EMBEDDINGS` |
| Embedding Type (required) | `embedding-type` | string | Specifies the return type of embedding. |
| Model Name (required) | `model-name` | string | The Mistral embed model to be used |
| Text (required) | `text` | string | The text |

3 changes: 2 additions & 1 deletion operator/text/v0/README.mdx
@@ -68,9 +68,10 @@ Chunk text with different strategies

| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Token Count (optional) | `token-count` | integer | Total count of tokens in the input text |
| Token Count | `token-count` | integer | Total count of tokens in the original input text |
| Text Chunks | `text-chunks` | array[object] | Text chunks after splitting |
| Number of Text Chunks | `chunk-num` | integer | Total number of output text chunks |
| Token Count Chunks | `chunks-token-count` | integer | Total count of tokens in the output text chunks |


### Chunking Strategy
30 changes: 22 additions & 8 deletions operator/text/v0/chunk_text.go
Expand Up @@ -33,15 +33,17 @@ type Setting struct {
}

type ChunkTextOutput struct {
ChunkNum int `json:"chunk-num"`
TextChunks []TextChunk `json:"text-chunks"`
TokenCount int `json:"token-count,omitempty"`
ChunkNum int `json:"chunk-num"`
TextChunks []TextChunk `json:"text-chunks"`
TokenCount int `json:"token-count"`
ChunksTokenCount int `json:"chunks-token-count"`
}

type TextChunk struct {
Text string `json:"text"`
StartPosition int `json:"start-position"`
EndPosition int `json:"end-position"`
TokenCount int `json:"token-count"`
}

func (s *Setting) SetDefault() {
@@ -110,19 +112,18 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
)
}

tkm, err := tiktoken.EncodingForModel(setting.ModelName)
chunks, err := split.SplitText(input.Text)
if err != nil {
return output, err
}
token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)
output.TokenCount = len(token)
output.ChunkNum = len(chunks)

chunks, err := split.SplitText(input.Text)
tkm, err := tiktoken.EncodingForModel(setting.ModelName)
if err != nil {
return output, err
}
output.ChunkNum = len(chunks)

totalTokenCount := 0
startScanPosition := 0
rawRunes := []rune(input.Text)
for i, chunk := range chunks {
Expand All @@ -139,22 +140,35 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
continue
}

token := tkm.Encode(chunk, setting.AllowedSpecial, setting.DisallowedSpecial)

output.TextChunks = append(output.TextChunks, TextChunk{
Text: chunk,
StartPosition: startPosition,
EndPosition: endPosition,
TokenCount: len(token),
})
totalTokenCount += len(token)
startScanPosition = startPosition + 1
}

if len(output.TextChunks) == 0 {
token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)

output.TextChunks = append(output.TextChunks, TextChunk{
Text: input.Text,
StartPosition: 0,
EndPosition: len(rawRunes) - 1,
TokenCount: len(token),
})
output.ChunkNum = 1
totalTokenCount = len(token)
}

originalTextToken := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)
output.TokenCount = len(originalTextToken)
output.ChunksTokenCount = totalTokenCount

return output, nil
}

40 changes: 24 additions & 16 deletions operator/text/v0/chunk_text_test.go
@@ -22,9 +22,9 @@ func TestChunkText(t *testing.T) {
Text: "Hello world.",
Strategy: Strategy{
Setting: Setting{
ChunkMethod: "Token",
ChunkSize: 512,
ModelName: "gpt-3.5-turbo",
ChunkMethod: "Token",
ChunkSize: 512,
ModelName: "gpt-3.5-turbo",
},
},
},
Expand All @@ -34,10 +34,12 @@ func TestChunkText(t *testing.T) {
Text: "Hello world.",
StartPosition: 0,
EndPosition: 11,
TokenCount: 3,
},
},
ChunkNum: 1,
TokenCount: 3,
ChunkNum: 1,
TokenCount: 3,
ChunksTokenCount: 3,
},
},
{
Expand All @@ -46,9 +48,9 @@ func TestChunkText(t *testing.T) {
Text: "Hello world.",
Strategy: Strategy{
Setting: Setting{
ChunkMethod: "Markdown",
ModelName: "gpt-3.5-turbo",
ChunkSize: 5,
ChunkMethod: "Markdown",
ModelName: "gpt-3.5-turbo",
ChunkSize: 5,
},
},
},
Expand All @@ -58,15 +60,18 @@ func TestChunkText(t *testing.T) {
Text: "Hello",
StartPosition: 0,
EndPosition: 4,
TokenCount: 1,
},
{
Text: "world.",
StartPosition: 6,
EndPosition: 11,
TokenCount: 2,
},
},
ChunkNum: 2,
TokenCount: 3,
ChunkNum: 2,
TokenCount: 3,
ChunksTokenCount: 3,
},
},
{
Expand All @@ -75,10 +80,10 @@ func TestChunkText(t *testing.T) {
Text: "Hello world.",
Strategy: Strategy{
Setting: Setting{
ChunkMethod: "Recursive",
ModelName: "gpt-3.5-turbo",
ChunkSize: 5,
Separators: []string{" ", "."},
ChunkMethod: "Recursive",
ModelName: "gpt-3.5-turbo",
ChunkSize: 5,
Separators: []string{" ", "."},
},
},
},
Expand All @@ -88,15 +93,18 @@ func TestChunkText(t *testing.T) {
Text: "Hello",
StartPosition: 0,
EndPosition: 4,
TokenCount: 1,
},
{
Text: "world",
StartPosition: 6,
EndPosition: 10,
TokenCount: 1,
},
},
ChunkNum: 2,
TokenCount: 3,
ChunkNum: 2,
TokenCount: 3,
ChunksTokenCount: 2,
},
},
}
24 changes: 21 additions & 3 deletions operator/text/v0/config/tasks.json
@@ -418,10 +418,19 @@
"description": "The ending position of the chunk in the original text",
"instillFormat": "integer",
"type": "integer"
},
"token-count": {
"title": "Token Count",
"description": "Count of tokens in a chunk",
"instillFormat": "integer",
"type": "integer"
}
},
"required": [
"text"
"text",
"start-position",
"end-position",
"token-count"
],
"instillUIMultiline": true,
"type": "object"
Expand All @@ -430,16 +439,25 @@
"type": "array"
},
"token-count": {
"description": "Total count of tokens in the input text",
"description": "Total count of tokens in the original input text",
"instillUIOrder": 0,
"instillFormat": "integer",
"title": "Token Count",
"type": "integer"
},
"chunks-token-count": {
"description": "Total count of tokens in the output text chunks",
"instillUIOrder": 3,
"instillFormat": "integer",
"title": "Token Count Chunks",
"type": "integer"
}
},
"required": [
"text-chunks",
"chunk-num"
"chunk-num",
"token-count",
"chunks-token-count"
],
"title": "Output",
"type": "object"
