This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

feat: add token count for each chunk #235

Merged · 4 commits · Jul 26, 2024
ai/cohere/v0/README.mdx (1 change: 1 addition & 0 deletions)

@@ -115,6 +115,7 @@ Sort text inputs by semantic relevance to a specified query.
 | Output | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
 | Reranked documents | `ranking` | array[string] | Reranked documents |
+| Reranked documents relevance (optional) | `relevance` | array[number] | The relevance scores of the reranked documents |
 | Usage (optional) | `usage` | object | Search Usage on the Cohere Platform Rerank Models |
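For context, a hypothetical rerank result including the new field might look like the following sketch. The document strings and scores are illustrative, and the optional `usage` object is omitted since its exact shape is not shown in this diff:

```json
{
  "ranking": ["Document B", "Document A"],
  "relevance": [0.92, 0.37]
}
```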
ai/mistralai/v0/README.mdx (11 changes: 5 additions & 6 deletions)

@@ -1,11 +1,11 @@
 ---
-title: "Mistral"
+title: "Mistral AI"
 lang: "en-US"
 draft: false
-description: "Learn about how to set up a VDP Mistral component https://github.com/instill-ai/instill-core"
+description: "Learn about how to set up a VDP Mistral AI component https://github.com/instill-ai/instill-core"
 ---

-The Mistral component is an AI component that allows users to connect the AI models served on the Mistral Platform.
+The Mistral AI component is an AI component that allows users to connect the AI models served on the Mistral AI Platform.
 It can carry out the following tasks:

 - [Text Generation Chat](#text-generation-chat)

@@ -21,7 +21,7 @@ It can carry out the following tasks:
 ## Configuration

-The component configuration is defined and maintained [here](https://github.com/instill-ai/component/blob/main/ai/mistral/v0/config/definition.json).
+The component configuration is defined and maintained [here](https://github.com/instill-ai/component/blob/main/ai/mistralai/v0/config/definition.json).

@@ -31,7 +31,7 @@
 | Field | Field ID | Type | Note |
 | :--- | :--- | :--- | :--- |
-| API Key (required) | `api-key` | string | Fill in your Mistral API key. To find your keys, visit the Mistral platform page. |
+| API Key (required) | `api-key` | string | Fill in your Mistral API key. To find your keys, visit the Mistral AI platform page. |

@@ -78,7 +78,6 @@ Turn text into a vector of numbers that capture its meaning, unlocking use cases
 | Input | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
 | Task ID (required) | `task` | string | `TASK_TEXT_EMBEDDINGS` |
-| Embedding Type (required) | `embedding-type` | string | Specifies the return type of embedding. |
 | Model Name (required) | `model-name` | string | The Mistral embed model to be used |
 | Text (required) | `text` | string | The text |
operator/text/v0/README.mdx (3 changes: 2 additions & 1 deletion)

@@ -68,9 +68,10 @@ Chunk text with different strategies
 | Output | ID | Type | Description |
 | :--- | :--- | :--- | :--- |
-| Token Count (optional) | `token-count` | integer | Total count of tokens in the input text |
+| Token Count | `token-count` | integer | Total count of tokens in the original input text |
 | Text Chunks | `text-chunks` | array[object] | Text chunks after splitting |
 | Number of Text Chunks | `chunk-num` | integer | Total number of output text chunks |
+| Token Count Chunks | `chunks-token-count` | integer | Total count of tokens in the output text chunks |

 ### Chunking Strategy
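Based on the table above and the expectations in `chunk_text_test.go` further down, a hypothetical `TASK_CHUNK_TEXT` result for the input `"Hello world."` under the Token strategy would carry the new counters like this (a sketch, not captured component output):

```json
{
  "text-chunks": [
    {
      "text": "Hello world.",
      "start-position": 0,
      "end-position": 11,
      "token-count": 3
    }
  ],
  "chunk-num": 1,
  "token-count": 3,
  "chunks-token-count": 3
}
```

Note that `token-count` and `chunks-token-count` can diverge: in the Recursive test case below, the separators `" "` and `"."` are dropped during splitting, so the original text counts 3 tokens while the surviving chunks together count only 2.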
operator/text/v0/chunk_text.go (30 changes: 22 additions & 8 deletions)

@@ -33,15 +33,17 @@ type Setting struct {
 }

 type ChunkTextOutput struct {
-	ChunkNum   int         `json:"chunk-num"`
-	TextChunks []TextChunk `json:"text-chunks"`
-	TokenCount int         `json:"token-count,omitempty"`
+	ChunkNum         int         `json:"chunk-num"`
+	TextChunks       []TextChunk `json:"text-chunks"`
+	TokenCount       int         `json:"token-count"`
+	ChunksTokenCount int         `json:"chunks-token-count"`
 }

 type TextChunk struct {
 	Text          string `json:"text"`
 	StartPosition int    `json:"start-position"`
 	EndPosition   int    `json:"end-position"`
+	TokenCount    int    `json:"token-count"`
 }

 func (s *Setting) SetDefault() {

@@ -110,19 +112,18 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
 		)
 	}

-	tkm, err := tiktoken.EncodingForModel(setting.ModelName)
+	chunks, err := split.SplitText(input.Text)
 	if err != nil {
 		return output, err
 	}
-	token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)
-	output.TokenCount = len(token)
+	output.ChunkNum = len(chunks)

-	chunks, err := split.SplitText(input.Text)
+	tkm, err := tiktoken.EncodingForModel(setting.ModelName)
 	if err != nil {
 		return output, err
 	}
-	output.ChunkNum = len(chunks)

+	totalTokenCount := 0
 	startScanPosition := 0
 	rawRunes := []rune(input.Text)
 	for i, chunk := range chunks {

@@ -139,22 +140,35 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) {
 			continue
 		}

+		token := tkm.Encode(chunk, setting.AllowedSpecial, setting.DisallowedSpecial)
+
 		output.TextChunks = append(output.TextChunks, TextChunk{
 			Text:          chunk,
 			StartPosition: startPosition,
 			EndPosition:   endPosition,
+			TokenCount:    len(token),
 		})
+		totalTokenCount += len(token)
 		startScanPosition = startPosition + 1
 	}

 	if len(output.TextChunks) == 0 {
+		token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)
+
 		output.TextChunks = append(output.TextChunks, TextChunk{
 			Text:          input.Text,
 			StartPosition: 0,
 			EndPosition:   len(rawRunes) - 1,
+			TokenCount:    len(token),
 		})
 		output.ChunkNum = 1
+		totalTokenCount = len(token)
 	}

+	originalTextToken := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial)
+	output.TokenCount = len(originalTextToken)
+	output.ChunksTokenCount = totalTokenCount
+
 	return output, nil
 }
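For reference, here is a minimal, self-contained sketch of the counting scheme this PR introduces, assuming the `github.com/pkoukk/tiktoken-go` module that `chunk_text.go` appears to import as `tiktoken`; the model name, input text, and pre-split chunks are illustrative stand-ins:

```go
package main

import (
	"fmt"

	"github.com/pkoukk/tiktoken-go"
)

func main() {
	// Resolve the BPE encoding for a model, as chunkText does with setting.ModelName.
	tkm, err := tiktoken.EncodingForModel("gpt-3.5-turbo")
	if err != nil {
		panic(err)
	}

	text := "Hello world."
	chunks := []string{"Hello", "world."} // stand-in for the output of split.SplitText

	// Per-chunk counts feed TextChunk.TokenCount; their sum becomes ChunksTokenCount.
	totalTokenCount := 0
	for _, chunk := range chunks {
		token := tkm.Encode(chunk, nil, nil) // nil allowed/disallowed special tokens
		fmt.Printf("%q -> %d tokens\n", chunk, len(token))
		totalTokenCount += len(token)
	}

	// The original text is encoded once at the end for TokenCount; it can differ
	// from the chunk sum when splitting drops separators or re-tokenizes boundaries.
	originalTextToken := tkm.Encode(text, nil, nil)
	fmt.Println("token-count:", len(originalTextToken))
	fmt.Println("chunks-token-count:", totalTokenCount)
}
```

One design choice visible in the diff: `split.SplitText` now runs before the tokenizer is constructed, so a chunking failure returns early without paying for the encoding lookup.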
operator/text/v0/chunk_text_test.go (40 changes: 24 additions & 16 deletions)

@@ -22,9 +22,9 @@ func TestChunkText(t *testing.T) {
 			Text: "Hello world.",
 			Strategy: Strategy{
 				Setting: Setting{
-					ChunkMethod: "Token",
-					ChunkSize: 512,
-					ModelName: "gpt-3.5-turbo",
+					ChunkMethod: "Token",
+					ChunkSize:   512,
+					ModelName:   "gpt-3.5-turbo",
 				},
 			},
 		},

@@ -34,10 +34,12 @@
 					Text:          "Hello world.",
 					StartPosition: 0,
 					EndPosition:   11,
+					TokenCount:    3,
 				},
 			},
-			ChunkNum:   1,
-			TokenCount: 3,
+			ChunkNum:         1,
+			TokenCount:       3,
+			ChunksTokenCount: 3,
 		},
 	},
 	{

@@ -46,9 +48,9 @@
 			Text: "Hello world.",
 			Strategy: Strategy{
 				Setting: Setting{
-					ChunkMethod: "Markdown",
-					ModelName: "gpt-3.5-turbo",
-					ChunkSize: 5,
+					ChunkMethod: "Markdown",
+					ModelName:   "gpt-3.5-turbo",
+					ChunkSize:   5,
 				},
 			},
 		},

@@ -58,15 +60,18 @@
 				{
 					Text:          "Hello",
 					StartPosition: 0,
 					EndPosition:   4,
+					TokenCount:    1,
 				},
 				{
 					Text:          "world.",
 					StartPosition: 6,
 					EndPosition:   11,
+					TokenCount:    2,
 				},
 			},
-			ChunkNum:   2,
-			TokenCount: 3,
+			ChunkNum:         2,
+			TokenCount:       3,
+			ChunksTokenCount: 3,
 		},
 	},
 	{

@@ -75,10 +80,10 @@
 			Text: "Hello world.",
 			Strategy: Strategy{
 				Setting: Setting{
-					ChunkMethod: "Recursive",
-					ModelName: "gpt-3.5-turbo",
-					ChunkSize: 5,
-					Separators: []string{" ", "."},
+					ChunkMethod: "Recursive",
+					ModelName:   "gpt-3.5-turbo",
+					ChunkSize:   5,
+					Separators:  []string{" ", "."},
 				},
 			},
 		},

@@ -88,15 +93,18 @@
 				{
 					Text:          "Hello",
 					StartPosition: 0,
 					EndPosition:   4,
+					TokenCount:    1,
 				},
 				{
 					Text:          "world",
 					StartPosition: 6,
 					EndPosition:   10,
+					TokenCount:    1,
 				},
 			},
-			ChunkNum:   2,
-			TokenCount: 3,
+			ChunkNum:         2,
+			TokenCount:       3,
+			ChunksTokenCount: 2,
 		},
 	},
 }
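With the per-chunk expectations in place, the suite can be run locally (assuming a standard Go toolchain, from the repository root) with `go test ./operator/text/v0/ -run TestChunkText`.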
operator/text/v0/config/tasks.json (24 changes: 21 additions & 3 deletions)

@@ -418,10 +418,19 @@
           "description": "The ending position of the chunk in the original text",
           "instillFormat": "integer",
           "type": "integer"
+        },
+        "token-count": {
+          "title": "Token Count",
+          "description": "Count of tokens in a chunk",
+          "instillFormat": "integer",
+          "type": "integer"
         }
       },
       "required": [
-        "text"
+        "text",
+        "start-position",
+        "end-position",
+        "token-count"
       ],
       "instillUIMultiline": true,
       "type": "object"

@@ -430,16 +439,25 @@
         "type": "array"
       },
       "token-count": {
-        "description": "Total count of tokens in the input text",
+        "description": "Total count of tokens in the original input text",
         "instillUIOrder": 0,
         "instillFormat": "integer",
         "title": "Token Count",
         "type": "integer"
+      },
+      "chunks-token-count": {
+        "description": "Total count of tokens in the output text chunks",
+        "instillUIOrder": 3,
+        "instillFormat": "integer",
+        "title": "Token Count Chunks",
+        "type": "integer"
       }
     },
     "required": [
       "text-chunks",
-      "chunk-num"
+      "chunk-num",
+      "token-count",
+      "chunks-token-count"
     ],
     "title": "Output",
     "type": "object"