diff --git a/go.mod b/go.mod index f3ab3a56..51475d1f 100644 --- a/go.mod +++ b/go.mod @@ -43,7 +43,7 @@ require ( github.com/lib/pq v1.10.9 github.com/nakagami/firebirdsql v0.9.10 github.com/pkg/errors v0.9.1 - github.com/pkoukk/tiktoken-go v0.1.6 + github.com/pkoukk/tiktoken-go v0.1.7 github.com/redis/go-redis/v9 v9.5.1 github.com/santhosh-tekuri/jsonschema/v5 v5.3.0 github.com/sijms/go-ora/v2 v2.8.19 diff --git a/go.sum b/go.sum index dbed026c..b370fd20 100644 --- a/go.sum +++ b/go.sum @@ -396,8 +396,8 @@ github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw= -github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= +github.com/pkoukk/tiktoken-go v0.1.7 h1:qOBHXX4PHtvIvmOtyg1EeKlwFRiMKAcoMp4Q+bLQDmw= +github.com/pkoukk/tiktoken-go v0.1.7/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= diff --git a/operator/text/v0/.compogen/extra-chunk-text.mdx b/operator/text/v0/.compogen/extra-chunk-text.mdx index b96c779d..6b521ce2 100644 --- a/operator/text/v0/.compogen/extra-chunk-text.mdx +++ b/operator/text/v0/.compogen/extra-chunk-text.mdx @@ -7,39 +7,102 @@ There are three strategies available for chunking text in Text Component: #### Token Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model. 
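+
+For example, a minimal Go sketch of counting tokens with `tiktoken-go`, the tokenizer library this component uses internally (the model name and sample text below are placeholders):
+
+```go
+package main
+
+import (
+	"fmt"
+
+	tiktoken "github.com/pkoukk/tiktoken-go"
+)
+
+func main() {
+	// Resolve the tokenizer from the model name, as the Token strategy does.
+	tke, err := tiktoken.EncodingForModel("gpt-3.5-turbo")
+	if err != nil {
+		panic(err)
+	}
+
+	// The number of encoded token IDs is what `chunk-size` limits.
+	tokens := tke.Encode("Hello, how are you?", nil, nil)
+	fmt.Println(len(tokens))
+}
+```
+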
-| **Parameter** | **Type** | **Description** | -|----------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | -| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | -| `model-name` | string | The name of the model used for tokenization | -| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | -| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | +| **Parameter** | **Type** | **Description** | +| -------------------- | ---------------- | ------------------------------------------------------------------------- | +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | +| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | #### Recursive This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text. -| **Parameter** | **Type** | **Description** | -|--------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | -| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | -| `model-name` | string | The name of the model used for tokenization | -| `separators` | array of strings | A list of strings representing the separators used to split the text | -| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks | +| **Parameter** | **Type** | **Description** | +| ---------------- | ---------------- | -------------------------------------------------------------------------------------------- | +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `separators` | array of strings | A list of strings representing the separators used to split the text | +| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks | #### Markdown This text splitter is specially designed for Markdown format. 
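+
+As a rough sketch of the behavior (this assumes `langchaingo`'s markdown splitter, from which the options used by this component come; the chunk sizes are arbitrary), headings stay attached to the chunks of the section they introduce:
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/tmc/langchaingo/textsplitter"
+)
+
+func main() {
+	split := textsplitter.NewMarkdownTextSplitter(
+		textsplitter.WithChunkSize(64),
+		textsplitter.WithChunkOverlap(16),
+	)
+
+	chunks, err := split.SplitText("# Title\n\nIntro paragraph.\n\n## Section\n\nSection body.")
+	if err != nil {
+		panic(err)
+	}
+	// Each chunk carries the heading of the section it came from.
+	for _, c := range chunks {
+		fmt.Printf("%q\n", c)
+	}
+}
+```
+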
-| **Parameter** | **Type** | **Description** |
-|--------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
-| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
-| `model-name` | string | The name of the model used for tokenization |
-| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit |
+| **Parameter** | **Type** | **Description** |
+| --------------- | -------- | ------------------------------------------------------------------------- |
+| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
+| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
+| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit |
+
+### Tokenization
+There are two ways to choose the tokenizer:
+1. Use Model name to choose the tokenizer
+2. Use Encoding name to choose the tokenizer
+
+#### Model Name
+
+| **Model** |
+| ----------------------------- |
+| gpt-4o |
+| gpt-4 |
+| gpt-3.5-turbo |
+| command-r-plus |
+| command-r |
+| command |
+| command-nightly |
+| command-light |
+| command-light-nightly |
+| embed-english-v3.0 |
+| embed-multilingual-v3.0 |
+| embed-english-light-v3.0 |
+| embed-multilingual-light-v3.0 |
+| text-davinci-003 |
+| text-davinci-002 |
+| text-davinci-001 |
+| text-curie-001 |
+| text-babbage-001 |
+| text-ada-001 |
+| davinci |
+| curie |
+| babbage |
+| ada |
+| code-davinci-002 |
+| code-davinci-001 |
+| code-cushman-002 |
+| code-cushman-001 |
+| davinci-codex |
+| cushman-codex |
+| text-davinci-edit-001 |
+| code-davinci-edit-001 |
+| text-embedding-ada-002 |
+| text-similarity-davinci-001 |
+| text-similarity-curie-001 |
+| text-similarity-babbage-001 |
+| text-similarity-ada-001 |
+| text-search-davinci-doc-001 |
+| text-search-curie-doc-001 |
+| text-search-babbage-doc-001 |
+| text-search-ada-doc-001 |
+| code-search-babbage-code-001 |
+| code-search-ada-code-001 |
+| gpt2 |
+
+#### Encoding Name
+| **Encoding** |
+| ------------ |
+| o200k_base |
+| cl100k_base |
+| p50k_base |
+| r50k_base |
+| p50k_edit |
+
 ### Text Chunks in Output
 | **Parameter** | **Type** | **Description** |
-|------------------|----------|--------------------------------------------------------------|
+| ---------------- | -------- | ------------------------------------------------------------ |
 | `test` | string | The text chunk |
 | `start-position` | integer | The starting position of the text chunk in the original text |
 | `end-position` | integer | The ending position of the text chunk in the original text |
diff --git a/operator/text/v0/README.mdx b/operator/text/v0/README.mdx
index d3ce65e5..642ed3ec 100644
--- a/operator/text/v0/README.mdx
+++ b/operator/text/v0/README.mdx
@@ -37,7 +37,8 @@ Chunk text with different strategies
 | :--- | :--- | :--- | :--- |
 | Task ID (required) | `task` | string | `TASK_CHUNK_TEXT` |
 | Text (required) | `text` | string | Text to be chunked |
-| Strategy (required) | `strategy` | object | Chunking strategy |
+| Chunk Strategy (required) | `strategy` | object | Chunking strategy |
+| Tokenization (required) | `tokenization` | object | 
Tokenization choices | @@ -58,39 +59,102 @@ There are three strategies available for chunking text in Text Component: #### Token Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model. -| **Parameter** | **Type** | **Description** | -|----------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | -| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | -| `model-name` | string | The name of the model used for tokenization | -| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | -| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | +| **Parameter** | **Type** | **Description** | +| -------------------- | ---------------- | ------------------------------------------------------------------------- | +| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens | +| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks | +| `model-name` | string | The name of the model used for tokenization | +| `allowed-special` | array of strings | A list of special tokens that are allowed within chunks | +| `disallowed-special` | array of strings | A list of special tokens that should not appear within chunks | #### Recursive This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text. 
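+
+For instance, a minimal Go sketch of this strategy using the `langchaingo` splitter the component calls under the hood (the sizes and input text are illustrative):
+
+```go
+package main
+
+import (
+	"fmt"
+
+	"github.com/tmc/langchaingo/textsplitter"
+)
+
+func main() {
+	split := textsplitter.NewRecursiveCharacter(
+		// Try paragraph breaks first, then line breaks, then spaces.
+		textsplitter.WithSeparators([]string{"\n\n", "\n", " ", ""}),
+		textsplitter.WithChunkSize(40),
+		textsplitter.WithChunkOverlap(5),
+	)
+
+	chunks, err := split.SplitText("First paragraph.\n\nSecond paragraph, which runs a little longer.")
+	if err != nil {
+		panic(err)
+	}
+	for _, c := range chunks {
+		fmt.Println(c)
+	}
+}
+```
+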
-| **Parameter** | **Type** | **Description** |
-|--------------------|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
-| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
-| `model-name` | string | The name of the model used for tokenization |
-| `separators` | array of strings | A list of strings representing the separators used to split the text |
-| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks |
+| **Parameter** | **Type** | **Description** |
+| ---------------- | ---------------- | ---------------------------------------------------------------------------------------------- |
+| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
+| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
+| `separators` | array of strings | A list of strings representing the separators used to split the text |
+| `keep-separator` | boolean | A flag indicating whether to keep the separator characters at the beginning or end of chunks |

#### Markdown

This text splitter is specially designed for Markdown format.

-| **Parameter** | **Type** | **Description** |
-|--------------------|----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
-| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
-| `model-name` | string | The name of the model used for tokenization |
-| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit |
+| **Parameter** | **Type** | **Description** |
+| --------------- | -------- | ------------------------------------------------------------------------- |
+| `chunk-size` | integer | Specifies the maximum size of each chunk in terms of the number of tokens |
+| `chunk-overlap` | integer | Determines the number of tokens that overlap between consecutive chunks |
+| `code-blocks` | boolean | A flag indicating whether code blocks should be treated as a single unit |
+
+### Tokenization
+There are two ways to choose the tokenizer:
+1. Use Model name to choose the tokenizer
+2. 
Use Encoding name to choose the tokenizer + +#### Model Name + +| **Model** | +| ----------------------------- | +| gpt-4o | +| gpt-4 | +| gpt-3.5-turbo | +| command-r-plus | +| command-r | +| command | +| command-nightly | +| command-light | +| command-light-nightly | +| embed-english-v3.0 | +| embed-multilingual-v3.0 | +| embed-english-light-v3.0 | +| embed-multilingual-light-v3.0 | +| text-davinci-003 | +| text-davinci-002 | +| text-davinci-001 | +| text-curie-001 | +| text-babbage-001 | +| text-ada-001 | +| davinci | +| curie | +| babbage | +| ada | +| code-davinci-002 | +| code-davinci-001 | +| code-cushman-002 | +| code-cushman-001 | +| davinci-codex | +| cushman-codex | +| text-davinci-edit-001 | +| code-davinci-edit-001 | +| text-embedding-ada-002 | +| text-similarity-davinci-001 | +| text-similarity-curie-001 | +| text-similarity-babbage-001 | +| text-similarity-ada-001 | +| text-search-davinci-doc-001 | +| text-search-curie-doc-001 | +| text-search-babbage-doc-001 | +| text-search-ada-doc-001 | +| code-search-babbage-code-001 | +| code-search-ada-code-001 | +| gpt2 | + + + +#### Encoding Name +| **Encoding** | +| ------------ | +| o200k_base | +| cl100k_base | +| p50k_base | +| r50k_base | +| p50k_edit | + ### Text Chunks in Output | **Parameter** | **Type** | **Description** | -|------------------|----------|--------------------------------------------------------------| +| ---------------- | -------- | ------------------------------------------------------------ | | `test` | string | The text chunk | | `start-position` | integer | The starting position of the text chunk in the original text | | `end-position` | integer | The ending position of the text chunk in the original text | diff --git a/operator/text/v0/chunk_position_calculator.go b/operator/text/v0/chunk_position_calculator.go new file mode 100644 index 00000000..e5a3a883 --- /dev/null +++ b/operator/text/v0/chunk_position_calculator.go @@ -0,0 +1,113 @@ +package text + +import "reflect" + +type ChunkPositionCalculator interface { + getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) +} +type PositionCalculator struct{} +type MarkdownPositionCalculator struct{} + +func (output *ChunkTextOutput) setChunksWithPosition(chunks []string, rawText, chunkMethod string) { + + rawRunes := []rune(rawText) + var positionCalculator ChunkPositionCalculator + + switch chunkMethod { + case "Token", "Recursive": + positionCalculator = PositionCalculator{} + case "Markdown": + positionCalculator = MarkdownPositionCalculator{} + } + + startScanPosition := 0 + for i, chunk := range chunks { + chunkRunes := []rune(chunk) + + startPosition, endPosition := positionCalculator.getChunkPositions(rawRunes, chunkRunes, startScanPosition) + + if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) { + previousChunk := output.TextChunks[i-1] + startPosition, endPosition = positionCalculator.getChunkPositions(rawRunes, chunkRunes, previousChunk.StartPosition) + } + + if startPosition == endPosition { + continue + } + output.TextChunks = append(output.TextChunks, TextChunk{ + Text: chunk, + StartPosition: startPosition, + EndPosition: endPosition, + }) + startScanPosition = startPosition + 1 + } + + if len(output.TextChunks) == 0 { + output.TextChunks = append(output.TextChunks, TextChunk{ + Text: rawText, + StartPosition: 0, + EndPosition: len(rawRunes) - 1, + }) + } +} + +func (PositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, 
endPosition int) { + + for i := startScanPosition; i < len(rawText); i++ { + if rawText[i] == chunk[0] { + + if i+len(chunk) > len(rawText) { + break + } + + if reflect.DeepEqual(rawText[i:i+len(chunk)], chunk) { + startPosition = i + endPosition = len(chunk) + i - 1 + break + } + } + } + return startPosition, endPosition +} + +func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) { + + skipHeaderIndex := getSkipHeaderIndex(chunk) + + for i := startScanPosition; i < len(rawText); i++ { + + if rawText[i] == chunk[skipHeaderIndex] { + + if i+len(chunk)-skipHeaderIndex > len(rawText) { + break + } + + if reflect.DeepEqual(rawText[i:(i+len(chunk)-skipHeaderIndex)], chunk[skipHeaderIndex:]) { + startPosition = i + endPosition = len(chunk) + i - 1 - skipHeaderIndex + break + } + } + } + return startPosition, endPosition +} + +func shouldScanRawTextFromPreviousChunk(startPosition, endPosition int) bool { + return startPosition == 0 && endPosition == 0 +} + +func getSkipHeaderIndex(chunk []rune) int { + hashtagCount := 0 + skipPosition := 0 + for i := 0; i < len(chunk); i++ { + if chunk[i] == '#' { + hashtagCount++ + } + + if hashtagCount >= 1 && chunk[i] == '\n' { + skipPosition = i + 1 + hashtagCount = 0 + } + } + return skipPosition +} diff --git a/operator/text/v0/chunk_text.go b/operator/text/v0/chunk_text.go index 99530805..6881880b 100644 --- a/operator/text/v0/chunk_text.go +++ b/operator/text/v0/chunk_text.go @@ -2,15 +2,16 @@ package text import ( "fmt" - "reflect" - tiktoken "github.com/pkoukk/tiktoken-go" + "github.com/instill-ai/component/base" "github.com/tmc/langchaingo/textsplitter" + "google.golang.org/protobuf/types/known/structpb" ) type ChunkTextInput struct { - Text string `json:"text"` - Strategy Strategy `json:"strategy"` + Text string `json:"text"` + Strategy Strategy `json:"strategy"` + Tokenization Tokenization `json:"tokenization"` } type Strategy struct { @@ -32,6 +33,17 @@ type Setting struct { // secondSplitter textsplitter.TextSplitter } +type Tokenization struct { + Choice Choice `json:"choice"` +} + +type Choice struct { + TokenizationMethod string `json:"tokenization-method"` + Model string `json:"model,omitempty"` + Encoding string `json:"encoding,omitempty"` + HuggingFaceModel string `json:"hugging-face-model,omitempty"` +} + type ChunkTextOutput struct { ChunkNum int `json:"chunk-num"` TextChunks []TextChunk `json:"text-chunks"` @@ -71,24 +83,28 @@ type TextSplitter interface { SplitText(text string) ([]string, error) } -func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { + +func chunkText(inputPb *structpb.Struct) (*structpb.Struct, error) { + input := ChunkTextInput{} + + err := base.ConvertFromStructpb(inputPb, &input) + if err != nil { + return nil, err + } + var split TextSplitter + var output ChunkTextOutput + setting := input.Strategy.Setting // TODO: Take this out when we fix the error in frontend side. // Bug: The default value is not set from frontend side. 
setting.SetDefault() - var output ChunkTextOutput - var positionCalculator ChunkPositionCalculator - switch setting.ChunkMethod { case "Token": - positionCalculator = PositionCalculator{} if setting.ChunkOverlap >= setting.ChunkSize { - err := fmt.Errorf("ChunkOverlap must be less than ChunkSize when using Token method") - return output, err + return nil, fmt.Errorf("ChunkOverlap must be less than ChunkSize when using Token method") } - split = textsplitter.NewTokenSplitter( textsplitter.WithChunkSize(setting.ChunkSize), textsplitter.WithChunkOverlap(setting.ChunkOverlap), @@ -97,13 +113,11 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { textsplitter.WithDisallowedSpecial(setting.DisallowedSpecial), ) case "Markdown": - positionCalculator = MarkdownPositionCalculator{} split = NewMarkdownTextSplitter( textsplitter.WithChunkSize(setting.ChunkSize), textsplitter.WithChunkOverlap(setting.ChunkOverlap), ) case "Recursive": - positionCalculator = PositionCalculator{} split = textsplitter.NewRecursiveCharacter( textsplitter.WithSeparators(setting.Separators), textsplitter.WithChunkSize(setting.ChunkSize), @@ -114,129 +128,29 @@ func chunkText(input ChunkTextInput) (ChunkTextOutput, error) { chunks, err := split.SplitText(input.Text) if err != nil { - return output, err - } - output.ChunkNum = len(chunks) - - tkm, err := tiktoken.EncodingForModel(setting.ModelName) - if err != nil { - return output, err + return nil, fmt.Errorf("failed to split text: %w", err) } - totalTokenCount := 0 - startScanPosition := 0 - rawRunes := []rune(input.Text) - for i, chunk := range chunks { - chunkRunes := []rune(chunk) + output.setChunksWithPosition(chunks, input.Text, setting.ChunkMethod) + output.ChunkNum = len(output.TextChunks) - startPosition, endPosition := positionCalculator.getChunkPositions(rawRunes, chunkRunes, startScanPosition) + choice := input.Tokenization.Choice + err = output.setTokenizeChunks(choice) - if shouldScanRawTextFromPreviousChunk(startPosition, endPosition) { - previousChunk := output.TextChunks[i-1] - startPosition, endPosition = positionCalculator.getChunkPositions(rawRunes, chunkRunes, previousChunk.StartPosition) - } - - if startPosition == endPosition { - continue - } - - token := tkm.Encode(chunk, setting.AllowedSpecial, setting.DisallowedSpecial) - - output.TextChunks = append(output.TextChunks, TextChunk{ - Text: chunk, - StartPosition: startPosition, - EndPosition: endPosition, - TokenCount: len(token), - }) - totalTokenCount += len(token) - startScanPosition = startPosition + 1 - } - - if len(output.TextChunks) == 0 { - token := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial) - - output.TextChunks = append(output.TextChunks, TextChunk{ - Text: input.Text, - StartPosition: 0, - EndPosition: len(rawRunes) - 1, - TokenCount: len(token), - }) - output.ChunkNum = 1 - totalTokenCount = len(token) + if err != nil { + return nil, fmt.Errorf("failed to tokenize chunks: \n%w", err) } - originalTextToken := tkm.Encode(input.Text, setting.AllowedSpecial, setting.DisallowedSpecial) - output.TokenCount = len(originalTextToken) - output.ChunksTokenCount = totalTokenCount + err = output.setFileTokenCount(choice, input.Text) - return output, nil -} - -func shouldScanRawTextFromPreviousChunk(startPosition, endPosition int) bool { - return startPosition == 0 && endPosition == 0 -} - -type ChunkPositionCalculator interface { - getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) -} - -type 
PositionCalculator struct{} - -func (PositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) { - - for i := startScanPosition; i < len(rawText); i++ { - if rawText[i] == chunk[0] { - - if i+len(chunk) > len(rawText) { - break - } - - if reflect.DeepEqual(rawText[i:i+len(chunk)], chunk) { - startPosition = i - endPosition = len(chunk) + i - 1 - break - } - } + if err != nil { + return nil, fmt.Errorf("failed to set file token count: \n%w", err) } - return startPosition, endPosition -} - -type MarkdownPositionCalculator struct{} -func (MarkdownPositionCalculator) getChunkPositions(rawText, chunk []rune, startScanPosition int) (startPosition int, endPosition int) { - - skipHeaderIndex := getSkipHeaderIndex(chunk) - - for i := startScanPosition; i < len(rawText); i++ { - - if rawText[i] == chunk[skipHeaderIndex] { - - if i+len(chunk)-skipHeaderIndex > len(rawText) { - break - } - - if reflect.DeepEqual(rawText[i:(i+len(chunk)-skipHeaderIndex)], chunk[skipHeaderIndex:]) { - startPosition = i - endPosition = len(chunk) + i - 1 - skipHeaderIndex - break - } - } + outputPb, err := base.ConvertToStructpb(output) + if err != nil { + return nil, fmt.Errorf("failed to convert output to structpb: %w", err) } - return startPosition, endPosition -} - -func getSkipHeaderIndex(chunk []rune) int { - hashtagCount := 0 - skipPosition := 0 - for i := 0; i < len(chunk); i++ { - if chunk[i] == '#' { - hashtagCount++ - } - if hashtagCount >= 1 && chunk[i] == '\n' { - skipPosition = i + 1 - hashtagCount = 0 - } - } - return skipPosition + return outputPb, nil } diff --git a/operator/text/v0/chunk_text_test.go b/operator/text/v0/chunk_text_test.go index fb128fe8..f46db293 100644 --- a/operator/text/v0/chunk_text_test.go +++ b/operator/text/v0/chunk_text_test.go @@ -1,10 +1,13 @@ package text import ( + "context" "os" "testing" "github.com/frankban/quicktest" + "github.com/instill-ai/component/base" + "google.golang.org/protobuf/types/known/structpb" ) func TestChunkText(t *testing.T) { @@ -27,6 +30,14 @@ func TestChunkText(t *testing.T) { ModelName: "gpt-3.5-turbo", }, }, + Tokenization: Tokenization{ + Choice: Choice{ + TokenizationMethod: "Model", + Model: "gpt-3.5-turbo", + Encoding: "", + HuggingFaceModel: "", + }, + }, }, output: ChunkTextOutput{ TextChunks: []TextChunk{ @@ -53,6 +64,14 @@ func TestChunkText(t *testing.T) { ChunkSize: 5, }, }, + Tokenization: Tokenization{ + Choice: Choice{ + TokenizationMethod: "Model", + Model: "gpt-3.5-turbo", + Encoding: "", + HuggingFaceModel: "", + }, + }, }, output: ChunkTextOutput{ TextChunks: []TextChunk{ @@ -86,6 +105,14 @@ func TestChunkText(t *testing.T) { Separators: []string{" ", "."}, }, }, + Tokenization: Tokenization{ + Choice: Choice{ + TokenizationMethod: "Model", + Model: "gpt-3.5-turbo", + Encoding: "", + HuggingFaceModel: "", + }, + }, }, output: ChunkTextOutput{ TextChunks: []TextChunk{ @@ -111,7 +138,27 @@ func TestChunkText(t *testing.T) { for _, tc := range testCases { c.Run(tc.name, func(c *quicktest.C) { - output, err := chunkText(tc.input) + + bc := base.Component{} + component := Init(bc) + c.Assert(component, quicktest.IsNotNil) + + execution, err := component.CreateExecution(base.ComponentExecution{ + Component: component, + Task: "TASK_CHUNK_TEXT", + }) + + c.Assert(err, quicktest.IsNil) + c.Assert(execution, quicktest.IsNotNil) + + inputPd, err := base.ConvertToStructpb(tc.input) + c.Assert(err, quicktest.IsNil) + + outputPd, err := execution.Execute(context.TODO(), 
[]*structpb.Struct{inputPd}) + c.Assert(err, quicktest.IsNil) + output := ChunkTextOutput{} + err = base.ConvertFromStructpb(outputPd[0], &output) + c.Assert(err, quicktest.IsNil) c.Check(output, quicktest.DeepEquals, tc.output) }) diff --git a/operator/text/v0/config/tasks.json b/operator/text/v0/config/tasks.json index 191aeaa0..8fe5e85e 100644 --- a/operator/text/v0/config/tasks.json +++ b/operator/text/v0/config/tasks.json @@ -46,7 +46,7 @@ "type": "integer" }, "model-name": { - "description": "The name of the model used for tokenization.", + "description": "The name of the model used for token chunk strategy.", "enum": [ "gpt-4", "gpt-3.5-turbo", @@ -92,6 +92,85 @@ ], "title": "Model", "type": "string" + }, + "encoding-name": { + "description": "The name of the encoding used to convert text into tokens.", + "enum": [ + "o200k_base", + "cl100k_base", + "p50k_base", + "r50k_base", + "p50k_edit" + ], + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 3, + "instillUpstreamTypes": [ + "value", + "reference" + ], + "title": "Encoding Name", + "type": "string" + }, + "tokenization-model": { + "description": "The name of the model used for tokenization", + "enum": [ + "gpt-4o", + "gpt-4", + "gpt-3.5-turbo", + "command-r-plus", + "command-r", + "command", + "command-nightly", + "command-light", + "command-light-nightly", + "embed-english-v3.0", + "embed-multilingual-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-light-v3.0", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + "davinci", + "curie", + "babbage", + "ada", + "code-davinci-002", + "code-davinci-001", + "code-cushman-002", + "code-cushman-001", + "davinci-codex", + "cushman-codex", + "text-davinci-edit-001", + "code-davinci-edit-001", + "text-embedding-ada-002", + "text-similarity-davinci-001", + "text-similarity-curie-001", + "text-similarity-babbage-001", + "text-similarity-ada-001", + "text-search-davinci-doc-001", + "text-search-curie-doc-001", + "text-search-babbage-doc-001", + "text-search-ada-doc-001", + "code-search-babbage-code-001", + "code-search-ada-code-001", + "gpt2" + ], + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 2, + "instillUpstreamTypes": [ + "value", + "reference", + "template" + ], + "title": "Model", + "type": "string" } }, "TASK_CHUNK_TEXT": { @@ -100,7 +179,8 @@ "description": "Input", "instillEditOnNodeFields": [ "text", - "strategy" + "strategy", + "tokenization" ], "instillUIOrder": 0, "properties": { @@ -198,9 +278,6 @@ "chunk-overlap": { "$ref": "#/$defs/chunk-overlap" }, - "model-name": { - "$ref": "#/$defs/model-name" - }, "separators": { "default": [], "description": "A list of strings representing the separators used to split the text.", @@ -242,7 +319,6 @@ "chunk-method", "chunk-size", "chunk-overlap", - "model-name", "separators", "keep-separator" ], @@ -260,9 +336,6 @@ "chunk-overlap": { "$ref": "#/$defs/chunk-overlap" }, - "model-name": { - "$ref": "#/$defs/model-name" - }, "code-blocks": { "description": "A flag indicating whether code blocks should be treated as a single unit", "instillAcceptFormats": [ @@ -285,7 +358,6 @@ "chunk-method", "chunk-size", "chunk-overlap", - "model-name", "code-blocks" ], "type": "object" @@ -293,16 +365,80 @@ ] } }, - "title": "Strategy", + "title": "Chunk Strategy", "required": [ "setting" ], "type": "object" + }, + "tokenization": { + "description": "Tokenization choices", + "instillUIOrder": 2, + "properties": { + "choice": { + "description": 
"Tokenization choice", + "additionalProperties": true, + "type": "object", + "required": [ + "tokenization-method" + ], + "oneOf": [ + { + "properties": { + "tokenization-method": { + "const": "Model", + "type": "string", + "title": "Choose by Model Name" + }, + "model": { + "$ref": "#/$defs/tokenization-model" + } + }, + "required": [ + "tokenization-method", + "model" + ], + "instillEditOnNodeFields": [ + "tokenization-method", + "model" + ], + "type": "object" + }, + { + "properties": { + "tokenization-method": { + "const": "Encoding", + "type": "string", + "title": "Choose by Encoding Name" + }, + "encoding": { + "$ref": "#/$defs/encoding-name" + } + }, + "required": [ + "tokenization-method", + "encoding" + ], + "instillEditOnNodeFields": [ + "tokenization-method", + "encoding" + ], + "type": "object" + } + ] + } + }, + "title": "Tokenization", + "required": [ + "choice" + ], + "type": "object" } }, "required": [ "text", - "strategy" + "strategy", + "tokenization" ], "title": "Input", "type": "object" diff --git a/operator/text/v0/main.go b/operator/text/v0/main.go index 0dd62e67..e6d98847 100644 --- a/operator/text/v0/main.go +++ b/operator/text/v0/main.go @@ -14,7 +14,8 @@ import ( ) const ( - taskChunkText string = "TASK_CHUNK_TEXT" + taskChunkText string = "TASK_CHUNK_TEXT" + pythonInterpreter string = "/opt/venv/bin/python" ) var ( @@ -24,6 +25,13 @@ var ( tasksJSON []byte once sync.Once comp *component + + //go:embed python/cohere_tokenizer.py + cohereTokenizer string + //go:embed python/huggingface_tokenizer.py + huggingfaceTokenizer string + //go:embed python/mistral_tokenizer.py + mistralTokenizer string ) // Operator is the derived operator @@ -34,6 +42,7 @@ type component struct { // Execution is the derived execution type execution struct { base.ComponentExecution + execute func(*structpb.Struct) (*structpb.Struct, error) } // Init initializes the operator @@ -51,34 +60,30 @@ func Init(bc base.Component) *component { // CreateExecution initializes a connector executor that can be used in a // pipeline trigger. 
func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution, error) {
-	return &execution{ComponentExecution: x}, nil
+	e := &execution{ComponentExecution: x}
+
+	switch x.Task {
+	case taskChunkText:
+		e.execute = chunkText
+	default:
+		return nil, fmt.Errorf("%s task is not supported", x.Task)
+	}
+
+	return e, nil
 }
 
 // Execute executes the derived execution
 func (e *execution) Execute(_ context.Context, inputs []*structpb.Struct) ([]*structpb.Struct, error) {
-	outputs := []*structpb.Struct{}
-
-	for _, input := range inputs {
-		switch e.Task {
-		case taskChunkText:
-			inputStruct := ChunkTextInput{}
-			err := base.ConvertFromStructpb(input, &inputStruct)
-			if err != nil {
-				return nil, err
-			}
-
-			outputStruct, err := chunkText(inputStruct)
-			if err != nil {
-				return nil, err
-			}
-			output, err := base.ConvertToStructpb(outputStruct)
-			if err != nil {
-				return nil, err
-			}
-			outputs = append(outputs, output)
-		default:
-			return nil, fmt.Errorf("not supported task: %s", e.Task)
+	outputs := make([]*structpb.Struct, len(inputs))
+
+	for i, input := range inputs {
+		output, err := e.execute(input)
+		if err != nil {
+			return nil, err
 		}
+
+		outputs[i] = output
 	}
+
 	return outputs, nil
 }
diff --git a/operator/text/v0/main_test.go b/operator/text/v0/main_test.go
index 76f1deca..af5959ed 100644
--- a/operator/text/v0/main_test.go
+++ b/operator/text/v0/main_test.go
@@ -32,6 +32,16 @@ func TestOperator(t *testing.T) {
 				}}},
 			},
 		}}},
+		"tokenization": {Kind: &structpb.Value_StructValue{StructValue: &structpb.Struct{
+			Fields: map[string]*structpb.Value{
+				"choice": {Kind: &structpb.Value_StructValue{StructValue: &structpb.Struct{
+					Fields: map[string]*structpb.Value{
+						"tokenization-method": {Kind: &structpb.Value_StringValue{StringValue: "Model"}},
+						"model": {Kind: &structpb.Value_StringValue{StringValue: "gpt-3.5-turbo"}},
+					},
+				}}},
+			},
+		}}},
 	},
 },
},
@@ -53,6 +63,10 @@ func TestOperator(t *testing.T) {
 			Component: component,
 			Task:      tc.task,
 		})
+		if tc.name == "error case" {
+			c.Assert(err, quicktest.ErrorMatches, "FAKE_TASK task is not supported")
+			return
+		}
 		c.Assert(err, quicktest.IsNil)
 		c.Assert(execution, quicktest.IsNotNil)
 
@@ -60,11 +74,6 @@ func TestOperator(t *testing.T) {
 		outputs, err := execution.Execute(ctx, input)
 
-		if tc.name == "error case" {
-			c.Assert(err, quicktest.ErrorMatches, "not supported task: FAKE_TASK")
-			c.Assert(outputs, quicktest.IsNil)
-			return
-		}
 		c.Assert(err, quicktest.IsNil)
 		c.Assert(outputs, quicktest.HasLen, 1)
 	})
diff --git a/operator/text/v0/python/cohere_tokenizer.py b/operator/text/v0/python/cohere_tokenizer.py
new file mode 100644
index 00000000..bc7ccd20
--- /dev/null
+++ b/operator/text/v0/python/cohere_tokenizer.py
@@ -0,0 +1,40 @@
+from tokenizers import Tokenizer
+import requests
+import json
+import sys
+import os
+
+json_str = sys.stdin.buffer.read().decode('utf-8')
+# Sample input
+# {
+# "model": "xxx",
+# "text_chunks": [
+# "Hello, how are you?",
+# "I'm doing well, thank you!" 
+# ]
+# }
+params = json.loads(json_str)
+
+model = params["model"]
+url = f"https://api.cohere.com/v1/models/{model}"
+
+headers = {
+    "accept": "application/json",
+    # Read the API key from the environment (the CO_API_KEY variable is
+    # assumed here); credentials must never be hard-coded.
+    "authorization": f"Bearer {os.environ['CO_API_KEY']}"
+}
+
+response = requests.get(url, headers=headers)
+json_response = json.loads(response.text)
+
+tokenizer_url = json_response["tokenizer_url"]
+
+response = requests.get(tokenizer_url)
+
+tokenizer = Tokenizer.from_str(response.text)
+
+output = { "toke_count": [0] * len(params["text_chunks"]) }
+
+for i, chunk in enumerate(params["text_chunks"]):
+    result = tokenizer.encode(sequence=chunk, add_special_tokens=False)
+    output["toke_count"][i] = len(result.ids)
+
+print(json.dumps(output))
diff --git a/operator/text/v0/python/huggingface_tokenizer.py b/operator/text/v0/python/huggingface_tokenizer.py
new file mode 100644
index 00000000..7c2e48bf
--- /dev/null
+++ b/operator/text/v0/python/huggingface_tokenizer.py
@@ -0,0 +1,29 @@
+## Not used yet: there is a problem building this dependency in the Alpine Linux-based container.
+from transformers import AutoTokenizer
+import json
+import sys
+import os
+
+json_str = sys.stdin.buffer.read().decode('utf-8')
+# Sample input
+# {
+# "model": "xxx",
+# "text_chunks": [
+# "Hello, how are you?",
+# "I'm doing well, thank you!"
+# ]
+# }
+params = json.loads(json_str)
+
+model = params["model"]
+tokenizer = AutoTokenizer.from_pretrained(model,
+                                          trust_remote_code=True,
+                                          force_download=True)
+
+output = { "toke_count": [0] * len(params["text_chunks"]) }
+
+for i, chunk in enumerate(params["text_chunks"]):
+    encoding = tokenizer(chunk)
+    output["toke_count"][i] = len(encoding["input_ids"])
+
+print(json.dumps(output))
diff --git a/operator/text/v0/python/mistral_tokenizer.py b/operator/text/v0/python/mistral_tokenizer.py
new file mode 100644
index 00000000..a2e6c616
--- /dev/null
+++ b/operator/text/v0/python/mistral_tokenizer.py
@@ -0,0 +1,29 @@
+## Not used yet: there is a problem building this dependency in the Alpine Linux-based container.
+import json
+import sys
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+from mistral_common.protocol.instruct.messages import UserMessage
+
+json_str = sys.stdin.buffer.read().decode('utf-8')
+# Sample input
+# {
+# "model": "xxx",
+# "text_chunks": [
+# "Hello, how are you?",
+# "I'm doing well, thank you!"
+# ]
+# }
+params = json.loads(json_str)
+
+tokenizer = MistralTokenizer.from_model(params["model"])
+
+output = { "toke_count": [0] * len(params["text_chunks"]) }
+
+for i, chunk in enumerate(params["text_chunks"]):
+    res = tokenizer.encode_chat_completion(
+        ChatCompletionRequest(messages=[UserMessage(content=chunk)])
+    )
+    output["toke_count"][i] = len(res.tokens)
+
+print(json.dumps(output))
diff --git a/operator/text/v0/token_splitter.go b/operator/text/v0/token_splitter.go
new file mode 100644
index 00000000..0c1af76a
--- /dev/null
+++ b/operator/text/v0/token_splitter.go
@@ -0,0 +1,5 @@
+// TODO chuang8511: add an interface & implementation
+// to split by token with different tokenizers.
+// To keep I/O simple between the Python & Go code, the
+// text splitter & tokenizer are fully decoupled. 
+package text diff --git a/operator/text/v0/tokenizer.go b/operator/text/v0/tokenizer.go new file mode 100644 index 00000000..0a9a3abb --- /dev/null +++ b/operator/text/v0/tokenizer.go @@ -0,0 +1,297 @@ +package text + +import ( + "bytes" + "encoding/json" + "fmt" + "os" + "os/exec" + + "github.com/pkoukk/tiktoken-go" +) + +type Tokenizer interface { + Encode(chunks []TextChunk) ([]int, error) +} + +type OpenAITokenizer struct { + model string +} +type MistralTokenizer struct { + model string +} +type CohereTokenizer struct { + model string +} +type EncodingTokenizer struct { + encoding string +} +type HuggingFaceTokenizer struct { + model string +} + +func (choice Choice) GetTokenizer() (Tokenizer, error) { + switch choice.TokenizationMethod { + case "Model": + return getModelTokenizer(choice.Model) + case "Encoding": + return EncodingTokenizer{ + encoding: choice.Encoding, + }, nil + case "HuggingFace": + return HuggingFaceTokenizer{ + model: choice.HuggingFaceModel, + }, nil + } + return nil, fmt.Errorf("tokenization method %s not found", choice.TokenizationMethod) +} + +func getModelTokenizer(model string) (Tokenizer, error) { + if modelInList(model, MistralModels) { + return MistralTokenizer{ + model: model, + }, nil + } + if modelInList(model, OpenAIModels) { + return OpenAITokenizer{ + model: model, + }, nil + } + if modelInList(model, CohereModels) { + return CohereTokenizer{ + model: model, + }, nil + } + return nil, fmt.Errorf("model %s not found", model) +} + +func (t OpenAITokenizer) Encode(textChunks []TextChunk) ([]int, error) { + tke, err := tiktoken.EncodingForModel(t.model) + if err != nil { + return []int{}, fmt.Errorf("failed to get encoding by model name %s: %w", t.model, err) + } + + tokenIdxCountMap := make([]int, len(textChunks)) + + for i, textChunk := range textChunks { + tokenCount := len(tke.Encode(textChunk.Text, nil, nil)) + tokenIdxCountMap[i] = tokenCount + } + + return tokenIdxCountMap, nil +} + +func (t EncodingTokenizer) Encode(textChunks []TextChunk) ([]int, error) { + tke, err := tiktoken.GetEncoding(t.encoding) + if err != nil { + return []int{}, fmt.Errorf("failed to get encoding by encoding name %s: %w", t.encoding, err) + } + + tokenIdxCountMap := make([]int, len(textChunks)) + + for i, textChunk := range textChunks { + tokenCount := len(tke.Encode(textChunk.Text, nil, nil)) + tokenIdxCountMap[i] = tokenCount + } + + return tokenIdxCountMap, nil +} + +func (t MistralTokenizer) Encode(textChunks []TextChunk) ([]int, error) { + return executePythonCode(mistralTokenizer, textChunks, t.model, false) +} + +func (t CohereTokenizer) Encode(textChunks []TextChunk) ([]int, error) { + return executePythonCode(cohereTokenizer, textChunks, t.model, false) +} + +func (t HuggingFaceTokenizer) Encode(textChunks []TextChunk) ([]int, error) { + return executePythonCode(huggingfaceTokenizer, textChunks, t.model, true) +} + +func (output *ChunkTextOutput) setTokenizeChunks(choice Choice) error { + tokenizer, err := choice.GetTokenizer() + + if err != nil { + return fmt.Errorf("failed to get tokenizer: %w", err) + } + + tokens, err := tokenizer.Encode(output.TextChunks) + + if err != nil { + return fmt.Errorf("failed to encode text: %w", err) + } + + for i, tokenCount := range tokens { + output.TextChunks[i].TokenCount = tokenCount + output.ChunksTokenCount += tokenCount + } + + return nil +} + +func (output *ChunkTextOutput) setFileTokenCount(choice Choice, rawText string) error { + tokenizer, err := choice.GetTokenizer() + + if err != nil { + return fmt.Errorf("failed 
to get tokenizer: %w", err) + } + + tokenMap, err := tokenizer.Encode([]TextChunk{ + { + Text: rawText, + }, + }) + + if err != nil { + return fmt.Errorf("failed to encode text: %w", err) + } + + output.TokenCount = tokenMap[0] + + return nil +} + +type pythonRunnerOutput struct { + TokenCountMap []int `json:"toke_count"` +} + +func executePythonCode(pythonCode string, textChunks []TextChunk, model string, needTempDir bool) ([]int, error) { + + tokenCounts := make([]int, len(textChunks)) + params := make(map[string]interface{}) + params["text_chunks"] = make([]string, 0) + for _, textChunk := range textChunks { + params["text_chunks"] = append(params["text_chunks"].([]string), textChunk.Text) + } + + params["model"] = model + + paramsJSON, err := json.Marshal(params) + + if err != nil { + return tokenCounts, fmt.Errorf("failed to marshal chunk map: %w", err) + } + + cmdRunner := exec.Command(pythonInterpreter, "-c", pythonCode) + + if needTempDir { + tempDir, _ := os.MkdirTemp("", "downloaded-models") + defer os.RemoveAll(tempDir) + cmdRunner.Env = append(os.Environ(), "HOME="+tempDir) + } + + stdin, err := cmdRunner.StdinPipe() + + if err != nil { + return tokenCounts, fmt.Errorf("failed to get stdin pipe: %w", err) + } + + errChan := make(chan error, 1) + go func() { + defer stdin.Close() + _, err := stdin.Write(paramsJSON) + if err != nil { + errChan <- err + return + } + errChan <- nil + }() + + var stdoutStderr bytes.Buffer + cmdRunner.Stdout = &stdoutStderr + cmdRunner.Stderr = &stdoutStderr + + err = cmdRunner.Start() + if err != nil { + return tokenCounts, fmt.Errorf("error starting command: %v", err) + } + + err = <-errChan + if err != nil { + return tokenCounts, fmt.Errorf("error writing to stdin: %v", err) + } + + err = cmdRunner.Wait() + if err != nil { + return tokenCounts, fmt.Errorf("failed to wait for command: %w, \nOutput: %s", err, stdoutStderr.String()) + } + + outputBytes := stdoutStderr.Bytes() + + var output pythonRunnerOutput + err = json.Unmarshal(outputBytes, &output) + if err != nil { + return tokenCounts, fmt.Errorf("failed to unmarshal output: %s", string(outputBytes)) + } + + return output.TokenCountMap, nil +} + +var OpenAIModels = []string{ + "gpt-4o", + "gpt-4", + "gpt-3.5-turbo", + "text-davinci-003", + "text-davinci-002", + "text-davinci-001", + "text-curie-001", + "text-babbage-001", + "text-ada-001", + "davinci", + "curie", + "babbage", + "ada", + "code-davinci-002", + "code-davinci-001", + "code-cushman-002", + "code-cushman-001", + "davinci-codex", + "cushman-codex", + "text-davinci-edit-001", + "code-davinci-edit-001", + "text-embedding-ada-002", + "text-similarity-davinci-001", + "text-similarity-curie-001", + "text-similarity-babbage-001", + "text-similarity-ada-001", + "text-search-davinci-doc-001", + "text-search-curie-doc-001", + "text-search-babbage-doc-001", + "text-search-ada-doc-001", + "code-search-babbage-code-001", + "code-search-ada-code-001", + "gpt2", +} + +var MistralModels = []string{ + "open-mixtral-8x22b", + "open-mixtral-8x7b", + "open-mistral-7b", + "mistral-large-latest", + "mistral-small-latest", + "codestral-22b", + "mistral-embed", +} + +var CohereModels = []string{ + "command-r-plus", + "command-r", + "command", + "command-nightly", + "command-light", + "command-light-nightly", + "embed-english-v3.0", + "embed-multilingual-v3.0", + "embed-english-light-v3.0", + "embed-multilingual-light-v3.0", +} + +func modelInList(model string, list []string) bool { + for _, m := range list { + if m == model { + return true + } + } + return 
false +} diff --git a/operator/text/v0/tokenizer_test.go b/operator/text/v0/tokenizer_test.go new file mode 100644 index 00000000..f22695f1 --- /dev/null +++ b/operator/text/v0/tokenizer_test.go @@ -0,0 +1,2 @@ +// TODO chuang8511 add test cases for tokenizer +package text
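+
+import (
+	"testing"
+
+	"github.com/frankban/quicktest"
+)
+
+// TestGetTokenizer is a minimal sketch of the coverage the TODO above calls
+// for; it assumes network access so tiktoken can fetch the BPE files for the
+// chosen OpenAI model.
+func TestGetTokenizer(t *testing.T) {
+	c := quicktest.New(t)
+
+	choice := Choice{TokenizationMethod: "Model", Model: "gpt-3.5-turbo"}
+	tokenizer, err := choice.GetTokenizer()
+	c.Assert(err, quicktest.IsNil)
+
+	counts, err := tokenizer.Encode([]TextChunk{{Text: "Hello, how are you?"}})
+	c.Assert(err, quicktest.IsNil)
+	c.Assert(counts, quicktest.HasLen, 1)
+}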