diff --git a/go.mod b/go.mod index 06348d65..daf61415 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/frankban/quicktest v1.14.6 github.com/gabriel-vasile/mimetype v1.4.3 github.com/gage-technologies/mistral-go v1.1.0 + github.com/gen2brain/go-fitz v1.23.7 github.com/go-chi/chi/v5 v5.1.0 github.com/go-openapi/strfmt v0.23.0 github.com/go-resty/resty/v2 v2.12.0 @@ -53,6 +54,7 @@ require ( github.com/tmc/langchaingo v0.1.10 github.com/u2takey/ffmpeg-go v0.5.0 github.com/weaviate/weaviate v1.26.0-rc.1 + github.com/xuri/excelize/v2 v2.8.1 go.mongodb.org/mongo-driver v1.16.0 go.uber.org/zap v1.24.0 golang.org/x/image v0.18.0 @@ -85,11 +87,14 @@ require ( github.com/josharian/intern v1.0.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect github.com/montanaflynn/stats v0.7.1 // indirect github.com/oklog/ulid v1.3.1 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect github.com/xdg-go/scram v1.1.2 // indirect github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 // indirect + github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 // indirect github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect ) @@ -150,7 +155,7 @@ require ( github.com/pierrec/lz4/v4 v4.1.18 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect - github.com/richardlehane/mscfb v1.0.3 // indirect + github.com/richardlehane/mscfb v1.0.4 // indirect github.com/richardlehane/msoleps v1.0.3 // indirect github.com/rivo/uniseg v0.4.4 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect diff --git a/go.sum b/go.sum index 3e7c65fb..e96b6eb4 100644 --- a/go.sum +++ b/go.sum @@ -128,6 +128,8 @@ github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uq github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk= github.com/gage-technologies/mistral-go v1.1.0 h1:POv1wM9jA/9OBXGV2YdPi9Y/h09+MjCbUF+9hRYlVUI= github.com/gage-technologies/mistral-go v1.1.0/go.mod h1:tF++Xt7U975GcLlzhrjSQb8l/x+PrriO9QEdsgm9l28= +github.com/gen2brain/go-fitz v1.23.7 h1:HPhzEVzmOINvCKqQgB/DwMzYh4ArIgy3tMwq1eJTcbg= +github.com/gen2brain/go-fitz v1.23.7/go.mod h1:HU04vc+RisUh/kvEd2pB0LAxmK1oyXdN4ftyshUr9rQ= github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I= github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs= github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw= @@ -376,6 +378,8 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw= +github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8= github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= @@ -412,8 +416,8 @@ github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0 github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= -github.com/richardlehane/mscfb v1.0.3 h1:rD8TBkYWkObWO0oLDFCbwMeZ4KoalxQy+QgniCj3nKI= -github.com/richardlehane/mscfb v1.0.3/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= +github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM= +github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM= github.com/richardlehane/msoleps v1.0.3/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= @@ -499,6 +503,12 @@ github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3k github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM= github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= +github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 h1:Chd9DkqERQQuHpXjR/HSV1jLZA6uaoiwwH3vSuF3IW0= +github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI= +github.com/xuri/excelize/v2 v2.8.1 h1:pZLMEwK8ep+CLIUWpWmvW8IWE/yxqG0I1xcN6cVMGuQ= +github.com/xuri/excelize/v2 v2.8.1/go.mod h1:oli1E4C3Pa5RXg1TBXn4ENCXDV5JUMlBluUhG7c+CEE= +github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 h1:qhbILQo1K3mphbwKh1vNm4oGezE1eF9fQWmNiIpSfI4= +github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ= github.com/xwb1989/sqlparser v0.0.0-20180606152119-120387863bf2 h1:zzrxE1FKn5ryBNl9eKOeqQ58Y/Qpo3Q9QNxKHX5uzzQ= github.com/xwb1989/sqlparser v0.0.0-20180606152119-120387863bf2/go.mod h1:hzfGeIUDq/j97IG+FhNqkowIyEcD88LrW6fyU3K3WqY= github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA= @@ -677,10 +687,10 @@ golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190329151228-23e29df326fe/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190416151739-9c9e1878f421/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190420181800-aa740d480789/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= -golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190531172133-b3315ee88b7d/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= diff --git a/internal/util/helper.go b/internal/util/helper.go index 8a579f91..87a074a0 100644 --- a/internal/util/helper.go +++ b/internal/util/helper.go @@ -129,6 +129,8 @@ func TransformContentTypeToFileExtension(contentType string) string { return "html" case "application/pdf": return "pdf" + case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": + return "xlsx" } return "" } @@ -154,3 +156,29 @@ func GetInstillUserUID(vars map[string]any) string { func GetInstillRequesterUID(vars map[string]any) string { return vars["__PIPELINE_REQUESTER_UID"].(string) } + +func ConvertDataFrameToMarkdownTable(rows [][]string) string { + var sb strings.Builder + + sb.WriteString("|") + for _, colCell := range rows[0] { + sb.WriteString(fmt.Sprintf(" %s |", colCell)) + } + sb.WriteString("\n") + + sb.WriteString("|") + for range rows[0] { + sb.WriteString(" --- |") + } + sb.WriteString("\n") + + for _, row := range rows[1:] { + sb.WriteString("|") + for _, colCell := range row { + sb.WriteString(fmt.Sprintf(" %s |", colCell)) + } + sb.WriteString("\n") + } + + return sb.String() +} diff --git a/operator/document/v0/README.mdx b/operator/document/v0/README.mdx index 37cc2013..3aced235 100644 --- a/operator/document/v0/README.mdx +++ b/operator/document/v0/README.mdx @@ -10,6 +10,7 @@ It can carry out the following tasks: - [Convert To Markdown](#convert-to-markdown) - [Convert To Text](#convert-to-text) +- [Convert To Images](#convert-to-images) @@ -37,13 +38,15 @@ Convert document to text in Markdown format. | Input | ID | Type | Description | | :--- | :--- | :--- | :--- | | Task ID (required) | `task` | string | `TASK_CONVERT_TO_MARKDOWN` | -| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML to be converted to text in Markdown format | +| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format | +| Filename | `filename` | string | The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf' | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | | Body | `body` | string | Markdown text converted from the PDF document | +| Filename (optional) | `filename` | string | The name of the file | @@ -59,12 +62,14 @@ Convert document to text. | :--- | :--- | :--- | :--- | | Task ID (required) | `task` | string | `TASK_CONVERT_TO_TEXT` | | Document (required) | `doc` | string | Base64 encoded document (PDF, DOC, DOCX, XML, HTML, RTF, etc.) to be converted to plain text | +| Filename | `filename` | string | The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf' | | Output | ID | Type | Description | | :--- | :--- | :--- | :--- | | Body | `body` | string | Plain text converted from the document | +| Filename (optional) | `filename` | string | The name of the file | | Meta | `meta` | object | Metadata extracted from the document | | MSecs | `msecs` | number | Time taken to convert the document | | Error | `error` | string | Error message if any during the conversion process | @@ -74,4 +79,27 @@ Convert document to text. +### Convert To Images + +Convert PDF to images. + + +| Input | ID | Type | Description | +| :--- | :--- | :--- | :--- | +| Task ID (required) | `task` | string | `TASK_CONVERT_TO_IMAGES` | +| PDF (required) | `pdf` | string | Base64 encoded PDF to be converted to images | +| Filename | `filename` | string | The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf' | + + + +| Output | ID | Type | Description | +| :--- | :--- | :--- | :--- | +| Images | `images` | array[string] | Images converted from the PDF document | +| Filenames (optional) | `filenames` | array[string] | The filenames of the images. The filenames will be appended with the page number. e.g. 'example-1.jpg' | + + + + + + diff --git a/operator/document/v0/config/definition.json b/operator/document/v0/config/definition.json index fd88661f..955a7fac 100644 --- a/operator/document/v0/config/definition.json +++ b/operator/document/v0/config/definition.json @@ -1,7 +1,8 @@ { "availableTasks": [ "TASK_CONVERT_TO_MARKDOWN", - "TASK_CONVERT_TO_TEXT" + "TASK_CONVERT_TO_TEXT", + "TASK_CONVERT_TO_IMAGES" ], "custom": false, "documentationUrl": "https://www.instill.tech/docs/component/operator/document", diff --git a/operator/document/v0/config/tasks.json b/operator/document/v0/config/tasks.json index 92f91240..2102adbd 100644 --- a/operator/document/v0/config/tasks.json +++ b/operator/document/v0/config/tasks.json @@ -9,7 +9,7 @@ "instillUIOrder": 0, "properties": { "document": { - "description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML to be converted to text in Markdown format", + "description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format", "instillAcceptFormats": [ "*/*" ], @@ -20,6 +20,19 @@ ], "title": "Document", "type": "string" + }, + "filename": { + "description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'", + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 1, + "instillUpstreamTypes": [ + "reference", + "value" + ], + "title": "Filename", + "type": "string" } }, "required": [ @@ -39,6 +52,13 @@ "instillUIOrder": 0, "title": "Body", "type": "string" + }, + "filename": { + "description": "The name of the file", + "instillFormat": "string", + "instillUIOrder": 1, + "title": "Filename", + "type": "string" } }, "required": [ @@ -69,6 +89,19 @@ ], "title": "Document", "type": "string" + }, + "filename": { + "description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'", + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 1, + "instillUpstreamTypes": [ + "reference", + "value" + ], + "title": "Filename", + "type": "string" } }, "required": [ @@ -111,6 +144,13 @@ "instillUIOrder": 2, "title": "MSecs", "type": "number" + }, + "filename": { + "description": "The name of the file", + "instillFormat": "string", + "instillUIOrder": 1, + "title": "Filename", + "type": "string" } }, "required": [ @@ -122,5 +162,79 @@ "title": "Output", "type": "object" } + }, + "TASK_CONVERT_TO_IMAGES": { + "instillShortDescription": "Convert PDF to images.", + "input": { + "description": "Input", + "instillEditOnNodeFields": [ + "pdf" + ], + "instillUIOrder": 0, + "properties": { + "pdf": { + "description": "Base64 encoded PDF to be converted to images", + "instillAcceptFormats": [ + "*/*" + ], + "instillUIMultiline": true, + "instillUIOrder": 0, + "instillUpstreamTypes": [ + "reference" + ], + "title": "PDF", + "type": "string" + }, + "filename": { + "description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'", + "instillAcceptFormats": [ + "string" + ], + "instillUIOrder": 1, + "instillUpstreamTypes": [ + "reference", + "value" + ], + "title": "Filename", + "type": "string" + } + }, + "required": [ + "pdf" + ], + "title": "Input", + "type": "object" + }, + "output": { + "description": "Output", + "instillUIOrder": 0, + "properties": { + "images": { + "description": "Images converted from the PDF document", + "instillFormat": "array:image/*", + "instillUIOrder": 0, + "items": { + "type": "string" + }, + "title": "Images", + "type": "array" + }, + "filenames": { + "description": "The filenames of the images. The filenames will be appended with the page number. e.g. 'example-1.jpg'", + "instillFormat": "array:string", + "instillUIOrder": 1, + "items": { + "type": "string" + }, + "title": "Filenames", + "type": "array" + } + }, + "required": [ + "images" + ], + "title": "Output", + "type": "object" + } } } diff --git a/operator/document/v0/convert.go b/operator/document/v0/convert.go index 2151146a..9780921e 100644 --- a/operator/document/v0/convert.go +++ b/operator/document/v0/convert.go @@ -3,6 +3,7 @@ package document import ( "bytes" "fmt" + "strings" "time" "unicode/utf8" @@ -32,7 +33,7 @@ var ( "text/url": true, "text/xml": true, "application/xml": true, - "image/jpeg": true, + "image/jpeg": false, "image/png": true, "image/tif": true, "image/tiff": true, @@ -43,7 +44,8 @@ var ( // ConvertToTextInput defines the input for convert to text task type ConvertToTextInput struct { // Doc: Document to convert - Doc string `json:"doc"` + Doc string `json:"doc"` + Filename string `json:"filename"` } // ConvertToTextOutput defines the output for convert to text task @@ -55,7 +57,8 @@ type ConvertToTextOutput struct { // MSecs: Time taken to convert the document MSecs uint32 `json:"msecs"` // Error: Error message if any during the conversion process - Error string `json:"error"` + Error string `json:"error"` + Filename string `json:"filename"` } type converter interface { @@ -134,5 +137,10 @@ func convertToText(input ConvertToTextInput) (ConvertToTextOutput, error) { return ConvertToTextOutput{}, err } + if input.Filename != "" { + filename := strings.Split(input.Filename, ".")[0] + ".txt" + res.Filename = filename + } + return res, nil } diff --git a/operator/document/v0/convert_document_to_markdown.go b/operator/document/v0/convert_document_to_markdown.go index fc5f24c0..11c89908 100644 --- a/operator/document/v0/convert_document_to_markdown.go +++ b/operator/document/v0/convert_document_to_markdown.go @@ -2,6 +2,7 @@ package document import ( "fmt" + "strings" "github.com/instill-ai/component/base" "github.com/instill-ai/component/internal/util" @@ -12,10 +13,12 @@ type convertDocumentToMarkdownInput struct { Document string `json:"document"` DisplayImageTag bool `json:"display-image-tag"` Converter string `json:"converter"` + Filename string `json:"filename"` } type convertDocumentToMarkdownOutput struct { - Body string `json:"body"` + Body string `json:"body"` + Filename string `json:"filename"` } func (e *execution) convertDocumentToMarkdown(input *structpb.Struct) (*structpb.Struct, error) { @@ -50,6 +53,12 @@ func (e *execution) convertDocumentToMarkdown(input *structpb.Struct) (*structpb outputStruct := convertDocumentToMarkdownOutput{ Body: extractedTextInMarkdownFormat, } + + if inputStruct.Filename != "" { + filename := strings.Split(inputStruct.Filename, ".")[0] + ".md" + outputStruct.Filename = filename + } + output, err := base.ConvertToStructpb(outputStruct) if err != nil { return nil, err @@ -88,6 +97,10 @@ func getMarkdownTransformer(fileExtension string, inputStruct convertDocumentToM DisplayImageTag: inputStruct.DisplayImageTag, Converter: inputStruct.Converter, }, nil + case "xlsx": + return XlsxToMarkdownTransformer{ + Base64EncodedText: inputStruct.Document, + }, nil default: return nil, fmt.Errorf("unsupported file type") } diff --git a/operator/document/v0/convert_test.go b/operator/document/v0/convert_test.go index 5d2f21fa..34b3daaa 100644 --- a/operator/document/v0/convert_test.go +++ b/operator/document/v0/convert_test.go @@ -45,10 +45,10 @@ func TestConvertToText(t *testing.T) { name: "Convert png file", filepath: "testdata/test.png", }, - { - name: "Convert jpg file", - filepath: "testdata/test.jpg", - }, + // { + // name: "Convert jpg file", + // filepath: "testdata/test.jpg", + // }, { name: "Convert tiff file", filepath: "testdata/test.tif", diff --git a/operator/document/v0/convert_to_images.go b/operator/document/v0/convert_to_images.go new file mode 100644 index 00000000..33c5c8e8 --- /dev/null +++ b/operator/document/v0/convert_to_images.go @@ -0,0 +1,78 @@ +package document + +import ( + "bytes" + "encoding/base64" + "fmt" + "image/jpeg" + "strings" + + "github.com/gen2brain/go-fitz" + "github.com/instill-ai/component/base" + "google.golang.org/protobuf/types/known/structpb" +) + +type ConvertPDFToImagesInput struct { + PDF string `json:"pdf"` + Filename string `json:"filename"` +} + +type ConvertPDFToImagesOutput struct { + Images []string `json:"images"` + Filenames []string `json:"filenames"` +} + +func (e *execution) convertPDFToImages(input *structpb.Struct) (*structpb.Struct, error) { + + inputStruct := ConvertPDFToImagesInput{} + err := base.ConvertFromStructpb(input, &inputStruct) + if err != nil { + return nil, fmt.Errorf("failed to convert input struct: %w", err) + } + + base64String := strings.Split(inputStruct.PDF, ",")[1] + fileContent, err := base64.StdEncoding.DecodeString(base64String) + + if err != nil { + return nil, fmt.Errorf("failed to decode base64 string: %w", err) + } + + pdfToBeConverted, err := fitz.NewFromMemory(fileContent) + + if err != nil { + return nil, fmt.Errorf("failed to create PDF from memory: %w", err) + } + + defer pdfToBeConverted.Close() + + images := make([]string, pdfToBeConverted.NumPage()) + filenames := make([]string, pdfToBeConverted.NumPage()) + + for n := 0; n < pdfToBeConverted.NumPage(); n++ { + img, err := pdfToBeConverted.Image(n) + if err != nil { + return nil, fmt.Errorf("failed to extract image from PDF: %w", err) + } + + var buf bytes.Buffer + err = jpeg.Encode(&buf, img, &jpeg.Options{Quality: jpeg.DefaultQuality}) + + if err != nil { + return nil, fmt.Errorf("failed to encode image to JPEG: %w", err) + } + + imgBase64Str := base64.StdEncoding.EncodeToString(buf.Bytes()) + images[n] = fmt.Sprintf("data:image/jpeg;base64,%s", imgBase64Str) + + filename := strings.Split(inputStruct.Filename, ".")[0] + filenames[n] = fmt.Sprintf("%s_%d.jpg", filename, n) + } + + outputStruct := ConvertPDFToImagesOutput{ + Images: images, + Filenames: filenames, + } + + return base.ConvertToStructpb(outputStruct) + +} diff --git a/operator/document/v0/main.go b/operator/document/v0/main.go index 879904f3..34fe202e 100644 --- a/operator/document/v0/main.go +++ b/operator/document/v0/main.go @@ -16,6 +16,7 @@ import ( const ( taskConvertToMarkdown string = "TASK_CONVERT_TO_MARKDOWN" taskConvertToText string = "TASK_CONVERT_TO_TEXT" + taskConvertToImages string = "TASK_CONVERT_TO_IMAGES" pythonInterpreter string = "/opt/venv/bin/python" ) @@ -81,6 +82,8 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution, e.execute = e.convertDocumentToMarkdown case taskConvertToText: e.execute = e.convertToText + case taskConvertToImages: + e.execute = e.convertPDFToImages default: return nil, fmt.Errorf(fmt.Sprintf("%s task is not supported.", x.Task)) } diff --git a/operator/document/v0/markdown_transformer.go b/operator/document/v0/markdown_transformer.go index 26969a71..2893eaee 100644 --- a/operator/document/v0/markdown_transformer.go +++ b/operator/document/v0/markdown_transformer.go @@ -1,6 +1,7 @@ package document import ( + "bytes" "encoding/base64" "encoding/json" "fmt" @@ -13,6 +14,8 @@ import ( md "github.com/JohannesKaufmann/html-to-markdown" "github.com/instill-ai/component/base" + "github.com/instill-ai/component/internal/util" + "github.com/xuri/excelize/v2" ) type MarkdownTransformer interface { @@ -138,6 +141,45 @@ func (t HTMLToMarkdownTransformer) Transform() (string, error) { return markdown, nil } +type XlsxToMarkdownTransformer struct { + Base64EncodedText string +} + +func (t XlsxToMarkdownTransformer) Transform() (string, error) { + + base64String := strings.Split(t.Base64EncodedText, ",")[1] + fileContent, err := base64.StdEncoding.DecodeString(base64String) + + if err != nil { + return "", fmt.Errorf("failed to decode base64 to file: %w", err) + } + + reader := bytes.NewReader(fileContent) + + f, err := excelize.OpenReader(reader) + if err != nil { + return "", fmt.Errorf("failed to open reader: %w", err) + } + defer f.Close() + + sheets := f.GetSheetList() + + var result string + for _, sheet := range sheets { + rows, err := f.GetRows(sheet) + + if err != nil { + return "", fmt.Errorf("failed to get rows: %w", err) + } + + result += fmt.Sprintf("# %s\n", sheet) + result += util.ConvertDataFrameToMarkdownTable(rows) + result += "\n\n" + } + + return result, nil +} + type pythonRunnerOutput struct { Body string `json:"body"` }