Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

feat(document): improve document operator #287

Merged
merged 5 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ require (
github.com/frankban/quicktest v1.14.6
github.com/gabriel-vasile/mimetype v1.4.3
github.com/gage-technologies/mistral-go v1.1.0
github.com/gen2brain/go-fitz v1.23.7
github.com/go-chi/chi/v5 v5.1.0
github.com/go-openapi/strfmt v0.23.0
github.com/go-resty/resty/v2 v2.12.0
Expand Down Expand Up @@ -53,6 +54,7 @@ require (
github.com/tmc/langchaingo v0.1.10
github.com/u2takey/ffmpeg-go v0.5.0
github.com/weaviate/weaviate v1.26.0-rc.1
github.com/xuri/excelize/v2 v2.8.1
go.mongodb.org/mongo-driver v1.16.0
go.uber.org/zap v1.24.0
golang.org/x/image v0.18.0
Expand Down Expand Up @@ -85,11 +87,14 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 // indirect
github.com/montanaflynn/stats v0.7.1 // indirect
github.com/oklog/ulid v1.3.1 // indirect
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
github.com/xdg-go/scram v1.1.2 // indirect
github.com/xdg-go/stringprep v1.0.4 // indirect
github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 // indirect
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 // indirect
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d // indirect
)

Expand Down Expand Up @@ -150,7 +155,7 @@ require (
github.com/pierrec/lz4/v4 v4.1.18 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/richardlehane/mscfb v1.0.3 // indirect
github.com/richardlehane/mscfb v1.0.4 // indirect
github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/rivo/uniseg v0.4.4 // indirect
github.com/rogpeppe/go-internal v1.11.0 // indirect
Expand Down
16 changes: 13 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uq
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
github.com/gage-technologies/mistral-go v1.1.0 h1:POv1wM9jA/9OBXGV2YdPi9Y/h09+MjCbUF+9hRYlVUI=
github.com/gage-technologies/mistral-go v1.1.0/go.mod h1:tF++Xt7U975GcLlzhrjSQb8l/x+PrriO9QEdsgm9l28=
github.com/gen2brain/go-fitz v1.23.7 h1:HPhzEVzmOINvCKqQgB/DwMzYh4ArIgy3tMwq1eJTcbg=
github.com/gen2brain/go-fitz v1.23.7/go.mod h1:HU04vc+RisUh/kvEd2pB0LAxmK1oyXdN4ftyshUr9rQ=
github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573 h1:u8AQ9bPa9oC+8/A/jlWouakhIvkFfuxgIIRjiy8av7I=
github.com/gigawattio/window v0.0.0-20180317192513-0f5467e35573/go.mod h1:eBvb3i++NHDH4Ugo9qCvMw8t0mTSctaEa5blJbWcNxs=
github.com/go-chi/chi/v5 v5.1.0 h1:acVI1TYaD+hhedDJ3r54HyA6sExp3HfXq7QWEEY/xMw=
Expand Down Expand Up @@ -376,6 +378,8 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/modocache/gover v0.0.0-20171022184752-b58185e213c5/go.mod h1:caMODM3PzxT8aQXRPkAt8xlV/e7d7w8GM5g0fa5F0D8=
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw=
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE=
github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow=
Expand Down Expand Up @@ -412,8 +416,8 @@ github.com/redis/go-redis/v9 v9.5.1/go.mod h1:hdY0cQFCN4fnSYT6TkisLufl/4W5UIXyv0
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/richardlehane/mscfb v1.0.3 h1:rD8TBkYWkObWO0oLDFCbwMeZ4KoalxQy+QgniCj3nKI=
github.com/richardlehane/mscfb v1.0.3/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk=
github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM=
github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk=
github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
github.com/richardlehane/msoleps v1.0.3 h1:aznSZzrwYRl3rLKRT3gUk9am7T/mLNSnJINvN0AQoVM=
github.com/richardlehane/msoleps v1.0.3/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg=
Expand Down Expand Up @@ -499,6 +503,12 @@ github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3k
github.com/xdg-go/stringprep v1.0.2/go.mod h1:8F9zXuvzgwmyT5DUm4GUfZGDdT3W+LCvS6+da4O5kxM=
github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8=
github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM=
github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53 h1:Chd9DkqERQQuHpXjR/HSV1jLZA6uaoiwwH3vSuF3IW0=
github.com/xuri/efp v0.0.0-20231025114914-d1ff6096ae53/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI=
github.com/xuri/excelize/v2 v2.8.1 h1:pZLMEwK8ep+CLIUWpWmvW8IWE/yxqG0I1xcN6cVMGuQ=
github.com/xuri/excelize/v2 v2.8.1/go.mod h1:oli1E4C3Pa5RXg1TBXn4ENCXDV5JUMlBluUhG7c+CEE=
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 h1:qhbILQo1K3mphbwKh1vNm4oGezE1eF9fQWmNiIpSfI4=
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
github.com/xwb1989/sqlparser v0.0.0-20180606152119-120387863bf2 h1:zzrxE1FKn5ryBNl9eKOeqQ58Y/Qpo3Q9QNxKHX5uzzQ=
github.com/xwb1989/sqlparser v0.0.0-20180606152119-120387863bf2/go.mod h1:hzfGeIUDq/j97IG+FhNqkowIyEcD88LrW6fyU3K3WqY=
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d h1:splanxYIlg+5LfHAM6xpdFEAYOk8iySO56hMFq6uLyA=
Expand Down Expand Up @@ -677,10 +687,10 @@ golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGm
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190329151228-23e29df326fe/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190416151739-9c9e1878f421/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190420181800-aa740d480789/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs=
golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q=
golang.org/x/tools v0.0.0-20190531172133-b3315ee88b7d/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc=
Expand Down
28 changes: 28 additions & 0 deletions internal/util/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,8 @@ func TransformContentTypeToFileExtension(contentType string) string {
return "html"
case "application/pdf":
return "pdf"
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
return "xlsx"
}
return ""
}
Expand All @@ -154,3 +156,29 @@ func GetInstillUserUID(vars map[string]any) string {
// GetInstillRequesterUID returns the value stored in vars under the
// "__PIPELINE_REQUESTER_UID" key, asserted to be a string.
//
// The type assertion is unchecked: the call panics when the key is absent or
// its value is not a string.
func GetInstillRequesterUID(vars map[string]any) string {
	requesterUID := vars["__PIPELINE_REQUESTER_UID"].(string)
	return requesterUID
}

// ConvertDataFrameToMarkdownTable renders rows as a Markdown table.
//
// The first row is used as the header and is followed by a "---" separator
// row with one column per header cell. Every remaining row becomes one table
// line. Cell text is written verbatim: "|" and newline characters are not
// escaped, and rows are neither padded nor truncated to the header width.
//
// A nil or empty rows slice yields the empty string instead of panicking on
// the missing header row.
func ConvertDataFrameToMarkdownTable(rows [][]string) string {
	if len(rows) == 0 {
		return ""
	}

	var sb strings.Builder

	// writeRow emits a single "| a | b |" line terminated by a newline.
	writeRow := func(cells []string) {
		sb.WriteString("|")
		for _, cell := range cells {
			fmt.Fprintf(&sb, " %s |", cell)
		}
		sb.WriteString("\n")
	}

	header := rows[0]
	writeRow(header)

	// Separator row: one "---" column per header cell.
	sb.WriteString("|")
	for range header {
		sb.WriteString(" --- |")
	}
	sb.WriteString("\n")

	for _, row := range rows[1:] {
		writeRow(row)
	}

	return sb.String()
}
30 changes: 29 additions & 1 deletion operator/document/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ It can carry out the following tasks:

- [Convert To Markdown](#convert-to-markdown)
- [Convert To Text](#convert-to-text)
- [Convert To Images](#convert-to-images)



Expand Down Expand Up @@ -37,13 +38,15 @@ Convert document to text in Markdown format.
| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_MARKDOWN` |
| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML to be converted to text in Markdown format |
| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Body | `body` | string | Markdown text converted from the document |
| Filename (optional) | `filename` | string | The name of the file |



Expand All @@ -59,12 +62,14 @@ Convert document to text.
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_TEXT` |
| Document (required) | `doc` | string | Base64 encoded document (PDF, DOC, DOCX, XML, HTML, RTF, etc.) to be converted to plain text |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Body | `body` | string | Plain text converted from the document |
| Filename (optional) | `filename` | string | The name of the file |
| Meta | `meta` | object | Metadata extracted from the document |
| MSecs | `msecs` | number | Time taken to convert the document |
| Error | `error` | string | Error message if any during the conversion process |
Expand All @@ -74,4 +79,27 @@ Convert document to text.



### Convert To Images

Convert PDF to images.


| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_IMAGES` |
| PDF (required) | `pdf` | string | Base64 encoded PDF to be converted to images |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf' |



| Output | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Images | `images` | array[string] | Images converted from the PDF document |
| Filenames (optional) | `filenames` | array[string] | The filenames of the images. The page number is appended to each filename, e.g. 'example-1.jpg' |







3 changes: 2 additions & 1 deletion operator/document/v0/config/definition.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{
"availableTasks": [
"TASK_CONVERT_TO_MARKDOWN",
"TASK_CONVERT_TO_TEXT"
"TASK_CONVERT_TO_TEXT",
"TASK_CONVERT_TO_IMAGES"
],
"custom": false,
"documentationUrl": "https://www.instill.tech/docs/component/operator/document",
Expand Down
116 changes: 115 additions & 1 deletion operator/document/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"instillUIOrder": 0,
"properties": {
"document": {
"description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML to be converted to text in Markdown format",
"description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format",
"instillAcceptFormats": [
"*/*"
],
Expand All @@ -20,6 +20,19 @@
],
"title": "Document",
"type": "string"
},
"filename": {
"description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'",
"instillAcceptFormats": [
"string"
],
"instillUIOrder": 1,
"instillUpstreamTypes": [
"reference",
"value"
],
"title": "Filename",
"type": "string"
}
},
"required": [
Expand All @@ -39,6 +52,13 @@
"instillUIOrder": 0,
"title": "Body",
"type": "string"
},
"filename": {
"description": "The name of the file",
"instillFormat": "string",
"instillUIOrder": 1,
"title": "Filename",
"type": "string"
}
},
"required": [
Expand Down Expand Up @@ -69,6 +89,19 @@
],
"title": "Document",
"type": "string"
},
"filename": {
"description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'",
"instillAcceptFormats": [
"string"
],
"instillUIOrder": 1,
"instillUpstreamTypes": [
"reference",
"value"
],
"title": "Filename",
"type": "string"
}
},
"required": [
Expand Down Expand Up @@ -111,6 +144,13 @@
"instillUIOrder": 2,
"title": "MSecs",
"type": "number"
},
"filename": {
"description": "The name of the file",
"instillFormat": "string",
"instillUIOrder": 1,
"title": "Filename",
"type": "string"
}
},
"required": [
Expand All @@ -122,5 +162,79 @@
"title": "Output",
"type": "object"
}
},
"TASK_CONVERT_TO_IMAGES": {
"instillShortDescription": "Convert PDF to images.",
"input": {
"description": "Input",
"instillEditOnNodeFields": [
"pdf"
],
"instillUIOrder": 0,
"properties": {
"pdf": {
"description": "Base64 encoded PDF to be converted to images",
"instillAcceptFormats": [
"*/*"
],
"instillUIMultiline": true,
"instillUIOrder": 0,
"instillUpstreamTypes": [
"reference"
],
"title": "PDF",
"type": "string"
},
"filename": {
"description": "The name of the file, please remember to add the file extension in the end of file name. e.g. 'example.pdf'",
"instillAcceptFormats": [
"string"
],
"instillUIOrder": 1,
"instillUpstreamTypes": [
"reference",
"value"
],
"title": "Filename",
"type": "string"
}
},
"required": [
"pdf"
],
"title": "Input",
"type": "object"
},
"output": {
"description": "Output",
"instillUIOrder": 0,
"properties": {
"images": {
"description": "Images converted from the PDF document",
"instillFormat": "array:image/*",
"instillUIOrder": 0,
"items": {
"type": "string"
},
"title": "Images",
"type": "array"
},
"filenames": {
"description": "The filenames of the images. The filenames will be appended with the page number. e.g. 'example-1.jpg'",
"instillFormat": "array:string",
"instillUIOrder": 1,
"items": {
"type": "string"
},
"title": "Filenames",
"type": "array"
}
},
"required": [
"images"
],
"title": "Output",
"type": "object"
}
}
}
Loading
Loading