Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Commit

Permalink
feat(document): support xls and csv (#337)
Browse files Browse the repository at this point in the history
Because

- we do not support xls & csv to markdown

This commit

- support xls & csv
  • Loading branch information
chuang8511 authored Sep 18, 2024
1 parent 79b84b3 commit 892c51f
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 2 deletions.
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/cohere-ai/cohere-go/v2 v2.8.5
github.com/emersion/go-imap/v2 v2.0.0-beta.3
github.com/emersion/go-message v0.18.1
github.com/extrame/xls v0.0.1
github.com/fatih/color v1.16.0
github.com/fogleman/gg v1.3.0
github.com/frankban/quicktest v1.14.6
Expand Down Expand Up @@ -77,6 +78,7 @@ require (
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7 // indirect
github.com/go-openapi/analysis v0.21.2 // indirect
github.com/go-openapi/errors v0.22.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7 h1:n+nk0bNe2+gVbRI8WRbLFVwwcBQ0rr5p+gzkKb6ol8c=
github.com/extrame/ole2 v0.0.0-20160812065207-d69429661ad7/go.mod h1:GPpMrAfHdb8IdQ1/R2uIRBsNfnPnwsYE9YYI5WyY1zw=
github.com/extrame/xls v0.0.1 h1:jI7L/o3z73TyyENPopsLS/Jlekm3nF1a/kF5hKBvy/k=
github.com/extrame/xls v0.0.1/go.mod h1:iACcgahst7BboCpIMSpnFs4SKyU9ZjsvZBfNbUxZOJI=
github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM=
github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE=
github.com/fatih/set v0.2.1 h1:nn2CaJyknWE/6txyUDGwysr3G5QC6xWB/PtVjPBbeaA=
Expand Down
4 changes: 4 additions & 0 deletions internal/util/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,10 @@ func TransformContentTypeToFileExtension(contentType string) string {
return "pdf"
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
return "xlsx"
case "application/vnd.ms-excel":
return "xls"
case "text/csv":
return "csv"
}
return ""
}
Expand Down
2 changes: 1 addition & 1 deletion operator/document/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ Convert document to text in Markdown format.
| Input | ID | Type | Description |
| :--- | :--- | :--- | :--- |
| Task ID (required) | `task` | string | `TASK_CONVERT_TO_MARKDOWN` |
| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format |
| Document (required) | `document` | string | Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX/XLS/CSV to be converted to text in Markdown format |
| Filename | `filename` | string | The name of the file. Remember to include the file extension at the end of the file name, e.g. 'example.pdf'. |
| Display Image Tag | `display-image-tag` | boolean | Whether to display image tags in the Markdown text. Default is 'false'. Only applicable to the convert-2024-08-28 converter, and only for PPTX/PPT/DOCX/DOC/PDF files. |

Expand Down
2 changes: 1 addition & 1 deletion operator/document/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"instillUIOrder": 0,
"properties": {
"document": {
"description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX to be converted to text in Markdown format",
"description": "Base64 encoded PDF/DOCX/DOC/PPTX/PPT/HTML/XLSX/XLS/CSV to be converted to text in Markdown format",
"instillAcceptFormats": [
"*/*"
],
Expand Down
8 changes: 8 additions & 0 deletions operator/document/v0/convert_document_to_markdown.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ func GetMarkdownTransformer(fileExtension string, inputStruct *ConvertDocumentTo
return XlsxToMarkdownTransformer{
Base64EncodedText: inputStruct.Document,
}, nil
case "xls":
return XlsToMarkdownTransformer{
Base64EncodedText: inputStruct.Document,
}, nil
case "csv":
return CSVToMarkdownTransformer{
Base64EncodedText: inputStruct.Document,
}, nil
default:
return nil, fmt.Errorf("unsupported file type")
}
Expand Down
12 changes: 12 additions & 0 deletions operator/document/v0/convert_document_to_markdown_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ func TestConvertDocumentToMarkdown(t *testing.T) {
name: "Convert XLSX file",
filepath: "testdata/test.xlsx",
},
{
name: "Convert XLS file",
filepath: "testdata/test.xls",
},
{
name: "Convert CSV file",
filepath: "testdata/test.csv",
},
}
for _, test := range tests {
c.Run(test.name, func(c *quicktest.C) {
Expand Down Expand Up @@ -81,6 +89,10 @@ func mimeTypeByExtension(filepath string) string {
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
case "testdata/test.xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
case "testdata/test.xls":
return "application/vnd.ms-excel"
case "testdata/test.csv":
return "text/csv"
default:
return ""
}
Expand Down
82 changes: 82 additions & 0 deletions operator/document/v0/markdown_transformer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@ package document
import (
"bytes"
"encoding/base64"
"encoding/csv"
"fmt"
"os"
"os/exec"
"path/filepath"
"strings"

md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/extrame/xls"
"github.com/instill-ai/component/base"
"github.com/instill-ai/component/internal/util"
"github.com/xuri/excelize/v2"
Expand Down Expand Up @@ -135,6 +137,86 @@ func (t XlsxToMarkdownTransformer) Transform() (converterOutput, error) {
return converterOutput{Body: result}, nil
}

// XlsToMarkdownTransformer converts a base64-encoded legacy Excel (.xls)
// workbook into Markdown text.
type XlsToMarkdownTransformer struct {
	// Base64EncodedText holds the workbook bytes encoded as base64. It may be
	// prefixed with a data-URI header (everything up to and including the
	// first comma), which is stripped before decoding.
	Base64EncodedText string
}

// Transform decodes the workbook and walks every sheet in order. For each
// sheet it writes a "# <sheet name>" heading followed by the sheet's cells
// rendered through util.ConvertDataFrameToMarkdownTable and a blank line.
//
// It returns an error when the payload is not valid base64 or when the
// decoded bytes cannot be opened as an XLS workbook.
func (t XlsToMarkdownTransformer) Transform() (converterOutput, error) {
	// Strip an optional "data:<mime>;base64," header. The previous
	// strings.Split(...)[1] indexing panicked when no header (and therefore no
	// comma) was present.
	encoded := t.Base64EncodedText
	if _, payload, found := strings.Cut(encoded, ","); found {
		encoded = payload
	}

	fileContent, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return converterOutput{}, fmt.Errorf("failed to decode base64 to file: %w", err)
	}

	xlsFile, err := xls.OpenReader(bytes.NewReader(fileContent), "utf-8")
	if err != nil {
		return converterOutput{}, fmt.Errorf("failed to open XLS reader: %w", err)
	}

	// A Builder avoids re-copying the accumulated output on every append.
	var result strings.Builder
	for i := 0; i < xlsFile.NumSheets(); i++ {
		sheet := xlsFile.GetSheet(i)
		if sheet == nil {
			continue
		}

		fmt.Fprintf(&result, "# %s\n", sheet.Name)
		dataFrame := make([][]string, 0)

		for rowIndex := 0; rowIndex <= int(sheet.MaxRow); rowIndex++ {
			row := sheet.Row(rowIndex)
			// Rows the library reports as nil are skipped rather than
			// emitted as empty table rows.
			if row == nil {
				continue
			}
			dataRow := make([]string, 0)
			for colIndex := 0; colIndex <= int(row.LastCol()); colIndex++ {
				dataRow = append(dataRow, row.Col(colIndex))
			}
			dataFrame = append(dataFrame, dataRow)
		}

		result.WriteString(util.ConvertDataFrameToMarkdownTable(dataFrame))
		result.WriteString("\n\n")
	}

	return converterOutput{Body: result.String()}, nil
}

// CSVToMarkdownTransformer converts a base64-encoded CSV document into
// Markdown text.
type CSVToMarkdownTransformer struct {
	// Base64EncodedText holds the CSV bytes encoded as base64. It may be
	// prefixed with a data-URI header (everything up to and including the
	// first comma), which is stripped before decoding.
	Base64EncodedText string
}

// Transform decodes the payload, parses it with encoding/csv using the
// reader's default settings, and renders all records through
// util.ConvertDataFrameToMarkdownTable.
//
// It returns an error when the payload is not valid base64 or when the
// decoded bytes cannot be parsed as CSV.
func (t CSVToMarkdownTransformer) Transform() (converterOutput, error) {
	// Strip an optional "data:<mime>;base64," header. The previous
	// strings.Split(...)[1] indexing panicked when no header (and therefore no
	// comma) was present.
	encoded := t.Base64EncodedText
	if _, payload, found := strings.Cut(encoded, ","); found {
		encoded = payload
	}

	fileContent, err := base64.StdEncoding.DecodeString(encoded)
	if err != nil {
		return converterOutput{}, fmt.Errorf("failed to decode base64 to file: %w", err)
	}

	records, err := csv.NewReader(bytes.NewReader(fileContent)).ReadAll()
	if err != nil {
		return converterOutput{}, fmt.Errorf("failed to read csv: %w", err)
	}

	return converterOutput{Body: util.ConvertDataFrameToMarkdownTable(records)}, nil
}

func writeDecodeToFile(base64Str string, file *os.File) error {
data, err := base64.StdEncoding.DecodeString(base.TrimBase64Mime(base64Str))
if err != nil {
Expand Down
Binary file added operator/document/v0/testdata/test.xls
Binary file not shown.

0 comments on commit 892c51f

Please sign in to comment.