Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

feat(web): improve web operator #292

Merged
merged 4 commits into from
Aug 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/JohannesKaufmann/html-to-markdown v1.5.0
github.com/PuerkitoBio/goquery v1.9.1
github.com/belong-inc/go-hubspot v0.9.0
github.com/chromedp/chromedp v0.10.0
github.com/cohere-ai/cohere-go/v2 v2.8.5
github.com/denisenkom/go-mssqldb v0.12.3
github.com/emersion/go-imap/v2 v2.0.0-beta.3
Expand Down Expand Up @@ -75,6 +76,8 @@ require (
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/go-openapi/analysis v0.21.2 // indirect
github.com/go-openapi/errors v0.22.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
Expand All @@ -83,6 +86,9 @@ require (
github.com/go-openapi/spec v0.20.4 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/go-openapi/validate v0.21.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
Expand Down Expand Up @@ -187,7 +193,7 @@ require (
golang.org/x/mod v0.17.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
Expand Down
21 changes: 19 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/cohere-ai/cohere-go/v2 v2.8.5 h1:AwBnhMUg71Ewqxqb0P+WLs+luiQveRkMH2cjwM3b0AY=
Expand Down Expand Up @@ -202,6 +208,12 @@ github.com/gobuffalo/packr/v2 v2.2.0/go.mod h1:CaAwI0GPIAv+5wKLtv8Afwl+Cm78K/I/V
github.com/gobuffalo/syncx v0.0.0-20190224160051-33c29581e754/go.mod h1:HhnNqWY95UYwwW3uSASeV7vtgYkT2t16hJgV3AEPUpw=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
Expand Down Expand Up @@ -339,6 +351,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/lestrrat-go/jspointer v0.0.0-20181205001929-82fadba7561c h1:pGh5EFIfczeDHwgMHgfwjhZzL+8/E3uZF6T7vER/W8c=
github.com/lestrrat-go/jspointer v0.0.0-20181205001929-82fadba7561c/go.mod h1:xw2Gm4Mg+ST9s8fHR1VkUIyOJMJnSloRZlPQB+wyVpY=
github.com/lestrrat-go/jsref v0.0.0-20211028120858-c0bcbb5abf20 h1:E1vlSQTLj+2EK0mrFGDc57u3QN7RybAUiGEOUAlYRd0=
Expand Down Expand Up @@ -391,6 +405,8 @@ github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn
github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo=
github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8=
github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
github.com/otiai10/gosseract/v2 v2.2.4 h1:h/PV+oJqke8q2Ccw9bjpMBWfd7N2vtGDCUcihZj3nRo=
github.com/otiai10/gosseract/v2 v2.2.4/go.mod h1:ahOp/kHojnOMGv1RaUnR0jwY5JVa6BYKhYAS8nbMLSo=
Expand Down Expand Up @@ -650,13 +666,14 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ=
Expand Down
7 changes: 6 additions & 1 deletion operator/web/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The component configuration is defined and maintained [here](https://github.com/

### Crawl Website

Scrape the website contents.
Crawl the website contents and manipulate the HTML with jQuery commands. The jQuery commands are executed in the following order: only-main-content, remove-tags, and only-include-tags.


| Input | ID | Type | Description |
Expand All @@ -43,6 +43,10 @@ Scrape the website contents.
| Max Number of Pages (required) | `max-k` | integer | The max number of pages to return. If the number is set to 0, all pages will be returned. If the number is set to a positive integer, at most max k pages will be returned. |
| Include Link Text | `include-link-text` | boolean | Indicate whether to scrape the link and include the text of the link associated with this page in the 'link-text' field |
| Include Link HTML | `include-link-html` | boolean | Indicate whether to scrape the link and include the raw HTML of the link associated with this page in the 'link-html' field |
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. |



Expand Down Expand Up @@ -89,6 +93,7 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
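| Timeout | `timeout` | integer | The time to wait for the page to load, in milliseconds (min 0, max 60000). When set to 0, the page is fetched with a plain HTTP request instead of being loaded in a browser. |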



Expand Down
79 changes: 78 additions & 1 deletion operator/web/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
}
},
"TASK_CRAWL_WEBSITE": {
"instillShortDescription": "Scrape the website contents.",
"instillShortDescription": "Crawl the website contents and manipulate html with jquery command. The sequence of jquery commands will be executed in the order of only-main-content, remove-tags, and only-include-tags.",
"input": {
"instillUIOrder": 0,
"properties": {
Expand Down Expand Up @@ -120,6 +120,67 @@
],
"title": "Query",
"type": "string"
},
"only-main-content": {
"description": "Only return the main content of the page excluding header, nav, footer.",
"instillAcceptFormats": [
"boolean"
],
"instillUIOrder": 5,
"instillUpstreamTypes": [
"value",
"reference"
],
"title": "Only Main Content",
"type": "boolean"
},
"remove-tags": {
"description": "A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer'",
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 6,
"instillUpstreamTypes": [
"value",
"reference"
],
"items": {
"type": "string"
},
"title": "Remove Tags",
"type": "array"
},
"only-include-tags": {
"description": "A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer'",
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 7,
"instillUpstreamTypes": [
"value",
"reference"
],
"items": {
"type": "string"
},
"title": "Only Include Tags",
"type": "array"
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.",
"instillAcceptFormats": [
"integer"
],
"instillUIOrder": 8,
"instillUpstreamTypes": [
"value",
"reference"
],
"minimum": 0,
"maximum": 60000,
"title": "Timeout",
"type": "integer"
}
},
"required": [
Expand Down Expand Up @@ -304,6 +365,22 @@
},
"title": "Only Include Tags",
"type": "array"
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.",
"instillAcceptFormats": [
"integer"
],
"instillUIOrder": 5,
"instillUpstreamTypes": [
"value",
"reference"
],
"minimum": 0,
"maximum": 60000,
"title": "Timeout",
"type": "integer"
}
},
"required": [
Expand Down
8 changes: 4 additions & 4 deletions operator/web/v0/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ type component struct {

type execution struct {
base.ComponentExecution
execute func(*structpb.Struct) (*structpb.Struct, error)
externalCaller func(url string) (ioCloser io.ReadCloser, err error)
request func(url string) (*goquery.Document, error)
execute func(*structpb.Struct) (*structpb.Struct, error)
externalCaller func(url string) (ioCloser io.ReadCloser, err error)
getDocAfterRequestURL func(url string, timeout int) (*goquery.Document, error)
}

func Init(bc base.Component) *component {
Expand All @@ -65,7 +65,7 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
e.externalCaller = scrapSitemapCaller
e.execute = e.ScrapeSitemap
case taskScrapeWebpage:
e.request = httpRequest
e.getDocAfterRequestURL = getDocAfterRequestURL
e.execute = e.ScrapeWebpage
default:
return nil, fmt.Errorf(x.Task + " task is not supported.")
Expand Down
6 changes: 3 additions & 3 deletions operator/web/v0/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ func TestScrapeWebpage(t *testing.T) {
c.Run("ScrapeWebpage", func(c *quicktest.C) {
component := Init(base.Component{})
e := &execution{
ComponentExecution: base.ComponentExecution{Component: component, SystemVariables: nil, Setup: nil, Task: taskScrapeWebpage},
request: fakeHTTPRequest,
ComponentExecution: base.ComponentExecution{Component: component, SystemVariables: nil, Setup: nil, Task: taskScrapeWebpage},
getDocAfterRequestURL: fakeHTTPRequest,
}

e.execute = e.ScrapeWebpage
Expand All @@ -95,7 +95,7 @@ func TestScrapeWebpage(t *testing.T) {
})
}

func fakeHTTPRequest(url string) (*goquery.Document, error) {
func fakeHTTPRequest(url string, timeout int) (*goquery.Document, error) {
html := `
<!DOCTYPE html>
<html>
Expand Down
46 changes: 45 additions & 1 deletion operator/web/v0/scrape_webpage.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package web

import (
"context"
"fmt"
"log"
"net/http"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
"github.com/instill-ai/component/base"
"github.com/instill-ai/component/internal/util"
"github.com/k3a/html2text"
Expand All @@ -18,6 +22,7 @@ type ScrapeWebpageInput struct {
OnlyMainContent bool `json:"only-main-content"`
RemoveTags []string `json:"remove-tags,omitempty"`
OnlyIncludeTags []string `json:"only-include-tags,omitempty"`
Timeout int `json:"timeout,omitempty"`
}

type ScrapeWebpageOutput struct {
Expand Down Expand Up @@ -46,7 +51,7 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err

output := ScrapeWebpageOutput{}

doc, err := e.request(inputStruct.URL)
doc, err := e.getDocAfterRequestURL(inputStruct.URL, inputStruct.Timeout)

if err != nil {
return nil, fmt.Errorf("error getting HTML page doc: %v", err)
Expand All @@ -64,6 +69,16 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err

}

// getDocAfterRequestURL retrieves url as a goquery document, choosing the
// fetch strategy from timeout: a positive timeout is forwarded to
// requestToWebpage, while zero or a negative value falls back to httpRequest.
func getDocAfterRequestURL(url string, timeout int) (*goquery.Document, error) {
	// Guard clause: without a positive timeout there is nothing to wait for,
	// so take the plain HTTP path.
	if timeout <= 0 {
		return httpRequest(url)
	}
	return requestToWebpage(url, timeout)
}

func httpRequest(url string) (*goquery.Document, error) {
client := &http.Client{}
res, err := client.Get(url)
Expand All @@ -80,6 +95,35 @@ func httpRequest(url string) (*goquery.Document, error) {
return doc, nil
}

// requestToWebpage loads url through chromedp, waits until the "body" element
// is ready, reads the outer HTML of the "html" element, and parses it into a
// goquery document.
//
// timeout is interpreted as milliseconds and is installed as a deadline on the
// parent context of the chromedp context, so it bounds the whole
// navigate / wait / extract sequence run below.
// NOTE(review): the deadline presumably also covers browser start-up, since
// chromedp is given no separately allocated browser here — confirm.
//
// Returned errors wrap the underlying cause with %w, so callers can test for
// conditions such as context.DeadlineExceeded using errors.Is / errors.As.
func requestToWebpage(url string, timeout int) (*goquery.Document, error) {
	timeoutCtx, cancelTimeout := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
	defer cancelTimeout()

	// Derive the chromedp context from the deadline-bound context instead of
	// shadowing it, so both lifetimes stay visible by name.
	browserCtx, cancelBrowser := chromedp.NewContext(timeoutCtx)
	defer cancelBrowser()

	var htmlContent string

	err := chromedp.Run(browserCtx,
		chromedp.Navigate(url),
		chromedp.WaitReady("body"),
		chromedp.OuterHTML("html", &htmlContent),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to get HTML content: %w", err)
	}

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML from %s: %w", url, err)
	}

	return doc, nil
}

func getRemovedTagsHTML(doc *goquery.Document, input ScrapeWebpageInput) string {
if input.OnlyMainContent {
removeSelectors := []string{"header", "nav", "footer"}
Expand Down
Loading
Loading