Skip to content
This repository has been archived by the owner on Jan 9, 2025. It is now read-only.

Commit

Permalink
feat(web): improve web operator (#292)
Browse files Browse the repository at this point in the history
Because

- some page content cannot be retrieved with a plain HTTP request
- tag manipulation is available in scrape but not in crawl

This commit

- adds a wait-for step so that AJAX-loaded content can be retrieved
- adds tag manipulation to crawl so that cleaner data is passed to the LLM
  • Loading branch information
chuang8511 authored Aug 22, 2024
1 parent 44ea196 commit 1da84af
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 37 deletions.
8 changes: 7 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/JohannesKaufmann/html-to-markdown v1.5.0
github.com/PuerkitoBio/goquery v1.9.1
github.com/belong-inc/go-hubspot v0.9.0
github.com/chromedp/chromedp v0.10.0
github.com/cohere-ai/cohere-go/v2 v2.8.5
github.com/denisenkom/go-mssqldb v0.12.3
github.com/emersion/go-imap/v2 v2.0.0-beta.3
Expand Down Expand Up @@ -75,6 +76,8 @@ require (
github.com/PuerkitoBio/purell v1.1.1 // indirect
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 // indirect
github.com/chromedp/sysutil v1.0.0 // indirect
github.com/go-openapi/analysis v0.21.2 // indirect
github.com/go-openapi/errors v0.22.0 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
Expand All @@ -83,6 +86,9 @@ require (
github.com/go-openapi/spec v0.20.4 // indirect
github.com/go-openapi/swag v0.22.4 // indirect
github.com/go-openapi/validate v0.21.0 // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
github.com/gobwas/ws v1.4.0 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
Expand Down Expand Up @@ -187,7 +193,7 @@ require (
golang.org/x/mod v0.17.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/sync v0.7.0 // indirect
golang.org/x/sys v0.21.0 // indirect
golang.org/x/sys v0.22.0 // indirect
golang.org/x/time v0.5.0 // indirect
golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect
golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect
Expand Down
21 changes: 19 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/cohere-ai/cohere-go/v2 v2.8.5 h1:AwBnhMUg71Ewqxqb0P+WLs+luiQveRkMH2cjwM3b0AY=
Expand Down Expand Up @@ -202,6 +208,12 @@ github.com/gobuffalo/packr/v2 v2.2.0/go.mod h1:CaAwI0GPIAv+5wKLtv8Afwl+Cm78K/I/V
github.com/gobuffalo/syncx v0.0.0-20190224160051-33c29581e754/go.mod h1:HhnNqWY95UYwwW3uSASeV7vtgYkT2t16hJgV3AEPUpw=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/gocolly/colly v1.2.0/go.mod h1:Hof5T3ZswNVsOHYmba1u03W65HDWgpV5HifSuueE0EA=
Expand Down Expand Up @@ -339,6 +351,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/lestrrat-go/jspointer v0.0.0-20181205001929-82fadba7561c h1:pGh5EFIfczeDHwgMHgfwjhZzL+8/E3uZF6T7vER/W8c=
github.com/lestrrat-go/jspointer v0.0.0-20181205001929-82fadba7561c/go.mod h1:xw2Gm4Mg+ST9s8fHR1VkUIyOJMJnSloRZlPQB+wyVpY=
github.com/lestrrat-go/jsref v0.0.0-20211028120858-c0bcbb5abf20 h1:E1vlSQTLj+2EK0mrFGDc57u3QN7RybAUiGEOUAlYRd0=
Expand Down Expand Up @@ -391,6 +405,8 @@ github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn
github.com/olekukonko/tablewriter v0.0.0-20180506121414-d4647c9c7a84/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo=
github.com/olekukonko/tablewriter v0.0.4 h1:vHD/YYe1Wolo78koG299f7V/VAS08c6IpCLn+Ejf/w8=
github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE=
github.com/otiai10/gosseract/v2 v2.2.4 h1:h/PV+oJqke8q2Ccw9bjpMBWfd7N2vtGDCUcihZj3nRo=
github.com/otiai10/gosseract/v2 v2.2.4/go.mod h1:ahOp/kHojnOMGv1RaUnR0jwY5JVa6BYKhYAS8nbMLSo=
Expand Down Expand Up @@ -650,13 +666,14 @@ golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.4.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ=
Expand Down
7 changes: 6 additions & 1 deletion operator/web/v0/README.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The component configuration is defined and maintained [here](https://github.com/

### Crawl Website

Scrape the website contents.
Crawl the website contents and manipulate the HTML with jQuery-style commands. The commands are executed in the following order: only-main-content, remove-tags, and only-include-tags.


| Input | ID | Type | Description |
Expand All @@ -43,6 +43,10 @@ Scrape the website contents.
| Max Number of Pages (required) | `max-k` | integer | The max number of pages to return. If the number is set to 0, all pages will be returned. If the number is set to a positive integer, at most max k pages will be returned. |
| Include Link Text | `include-link-text` | boolean | Indicate whether to scrape the link and include the text of the link associated with this page in the 'link-text' field |
| Include Link HTML | `include-link-html` | boolean | Indicate whether to scrape the link and include the raw HTML of the link associated with this page in the 'link-html' field |
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. |



Expand Down Expand Up @@ -89,6 +93,7 @@ Scrape the webpage contents and manipulate html with jquery command. The sequenc
| Only Main Content | `only-main-content` | boolean | Only return the main content of the page excluding header, nav, footer. |
| Remove Tags | `remove-tags` | array[string] | A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer' |
| Only Include Tags | `only-include-tags` | array[string] | A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer' |
| Timeout | `timeout` | integer | The time to wait for the page to load in milliseconds. Min 0, Max 60000. |



Expand Down
79 changes: 78 additions & 1 deletion operator/web/v0/config/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
}
},
"TASK_CRAWL_WEBSITE": {
"instillShortDescription": "Scrape the website contents.",
"instillShortDescription": "Crawl the website contents and manipulate the HTML with jQuery-style commands. The commands are executed in the following order: only-main-content, remove-tags, and only-include-tags.",
"input": {
"instillUIOrder": 0,
"properties": {
Expand Down Expand Up @@ -120,6 +120,67 @@
],
"title": "Query",
"type": "string"
},
"only-main-content": {
"description": "Only return the main content of the page excluding header, nav, footer.",
"instillAcceptFormats": [
"boolean"
],
"instillUIOrder": 5,
"instillUpstreamTypes": [
"value",
"reference"
],
"title": "Only Main Content",
"type": "boolean"
},
"remove-tags": {
"description": "A list of tags, classes, and ids to remove from the output. If empty, no tags will be removed. Example: 'script, .ad, #footer'",
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 6,
"instillUpstreamTypes": [
"value",
"reference"
],
"items": {
"type": "string"
},
"title": "Remove Tags",
"type": "array"
},
"only-include-tags": {
"description": "A list of tags, classes, and ids to include in the output. If empty, all tags will be included. Example: 'script, .ad, #footer'",
"instillAcceptFormats": [
"array:string"
],
"instillUIOrder": 7,
"instillUpstreamTypes": [
"value",
"reference"
],
"items": {
"type": "string"
},
"title": "Only Include Tags",
"type": "array"
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.",
"instillAcceptFormats": [
"integer"
],
"instillUIOrder": 8,
"instillUpstreamTypes": [
"value",
"reference"
],
"minimum": 0,
"maximum": 60000,
"title": "Timeout",
"type": "integer"
}
},
"required": [
Expand Down Expand Up @@ -304,6 +365,22 @@
},
"title": "Only Include Tags",
"type": "array"
},
"timeout": {
"default": 1000,
"description": "The time to wait for the page to load in milliseconds. Min 0, Max 60000.",
"instillAcceptFormats": [
"integer"
],
"instillUIOrder": 5,
"instillUpstreamTypes": [
"value",
"reference"
],
"minimum": 0,
"maximum": 60000,
"title": "Timeout",
"type": "integer"
}
},
"required": [
Expand Down
8 changes: 4 additions & 4 deletions operator/web/v0/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ type component struct {

type execution struct {
base.ComponentExecution
execute func(*structpb.Struct) (*structpb.Struct, error)
externalCaller func(url string) (ioCloser io.ReadCloser, err error)
request func(url string) (*goquery.Document, error)
execute func(*structpb.Struct) (*structpb.Struct, error)
externalCaller func(url string) (ioCloser io.ReadCloser, err error)
getDocAfterRequestURL func(url string, timeout int) (*goquery.Document, error)
}

func Init(bc base.Component) *component {
Expand All @@ -65,7 +65,7 @@ func (c *component) CreateExecution(x base.ComponentExecution) (base.IExecution,
e.externalCaller = scrapSitemapCaller
e.execute = e.ScrapeSitemap
case taskScrapeWebpage:
e.request = httpRequest
e.getDocAfterRequestURL = getDocAfterRequestURL
e.execute = e.ScrapeWebpage
default:
return nil, fmt.Errorf(x.Task + " task is not supported.")
Expand Down
6 changes: 3 additions & 3 deletions operator/web/v0/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ func TestScrapeWebpage(t *testing.T) {
c.Run("ScrapeWebpage", func(c *quicktest.C) {
component := Init(base.Component{})
e := &execution{
ComponentExecution: base.ComponentExecution{Component: component, SystemVariables: nil, Setup: nil, Task: taskScrapeWebpage},
request: fakeHTTPRequest,
ComponentExecution: base.ComponentExecution{Component: component, SystemVariables: nil, Setup: nil, Task: taskScrapeWebpage},
getDocAfterRequestURL: fakeHTTPRequest,
}

e.execute = e.ScrapeWebpage
Expand All @@ -95,7 +95,7 @@ func TestScrapeWebpage(t *testing.T) {
})
}

func fakeHTTPRequest(url string) (*goquery.Document, error) {
func fakeHTTPRequest(url string, timeout int) (*goquery.Document, error) {
html := `
<!DOCTYPE html>
<html>
Expand Down
46 changes: 45 additions & 1 deletion operator/web/v0/scrape_webpage.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package web

import (
"context"
"fmt"
"log"
"net/http"
"strings"
"time"

"github.com/PuerkitoBio/goquery"
"github.com/chromedp/chromedp"
"github.com/instill-ai/component/base"
"github.com/instill-ai/component/internal/util"
"github.com/k3a/html2text"
Expand All @@ -18,6 +22,7 @@ type ScrapeWebpageInput struct {
OnlyMainContent bool `json:"only-main-content"`
RemoveTags []string `json:"remove-tags,omitempty"`
OnlyIncludeTags []string `json:"only-include-tags,omitempty"`
Timeout int `json:"timeout,omitempty"`
}

type ScrapeWebpageOutput struct {
Expand Down Expand Up @@ -46,7 +51,7 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err

output := ScrapeWebpageOutput{}

doc, err := e.request(inputStruct.URL)
doc, err := e.getDocAfterRequestURL(inputStruct.URL, inputStruct.Timeout)

if err != nil {
return nil, fmt.Errorf("error getting HTML page doc: %v", err)
Expand All @@ -64,6 +69,16 @@ func (e *execution) ScrapeWebpage(input *structpb.Struct) (*structpb.Struct, err

}

// getDocAfterRequestURL fetches url and returns the page parsed as a goquery
// document.
//
// A positive timeout routes the request through requestToWebpage, passing the
// timeout along; any other value falls back to httpRequest, which takes no
// timeout argument.
func getDocAfterRequestURL(url string, timeout int) (*goquery.Document, error) {
	// Guard clause: without a positive timeout, use the plain request path.
	if timeout <= 0 {
		return httpRequest(url)
	}

	return requestToWebpage(url, timeout)
}

func httpRequest(url string) (*goquery.Document, error) {
client := &http.Client{}
res, err := client.Get(url)
Expand All @@ -80,6 +95,35 @@ func httpRequest(url string) (*goquery.Document, error) {
return doc, nil
}

// requestToWebpage loads url through chromedp and returns the resulting page
// parsed as a goquery document.
//
// timeout is expressed in milliseconds and is applied as a deadline on the
// context that the chromedp actions run under, so it bounds navigation,
// waiting for the body element and reading the HTML together.
//
// Errors are wrapped with %w so callers can inspect the underlying cause,
// e.g. errors.Is(err, context.DeadlineExceeded) when the deadline expires.
func requestToWebpage(url string, timeout int) (*goquery.Document, error) {
	// The deadline context is the parent of the chromedp context, so its
	// expiry cancels the chromedp run.
	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(timeout)*time.Millisecond)
	defer cancel()

	ctx, cancelBrowser := chromedp.NewContext(ctx)
	defer cancelBrowser()

	var htmlContent string

	err := chromedp.Run(ctx,
		chromedp.Navigate(url),
		// Wait for the body element before capturing the markup.
		chromedp.WaitReady("body"),
		chromedp.OuterHTML("html", &htmlContent),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to get HTML content: %w", err)
	}

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML from %s: %w", url, err)
	}

	return doc, nil
}

func getRemovedTagsHTML(doc *goquery.Document, input ScrapeWebpageInput) string {
if input.OnlyMainContent {
removeSelectors := []string{"header", "nav", "footer"}
Expand Down
Loading

0 comments on commit 1da84af

Please sign in to comment.