Skip to content

Commit

Permalink
Snap cleanup & chromedp parameters initialization
Browse files Browse the repository at this point in the history
  • Loading branch information
Law Zava committed Dec 27, 2019
1 parent 7561e23 commit 033bdff
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ compress:
cd ./bin && find . -name 'scrape*' | xargs -I{} tar czf {}.tar.gz {}

snap-clean:
rm -f scrape_*_amd64.snap
rm -f scrape_*_amd64.snap*
snapcraft clean

snap-build:
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Sample call:

`scrape -w https://lawzava.com`

Depends on `chromium` or `google-chrome` being available in path if `--js-wait` is used
Depends on `chromium` or `google-chrome` being available in path if `--js` is used

#### Parameters:
```
Expand Down
17 changes: 14 additions & 3 deletions scraper/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func (s *Scraper) Scrape(scrapedEmails *[]string) error {

if s.JSWait {
c.OnResponse(func(response *colly.Response) {
if err := initiateChromeSession(response); err != nil {
if err := initiateScrapingFromChrome(response); err != nil {
s.Log(err)
return
}
Expand Down Expand Up @@ -100,8 +100,19 @@ func trimProtocol(requestURL string) string {
return strings.TrimPrefix(strings.TrimPrefix(requestURL, "http://"), "https://")
}

func initiateChromeSession(response *colly.Response) error {
ctx, cancel := chromedp.NewContext(context.Background())
func initiateScrapingFromChrome(response *colly.Response) error {
opts := []chromedp.ExecAllocatorOption{
chromedp.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3830.0 Safari/537.36"), // nolint
chromedp.WindowSize(1920, 1080),
chromedp.NoFirstRun,
chromedp.Headless,
chromedp.DisableGPU,
}

ctx, cancel := chromedp.NewExecAllocator(context.Background(), opts...)
defer cancel()

ctx, cancel = chromedp.NewContext(ctx)
defer cancel()

var res string
Expand Down
6 changes: 2 additions & 4 deletions snapcraft.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: scrape
version: '1.3.1'
version: '1.3.2'
summary: CLI utility to scrape emails from websites
description: |
CLI utility that scrapes emails from specified website recursively and concurrently
Expand All @@ -12,7 +12,7 @@ description: |
--emails Scrape emails (default true)
--follow-external Follow external 3rd party links within website
-h, --help help for scrape
--js Enables JS execution await
      --js                Enables JS execution await (not supported through snap yet)
--logs Print debug logs
--recursively Scrape website recursively (default true)
-w, --website string Website to scrape (default "https://lawzava.com")
Expand All @@ -32,8 +32,6 @@ parts:
export GO111MODULE=on
export CGO_ENABLED=0
go build -o ../../../prime/bin/scrape
stage-snaps:
- chromium
apps:
scrape:
command: bin/scrape
Expand Down

0 comments on commit 033bdff

Please sign in to comment.