-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #39 from malparty/feature/google-search-parsed
[#7] [Backend] As a User, I can query a single keyword and get its Google search results parsed
- Loading branch information
Showing
10 changed files
with
3,923 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -506,6 +506,7 @@ DEPENDENCIES | |
letter_opener | ||
listen (= 3.1.5) | ||
mini_magick | ||
nokogiri | ||
pagy | ||
pg | ||
pry-byebug | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# frozen_string_literal: true | ||
|
||
module Google
  # Parses the HTML of a search response and reports ad / non-ad link
  # statistics.
  #
  # During construction the parsed document is annotated in place: every link
  # matched by ADWORDS_LINK_SELECTOR receives ADWORDS_CLASS and every link
  # matched by FOOTER_LINK_SELECTOR receives FOOTER_LINK_CLASS. All later
  # queries rely on those two marker classes.
  class ParserService
    # Id of the container whose AdWords links are reported as "top" ads.
    AD_CONTAINER_ID = 'tads'
    # Marker class added to every AdWords link.
    ADWORDS_CLASS = 'adwords'
    # Marker class added to every footer link.
    FOOTER_LINK_CLASS = 'footer-links'

    # Links that get tagged with ADWORDS_CLASS.
    ADWORDS_LINK_SELECTOR = 'div[data-text-ad] a[data-ved]'
    # Links that get tagged with FOOTER_LINK_CLASS.
    FOOTER_LINK_SELECTOR = '#footcnt a'

    # Result links that are neither tagged as ads nor as footer links.
    NON_ADS_RESULT_SELECTOR =
      "a[data-ved]:not([role]):not([jsaction]):not(.#{ADWORDS_CLASS}):not(.#{FOOTER_LINK_CLASS})".freeze
    # Tagged ad links located inside the top ad container.
    TOP_ADS_SELECTOR = "##{AD_CONTAINER_ID} .#{ADWORDS_CLASS}".freeze
    # Tagged ad links anywhere on the page.
    PAGE_ADS_SELECTOR = ".#{ADWORDS_CLASS}".freeze

    # @param html_response [#body] response object whose `body` holds the page HTML
    # @raise [ArgumentError] when `html_response.body` is blank
    def initialize(html_response:)
      body = html_response.body
      raise ArgumentError, 'response.body cannot be blank' if body.blank?

      @html = html_response

      # Parse the body that was just validated instead of relying on the
      # response object converting itself to a String.
      @document = Nokogiri::HTML.parse(body)

      # Tag AdWords and footer links so the selectors above can tell them apart.
      document.css(ADWORDS_LINK_SELECTOR).add_class(ADWORDS_CLASS)
      document.css(FOOTER_LINK_SELECTOR).add_class(FOOTER_LINK_CLASS)
    end

    # Parse html data and return a hash with the results.
    #
    # @return [Hash] with keys :ads_top_count, :ads_page_count, :ads_top_url,
    #   :ads_page_url, :non_ads_result_count, :non_ads_url, :total_link_count
    #   and :html (the response object given to the constructor, unchanged)
    def call
      {
        ads_top_count: ads_top_count,
        ads_page_count: ads_page_count,
        ads_top_url: ads_top_url,
        ads_page_url: ads_page_url,
        non_ads_result_count: non_ads_result_count,
        non_ads_url: non_ads_url,
        total_link_count: total_link_count,
        html: html
      }
    end

    private

    attr_reader :html, :document

    def ads_top_count
      top_ad_links.count
    end

    def ads_page_count
      page_ad_links.count
    end

    def ads_top_url
      hrefs(top_ad_links)
    end

    def ads_page_url
      hrefs(page_ad_links)
    end

    def non_ads_result_count
      non_ad_links.count
    end

    def non_ads_url
      hrefs(non_ad_links)
    end

    def total_link_count
      document.css('a').count
    end

    # The node sets below are memoized: the document is only modified in the
    # constructor, so each selector needs to be evaluated at most once.

    def top_ad_links
      @top_ad_links ||= document.css(TOP_ADS_SELECTOR)
    end

    def page_ad_links
      @page_ad_links ||= document.css(PAGE_ADS_SELECTOR)
    end

    def non_ad_links
      @non_ad_links ||= document.css(NON_ADS_RESULT_SELECTOR)
    end

    # Collects the `href` attribute of every link in +links+ (nil when absent).
    def hrefs(links)
      links.map { |a_tag| a_tag['href'] }
    end
  end
end
File renamed without changes.
File renamed without changes.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# frozen_string_literal: true | ||
|
||
require 'rails_helper' | ||
|
||
RSpec.describe Google::ParserService, type: :service do
  describe '#call' do
    # Both definitions are lazy, so the search and the parsing still run inside
    # each example body, exactly where the inline calls used to be.
    # NOTE(review): assumes the `vcr:` metadata is applied around the example
    # body by the project's spec configuration — confirm in rails_helper.
    subject(:parsed) { described_class.new(html_response: search_response).call }

    let(:search_response) { Google::ClientService.new(keyword: keyword).call }

    context 'when parsing a page having 1 top ad' do
      let(:keyword) { 'squarespace' }

      it 'counts exactly 1 top ad', vcr: 'google_search/top_ads_1' do
        expect(parsed[:ads_top_count]).to eq(1)
      end
    end

    context 'when parsing a page having 3 top ads, 3 bottom ads and 14 non ad links' do
      let(:keyword) { 'vpn' }

      it 'counts exactly 3 top ads', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_top_count]).to eq(3)
      end

      it 'counts exactly 6 ads in total', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_page_count]).to eq(6)
      end

      it 'finds exactly the 3 top ads urls', vcr: 'google_search/top_ads_6' do
        expect(parsed[:ads_top_url]).to contain_exactly(
          'https://cloud.google.com/free',
          'https://www.expressvpn.com/',
          'https://www.top10vpn.com/best-vpn-for-vietnam/'
        )
      end

      it 'counts exactly 14 non ad results', vcr: 'google_search/top_ads_6' do
        expect(parsed[:non_ads_result_count]).to eq(14)
      end

      it 'gets 14 results', vcr: 'google_search/top_ads_6' do
        expect(parsed[:non_ads_url].count).to eq(14)
      end

      it 'gets exactly 113 links', vcr: 'google_search/top_ads_6' do
        # Counted from cassette html raw code
        expect(parsed[:total_link_count]).to eq(113)
      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters