Skip to content

Commit

Permalink
add more websites and extend unwanted scraping list
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelKatsoulis committed Apr 27, 2018
1 parent d2dcec6 commit 300cfe2
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 5 deletions.
37 changes: 35 additions & 2 deletions scraper_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,23 @@
"https://www.npr.org/sections/business/",
"https://www.nbcnews.com/business",
"https://www.cnbc.com/technology/",
"https://www.cnbc.com/finance/"
"https://www.cnbc.com/finance/",
"https://www.cnbc.com/investing/",
"https://www.cnbc.com/markets/",
"https://www.cnbc.com/autos/",
"https://news.sky.com/business",
"https://www.fin24.com/Companies/",
"https://www.wsj.com/news/technology",
"https://www.wsj.com/news/markets",
"https://www.ft.com/companies",
"https://www.marketwatch.com/",
"https://www.usatoday.com/money/markets/",
"http://www.londonstockexchange.com/exchange/news/alliance-news/archive.html?nameCode=&type=collapsed&tagCode=ALLCOS",
"https://www.forbes.com/business/#19a2399c535f",
"https://www.forbes.com/technology/#10c33c0b4bad",
"https://www.fnlondon.com/",
"http://money.cnn.com/news/",
"http://www.morningstar.co.uk/uk/equities/default.aspx"
]

website_list = [
Expand Down Expand Up @@ -90,5 +106,22 @@
"theguardian.com",
"npr.org",
"nbcnews.com",
"cnbc.com"
"cnbc.com",
"cnbc.com",
"cnbc.com",
"cnbc.com",
"cnbc.com",
"news.sky.com",
"fin24.com",
"wsj.com",
"wsj.com",
"ft.com",
"marketwatch.com",
"usatoday.com",
"londonstockexchange.com",
"forbes.com",
"forbes.com",
"fnlondon.com",
"cnn.com",
"morningstar.co.uk"
]
3 changes: 2 additions & 1 deletion scrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ def rchop(thestring, ending):

def skip_unwanted(h_link):
unwanted_list = ["://itunes.apple.com/", "//www.facebook.com/",
"//facebook.com/", "//apps.microsoft.com"]
"//facebook.com/", "//apps.microsoft.com", "yahoo.com",
"//yahoofinance.tumblr.com", "//play.google.com"]
for item in unwanted_list:
if item in h_link:
return True
Expand Down
9 changes: 7 additions & 2 deletions websites.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
all_websites = ['nytimes.com', 'bbc.co.uk', 'cnn.com', 'huffingtonpost.com', 'theguardian.com',
all_websites = ['nytimes.com', 'bbc.com', 'cnn.com', 'huffingtonpost.com', 'theguardian.com',
'forbes.com', 'wsj.com', 'washingtonpost.com', 'telegraph.co.uk', 'reuters.com',
'bloomberg.com', 'dailymail.co.uk']
'bloomberg.com', 'dailymail.co.uk', "businessinsider.com", 'abcnews.go.com',
"nypost.com", "chicagotribune.com", "foxbusiness.com", 'finance.yahoo.com',
"nbcnews.com", "newser.com", "newsweek.com", "usatoday.com", "npr.org",
"cnbc.com", "news.sky.com", "fin24.com", "ft.com", "marketwatch.com",
"londonstockexchange.com", "fnlondon.com", "morningstar.co.uk"
]

0 comments on commit 300cfe2

Please sign in to comment.