-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathonehash.py
183 lines (155 loc) · 9.47 KB
/
onehash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Scrapy spider for scraping dynamically generated esport betting information from web site *www.onehash.com*
This script was created for educational purposes to demonstrate how to scrape data from dynamically
generated web page using *Scrapy* and *Selenium webdriver*. Also, here you can find examples of
scrapy "Item" and "ItemLoader", as well as "How to use xpath selectors in scrapy" and "How to scroll
dynamically loading web page block with Selenium"
This script was written in Python 3.6(for scrapy 1.5), and before running it, you'll need to install:
- Scrapy (on Windows machine you'll need appropriate C++ SDK to run Twisted - check their docs);
- Selenium (with geckodriver for Windows machines);
- Firefox browser.
After installing all requirements - create Scrapy project and put this script into "spiders" folder.
"onehash.py" spider scrape information about esport events that not yet been played(or in progress).
What kind of data this script will scrape shows below(names in ' ' also are keys for Item container):
- 'date' - date of the single event/game in timedate format converted to UTC time(or tried to);
- 'game' - name of the game(csgo, overwatch, dota2 etc);
- 'player1' - name of the first participant(or team name, like: "Misfits" or "SK Gaming" etc);
- 'player2' - name of the second participant;
- 'odds1' - bet rate on the first player(float value, like: 1.345);
- 'odds2' - bet rate on the second player(float value).
Now, for convenient purpose this script needs that :var:TIME_DIFFERENCE been set inside script to your own value.
- TIME_DIFFERENCE - represent a difference between time for event, that shows on website
and UTC time(check in script comments). You can set it to "0", and then all dates for events,
which starts 24+ hours from current time will be saved in site_time format(not UTC).
Also, if script working to slowly or scraped information not full - you can try to adjust :param:sleep_time
and :param:loop_timer inside of method "parse" of OneHash class.
To run a spider - change your location in terminal to scrapy project folder and type: scrapy crawl onehash
To save data to .json file(for example), type: scrapy crawl onehash -o yourfile.json
"""
import time
import datetime
import scrapy
from scrapy import Selector
from scrapy.loader.processors import MapCompose, Join
from scrapy.loader import ItemLoader
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# script author - Alexander Shums'kii
# https://github.com/Austerius
# put your time difference between time, that shows on site and current_utc time(use sight).
# Tip: use others esport betting site for reference (EGB.com shows events date in GMT +0). For example:
# If onehash show event start in 20min from now(your time 12:40), and on EGB this event will start at 11:00
# TIME_DIFFERENCE = (12:40 + 00:20) - 11:00 = +2 (yeh, it's work in most cases)
TIME_DIFFERENCE = +2
def get_name(name_string):
"""
Function for getting name of the game from class name string.
Incoming string looks like: "icon-oh cat-overwatch"
"""
name_string = name_string.split(" ")[1]
name_string = name_string.split("-")[1]
return name_string
def get_odd(odd_string):
"""Function for converting odds string to a correct float value."""
odd_string = odd_string[1:] # firs element is char 'x' - getting rid of it
return float(odd_string)
class Event(scrapy.Item):
"""
Class-container for scraped data.
Usually you put this class in separate file(but for the convenience of "one-file-example" I put it here).
"""
game = scrapy.Field(input_processor=MapCompose(get_name))
player1 = scrapy.Field()
player2 = scrapy.Field()
# So, our odds placed in several span selectors(one char of odd per 'span')
# That's why we using 'Join' processor fist, to get all chars and form a string from em(with no separator)
odds1 = scrapy.Field(input_processor=Join(''), output_processor=MapCompose(get_odd))
odds2 = scrapy.Field(input_processor=Join(''), output_processor=MapCompose(get_odd))
date = scrapy.Field()
class OneHash(scrapy.Spider):
name = "onehash"
allowed_domains = ['www.onehash.com']
start_urls = ["https://www.onehash.com/category/e_sport/"]
def __init__(self):
self.driver = webdriver.Firefox()
super().__init__()
def __del__(self):
self.driver.close()
def parse(self, response):
""" parsing page.
Edit *sleep_time* and *loop_timer* before running.
Elso, don't forgot about :var:TIME_DIFFERENCE variable.
"""
self.driver.get(response.url)
self.driver.maximize_window()
sleep_time = 3 # sleep time for completely load web page(increase/decrease if needed)
loop_timer = 10 # Timer for scrolling down to the bottom of the content(edit for your own value if needed)
time.sleep(sleep_time) # loading initial page
parsed_data = Event() # container for parsed data
# Scrolling to the end of results/bets page
event_box = self.driver.find_element_by_xpath('//body//main')
try:
event_box.click() # needed emulate click on the element to proceed
except:
pass
timer = time.time() + loop_timer
while True:
ActionChains(self.driver).move_to_element(event_box).send_keys(Keys.END).perform()
if time.time() > timer:
break
# getting outer html:
source = Selector(text=self.driver.page_source)
# here we can find all juicy information about single event
events = source.xpath('//div[@class="eventbox-container"]')
# processing/parsing game information here
for event in events:
il = ItemLoader(selector=event, item=parsed_data)
# time-date block(we will count our time in UTC and transform site time to UTC)
current_time_utc = datetime.datetime.utcnow()
# getting class name of first div block in "time-wrapper" block
time_block = event.xpath('.//div[@class="time-wrapper"]/div/@class').extract_first()
# site has 2 different time formats: actual datetime and timer to event start(like: 2h 43m)
if time_block == 'time-left': # if we get a timer
time_string = event.xpath('.//div[@class="time-wrapper"]/div[@class="time-left"]/span[2]/text()').extract_first()
try:
# time_string format: "2h 32m" or "34m 39s" or "12s"
first, second = time_string.split(' ') # getting minutes(seconds) and hours _strings
hours = 0
minutes = 0
seconds = 0
if first[-1] == 'h': # if we still have hour or few to the start of the event
hours = int(first[:-1]) # getting rid of letter(h or m) at the end of string
minutes = int(second[:-1])
elif first[-1] == 'm':
minutes = int(first[:-1])
seconds = int(second[:-1])
delta_time = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
except ValueError:
delta_time = current_time_utc # seconds left until event start
# so, to get actual date in utc we'll need to add delta_time to current utc time
game_date_utc = current_time_utc + delta_time
elif time_block == 'datetime': # ok, now actual fun: (( not really ((
date_string = event.xpath('.//div[@class="time-wrapper"]/div[@class="datetime"]/span[@class="date"]/text()').extract_first()
time_string = event.xpath('.//div[@class="time-wrapper"]/div[@class="datetime"]/span[@class="time"]/text()').extract_first()
day, month, year = date_string.split('/')
hour, minute = time_string.split(":")
# now, it's most easiest and maybe worst method: you actually need to go to web site and figure out
# a difference between current utc and site time(for me it was +2h - check TIME_DIFFERENCE variable)
site_date = datetime.datetime(year=int(year), month=int(month), day=int(day),
hour=int(hour), minute=int(minute))
delta_time = datetime.timedelta(minutes=TIME_DIFFERENCE*60)
game_date_utc = site_date - delta_time
# we don't want to scrap events that already ended(or in progress):
if current_time_utc > game_date_utc:
continue
# don't miss a '.' at the beginning of the selector, so we will parse only current event block
il.add_xpath('game', './/div[@class="item"]/i/@class')
il.add_value('date', game_date_utc)
il.add_xpath('player1', './/section[@class="team"][1]//div[@class="eventbox-title text-ellipsis"]/text()')
il.add_xpath('player2', './/section[@class="team"][2]//div[@class="eventbox-title text-ellipsis"]/text()')
# this selector will get text field from multiple 'span', than, we will use Join processor(check our Item)
il.add_xpath('odds1', './/section[@class="team"][1]//div[@class="multi-number-roller"]//span/text()')
il.add_xpath('odds2', './/section[@class="team"][2]//div[@class="multi-number-roller"]//span/text()')
il.load_item()
yield parsed_data