Skip to content

Commit

Permalink
add future scrapy
Browse files Browse the repository at this point in the history
  • Loading branch information
Yangguang authored and Yangguang committed Jul 9, 2018
1 parent 57ef708 commit 6b292ee
Show file tree
Hide file tree
Showing 6 changed files with 295 additions and 1 deletion.
64 changes: 64 additions & 0 deletions fooltrader/spiders/future/future_cffex_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-

import os
from datetime import datetime

import scrapy
from scrapy import Request
from scrapy import signals
import pandas as pd

from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCffexSpider(scrapy.Spider):
    """Download per-day CFFEX futures CSV files into the exchange cache.

    The optional spider argument ``dataType`` selects what is requested:

    * unset or ``'dayk'`` -- one daily quote CSV per weekday;
    * ``'inventory'`` -- one CSV per weekday and per product code.

    Any other value yields no requests.  Files that already exist at the
    target cache path are not requested again.
    """

    name = "future_cffex_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,

    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.trading_dates = None

    @staticmethod
    def _weekdays():
        """Return every Monday-Friday date from 2006-06-30 up to today."""
        days = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
        return days[days.dayofweek < 5]

    def start_requests(self):
        """Yield one download request per missing cache file."""
        # ``dataType`` only exists as an attribute when it was passed on the
        # command line (``-a dataType=...``); default to None instead of
        # raising AttributeError.
        data_type = getattr(self, 'dataType', None)

        if data_type is None or data_type == 'dayk':
            for day in self._weekdays():
                the_path = get_exchange_cache_path(
                    security_type='future', exchange='cffex',
                    data_type='day_kdata',
                    the_date=to_timestamp(day)) + ".csv"
                if not os.path.exists(the_path):
                    yield Request(
                        url="http://www.cffex.com.cn/sj/hqsj/rtj/"
                            + day.strftime("%Y%m/%d/%Y%m%d") + "_1.csv",
                        callback=self.download_cffex_history_data_file,
                        meta={'filename': the_path})
        elif data_type == 'inventory':
            products = ['IF', 'IC', 'IH', 'T', 'TF']
            for day in self._weekdays():
                for product in products:
                    the_path = get_exchange_cache_path(
                        security_type='future', exchange='cffex',
                        data_type='inventory',
                        the_date=to_timestamp(day)) + product + ".csv"
                    if not os.path.exists(the_path):
                        yield Request(
                            url="http://www.cffex.com.cn/sj/ccpm/"
                                + day.strftime("%Y%m/%d/") + product + "_1.csv",
                            callback=self.download_cffex_history_data_file,
                            meta={'filename': the_path})

    def download_cffex_history_data_file(self, response):
        """Write the response body to ``response.meta['filename']``.

        The body is only saved when the Content-Type header is exactly
        ``application/zip`` or ``text/csv``; anything else (including a
        missing header) is logged as an error and nothing is written.
        """
        the_path = response.meta['filename']
        content_type_header = response.headers.get('content-type', None)
        # The header value is bytes, or None when the server sent no
        # Content-Type; guard before decoding.
        content_type = (content_type_header.decode("utf-8")
                        if content_type_header is not None else None)

        if content_type in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                "get cffex year data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))
84 changes: 84 additions & 0 deletions fooltrader/spiders/future/future_czce_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

import os
from datetime import datetime
import pandas as pd

import scrapy
from scrapy import Request
from scrapy import signals

from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureCzceSpider(scrapy.Spider):
    """Download CZCE futures data files into the exchange cache.

    The optional spider argument ``dataType`` selects what is requested:

    * unset -- the daily quote ``.xls`` of every weekday of the current year;
    * ``'historyk'`` -- every zip archive linked from the history index page;
    * ``'inventory'`` -- the daily holding ``.xls`` of every weekday in the
      last 450 weeks.

    Any other value yields no requests.  Daily files that already exist at
    the target cache path are not requested again.
    """

    name = "future_czce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,

    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.trading_dates = None

    def start_requests(self):
        """Yield the requests matching the ``dataType`` spider argument."""
        # ``dataType`` only exists as an attribute when it was passed on the
        # command line (``-a dataType=...``); default to None instead of
        # raising AttributeError.
        data_type = getattr(self, 'dataType', None)

        if data_type is None:
            today = pd.Timestamp.today()
            # Start at January 1st of the current year.
            first_day = today.date() - pd.Timedelta(days=today.dayofyear - 1)
            for date in pd.date_range(start=first_day, end=today):
                the_dir = get_exchange_cache_path(
                    security_type='future', exchange='czce',
                    the_date=to_timestamp(date),
                    data_type='day_kdata') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(
                        url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/"
                            + date.strftime("%Y/%Y%m%d") + "/FutureDataDaily.xls",
                        callback=self.download_czce_kline_data,
                        meta={'filename': the_dir})
        elif data_type == 'historyk':
            yield Request(
                url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
                callback=self.download_czce_history_data)
        elif data_type == 'inventory':
            today = pd.Timestamp.today()
            first_day = today.date() - pd.Timedelta(weeks=450)
            for date in pd.date_range(start=first_day, end=today):
                the_dir = get_exchange_cache_path(
                    security_type='future', exchange='czce',
                    the_date=to_timestamp(date),
                    data_type='inventory') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(
                        url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/"
                            + date.strftime("%Y/%Y%m%d") + "/FutureDataHolding.xls",
                        callback=self.download_czce_kline_data,
                        meta={'filename': the_dir})

    def _save_response(self, response, accepted_types, error_prefix):
        """Save the body to ``response.meta['filename']`` if the type matches.

        ``accepted_types`` is a tuple of exact Content-Type values.  On a
        mismatch, or when the header is missing, an error starting with
        ``error_prefix`` is logged and nothing is written.
        """
        the_path = response.meta['filename']
        content_type_header = response.headers.get('content-type', None)
        # The header value is bytes, or None when the server sent no
        # Content-Type; guard before decoding.
        content_type = (content_type_header.decode("utf-8")
                        if content_type_header is not None else None)

        if content_type in accepted_types:
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                (error_prefix + ":the_path={} url={} content type={} ").format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_czce_kline_data(self, response):
        """Save one daily quote/holding file if its content type is accepted."""
        self._save_response(
            response,
            ('application/zip', 'text/csv',
             'application/x-zip-compressed', 'application/excel'),
            "get czce year data failed")

    def download_czce_history_data(self, response):
        """Request every zip archive linked from the history index page.

        The local file name is the last path segment of the link, prefixed
        with the parent segment unless that segment is ``exchange``.
        """
        the_dir = get_exchange_cache_dir(security_type='future', exchange='czce')
        for filepath in response.xpath('//a[contains(@href,"zip")]').xpath('@href').extract():
            parts = filepath.split("/")
            # A link without any directory part has no parent segment.
            parent = parts[-2] if len(parts) >= 2 else ""
            prefix = "" if parent == "exchange" else parent
            yield Request(url="http://www.czce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, prefix + parts[-1])},
                          callback=self.download_czce_history_data_file)

    def download_czce_history_data_file(self, response):
        """Save one history archive if its content type is accepted."""
        # The message previously said "shfe", copied from another spider.
        self._save_response(
            response,
            ('application/zip', 'text/csv', 'application/x-zip-compressed'),
            "get czce year data failed")
120 changes: 120 additions & 0 deletions fooltrader/spiders/future/future_dce_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-

import os
from datetime import datetime
import pandas as pd

import scrapy
from scrapy import Request,FormRequest
from scrapy import signals

from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
from fooltrader.utils.utils import to_timestamp


class FutureDceSpider(scrapy.Spider):
    """Download DCE futures data files into the exchange cache.

    The optional spider argument ``dataType`` selects what is requested:

    * ``'historyk'`` -- every file referenced by the history index page;
    * ``'inventory'`` -- one batch export per weekday of the last 520 weeks;
    * unset or any other value -- the daily quote export of every weekday
      of the current year.

    Daily files that already exist at the target cache path are not
    requested again.
    """

    name = "future_dce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,

    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)

    def start_requests(self):
        """Return the list of initial requests for the selected ``dataType``."""
        # ``dataType`` only exists as an attribute when it was passed on the
        # command line (``-a dataType=...``); default to None instead of
        # raising AttributeError.
        data_type = getattr(self, 'dataType', None)

        if data_type == 'historyk':
            return self.request_history_kdata()
        if data_type == 'inventory':
            return self.request_inventory_data()
        # None and every unrecognised value fall back to current-year data.
        return self.request_currentyear_kdata()

    def request_inventory_data(self):
        """Build one batch-export form request per missing weekday file."""
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_dir = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date),
                data_type="day_inventory") + '.zip'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                    formdata={
                        'batchExportFlag': 'batch',
                        'contract.contract_id': 'all',
                        'contract.variety_id': 'a',
                        'year': str(date.year),
                        # The form is sent the month number minus one.
                        'month': str(date.month - 1),
                        'day': str(date.day),
                        'memberDealPosiQuotes.trade_type': '0',
                        'memberDealPosiQuotes.variety': 'all'
                    },
                    callback=self.download_dce_kline_data,
                    meta={
                        'filename': the_dir
                    }))
        return requests

    def request_currentyear_kdata(self):
        """Build one daily-quote form request per missing weekday file."""
        today = pd.Timestamp.today()
        requests = []
        # Start at January 1st of the current year.
        first_day = today.date() - pd.Timedelta(days=today.dayofyear - 1)
        for date in pd.date_range(start=first_day, end=today):
            the_dir = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date),
                data_type="day_kdata") + '.xls'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                    formdata={
                        'year': str(date.year),
                        # The form is sent the month number minus one.
                        'month': str(date.month - 1),
                        'day': str(date.day),
                        'dayQuotes.trade_type': '0',
                        'dayQuotes.variety': 'all',
                        'exportType': 'excel'
                    },
                    callback=self.download_dce_kline_data,
                    meta={
                        'filename': the_dir
                    }))
        return requests

    def request_history_kdata(self):
        """Return the single request for the history index page."""
        return [Request(url="http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html",
                        callback=self.download_dce_history_data)]

    def download_dce_history_data(self, response):
        """Request every file named by an ``<input rel=...>`` on the page."""
        the_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
        for filepath in response.css('input').xpath('@rel').extract():
            yield Request(url="http://www.dce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, filepath.split("/")[-1])},
                          callback=self.download_dce_history_data_file)

    def _save_response(self, response, accepted_types, error_prefix):
        """Save the body to ``response.meta['filename']`` if the type matches.

        ``accepted_types`` is a tuple of exact Content-Type values.  On a
        mismatch, or when the header is missing, an error starting with
        ``error_prefix`` is logged and nothing is written.
        """
        the_path = response.meta['filename']
        # Scrapy header lookup is case-insensitive, so one lookup covers
        # both 'content-type' and 'Content-Type'.
        content_type_header = response.headers.get('content-type', None)
        # The header value is bytes, or None when the server sent no
        # Content-Type; guard before decoding.
        content_type = (content_type_header.decode("utf-8")
                        if content_type_header is not None else None)

        if content_type in accepted_types:
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                (error_prefix + ":the_path={} url={} content type={} ").format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_dce_kline_data(self, response):
        """Save one exported daily file if its content type is accepted."""
        self._save_response(
            response,
            ('application/zip', 'text/csv',
             'application/octet-stream;charset=utf-8'),
            "get dce year kline data failed")

    def download_dce_history_data_file(self, response):
        """Save one history file if its content type is accepted."""
        # The message previously said "shfe", copied from another spider.
        self._save_response(
            response,
            ('application/zip', 'text/csv'),
            "get dce year data failed")
13 changes: 13 additions & 0 deletions fooltrader/spiders/future/future_shfe_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import os
from datetime import datetime
import pandas as pd

import scrapy
from scrapy import Request
Expand All @@ -27,6 +28,15 @@ def __init__(self, name=None, **kwargs):

def start_requests(self):
self.trading_dates = self.settings.get("trading_dates")
if self.dataType or self.dataType=='inventory':
today = pd.Timestamp.today()
for date in pd.date_range(start=today.date()-pd.Timedelta(weeks=520),end=today):
the_dir=get_exchange_cache_path(security_type='future',exchange='shfe',the_date=to_timestamp(date),data_type='inventory')+'.json'
if date.dayofweek<5 and not os.path.exists(the_dir):
yield Request(url=self.get_day_inventory_url(the_date=date.strftime('%Y%m%d')),
meta={'the_date': date,
'the_path': the_dir},
callback=self.download_shfe_data_by_date)

if self.trading_dates:
# 每天的数据
Expand Down Expand Up @@ -96,5 +106,8 @@ def get_year_k_data_url(self, the_year):
def get_day_kdata_url(self, the_date):
    """Return the SHFE daily ``kx`` data URL for ``the_date``."""
    url_template = 'http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat'
    return url_template.format(the_date)

def get_day_inventory_url(self, the_date):
    """Return the SHFE daily ``pm`` data URL for ``the_date``."""
    url_template = 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'
    return url_template.format(the_date)

def get_trading_date_url(self):
    """Return the fixed URL of the SHFE trading calendar file."""
    calendar_url = 'http://www.shfe.com.cn/bourseService/businessdata/calendar/20171201all.dat'
    return calendar_url
4 changes: 3 additions & 1 deletion fooltrader/spiders/stock_finance_report_event_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from scrapy import Request
from scrapy import Selector
from scrapy import signals
import traceback

from fooltrader.api import event
from fooltrader.api.quote import get_security_list
Expand Down Expand Up @@ -89,7 +90,7 @@ def download_fi_report_event_data(self, response):
if i == 0:
if not df.empty:
latest = pd.Timestamp(report_event_dates[0]).date()
if df.index.contains(latest) and (df.loc[latest, 'title'] == title):
if df.index.contains(latest) and ((type(df.loc[latest,'title'])==str and df.loc[latest,'title']==title) or (type(df.loc[latest,'title'])==list and (df.loc[latest,'title'] == title).any()) ):
self.logger.info(
"{} {} report has been the latest".format(security_item['code'], report_period))
return
Expand All @@ -106,6 +107,7 @@ def download_fi_report_event_data(self, response):
df = index_df_with_time(df, index='reportEventDate')
df.to_csv(path, index=False)
except Exception as e:
traceback.print_exc()
self.logger.error('error when getting k data url={} error={}'.format(response.url, e))

@classmethod
Expand Down
11 changes: 11 additions & 0 deletions scrapy.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = fooltrader.settings

[deploy]
#url = http://localhost:6800/
project = fooltrader

0 comments on commit 6b292ee

Please sign in to comment.