diff --git a/fooltrader/spiders/future/future_cffex_spider.py b/fooltrader/spiders/future/future_cffex_spider.py
new file mode 100644
index 0000000..d7ca64e
--- /dev/null
+++ b/fooltrader/spiders/future/future_cffex_spider.py
@@ -0,0 +1,64 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+import pandas as pd
+import scrapy
+from scrapy import Request
+
+from fooltrader.contract.files_contract import get_exchange_cache_path
+from fooltrader.utils.utils import to_timestamp
+
+
+class FutureCffexSpider(scrapy.Spider):
+    name = "future_cffex_spider"
+
+    custom_settings = {
+        # 'DOWNLOAD_DELAY': 2,
+        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
+    }
+
+    def __init__(self, name=None, **kwargs):
+        super().__init__(name, **kwargs)
+        self.trading_dates = None
+        # dataType is passed on the command line via -a dataType=...;
+        # default to None (daily kline) when the argument is omitted
+        self.dataType = getattr(self, 'dataType', None)
+
+    def start_requests(self):
+        if self.dataType is None or self.dataType == 'dayk':
+            # daily kline csv files, one per weekday since 2006-06-30
+            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
+            daterange = daterange[daterange.dayofweek < 5]
+            for i in daterange:
+                the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
+                                                  data_type='day_kdata', the_date=to_timestamp(i)) + ".csv"
+                if not os.path.exists(the_dir):
+                    yield Request(url="http://www.cffex.com.cn/sj/hqsj/rtj/" + i.strftime("%Y%m/%d/%Y%m%d") + "_1.csv",
+                                  callback=self.download_cffex_history_data_file,
+                                  meta={'filename': the_dir})
+        elif self.dataType == 'inventory':
+            # holding-rank csv files, one per product and weekday
+            daterange = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
+            daterange = daterange[daterange.dayofweek < 5]
+            k = ['IF', 'IC', 'IH', 'T', 'TF']
+            for i in daterange:
+                for j in k:
+                    the_dir = get_exchange_cache_path(security_type='future', exchange='cffex',
+                                                      data_type='inventory', the_date=to_timestamp(i)) + j + ".csv"
+                    if not os.path.exists(the_dir):
+                        yield Request(url="http://www.cffex.com.cn/sj/ccpm/" + i.strftime("%Y%m/%d/") + j + "_1.csv",
+                                      callback=self.download_cffex_history_data_file,
+                                      meta={'filename': the_dir})
+
+    def download_cffex_history_data_file(self, response):
+        content_type_header = response.headers.get('content-type', None)
+        the_path = response.meta['filename']
+
+        if content_type_header is not None and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
+            with open(the_path, "wb") as f:
+                f.write(response.body)
+        else:
+            self.logger.error(
+                "get cffex data failed: the_path={} url={} content type={}".format(
+                    the_path, response.url, content_type_header))
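A note on the date enumeration above, which the CZCE, DCE and SHFE spiders below repeat: candidate trading days are simply all weekdays in the window. Holidays are not filtered here; requests for them come back with an unexpected content type and are logged by the download callback. A minimal standalone illustration of the pattern and of the strftime URL segment (plain pandas, no fooltrader imports needed):

    import pandas as pd

    # Enumerate candidate trading days: every weekday in the window.
    days = pd.date_range(start='2018-01-01', end='2018-01-14')
    weekdays = days[days.dayofweek < 5]  # Monday=0 .. Friday=4
    print(len(days), len(weekdays))      # 14 10

    # The date segment of the CFFEX daily kline URL.
    print(weekdays[0].strftime("%Y%m/%d/%Y%m%d"))  # 201801/01/20180101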
diff --git a/fooltrader/spiders/future/future_czce_spider.py b/fooltrader/spiders/future/future_czce_spider.py
new file mode 100644
index 0000000..999100d
--- /dev/null
+++ b/fooltrader/spiders/future/future_czce_spider.py
@@ -0,0 +1,84 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+import pandas as pd
+import scrapy
+from scrapy import Request
+
+from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
+from fooltrader.utils.utils import to_timestamp
+
+
+class FutureCzceSpider(scrapy.Spider):
+    name = "future_czce_spider"
+
+    custom_settings = {
+        # 'DOWNLOAD_DELAY': 2,
+        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
+    }
+
+    def __init__(self, name=None, **kwargs):
+        super().__init__(name, **kwargs)
+        self.trading_dates = None
+        # dataType is passed on the command line via -a dataType=...
+        self.dataType = getattr(self, 'dataType', None)
+
+    def start_requests(self):
+        if self.dataType is None:
+            # current year's daily kline files
+            today = pd.Timestamp.today()
+            for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
+                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
+                                                  the_date=to_timestamp(date), data_type='day_kdata') + '.xls'
+                if date.dayofweek < 5 and not os.path.exists(the_dir):
+                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataDaily.xls",
+                                  callback=self.download_czce_kline_data,
+                                  meta={'filename': the_dir})
+        elif self.dataType == 'historyk':
+            yield Request(url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
+                          callback=self.download_czce_history_data)
+        elif self.dataType == 'inventory':
+            today = pd.Timestamp.today()
+            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=450), end=today):
+                the_dir = get_exchange_cache_path(security_type='future', exchange='czce',
+                                                  the_date=to_timestamp(date), data_type='inventory') + '.xls'
+                if date.dayofweek < 5 and not os.path.exists(the_dir):
+                    yield Request(url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/" + date.strftime("%Y/%Y%m%d") + "/FutureDataHolding.xls",
+                                  callback=self.download_czce_kline_data,
+                                  meta={'filename': the_dir})
+
+    def download_czce_kline_data(self, response):
+        content_type_header = response.headers.get('content-type', None)
+        the_path = response.meta['filename']
+
+        if content_type_header is not None and content_type_header.decode("utf-8") in (
+                'application/zip', 'text/csv', 'application/x-zip-compressed', 'application/excel'):
+            with open(the_path, "wb") as f:
+                f.write(response.body)
+        else:
+            self.logger.error(
+                "get czce data failed: the_path={} url={} content type={}".format(
+                    the_path, response.url, content_type_header))
+
+    def download_czce_history_data(self, response):
+        the_dir = get_exchange_cache_dir(security_type='future', exchange='czce')
+        for filepath in response.xpath('//a[contains(@href,"zip")]').xpath('@href').extract():
+            # prefix the file name with its parent directory (usually the year)
+            # unless that directory is the generic "exchange" folder, so files
+            # from different years do not collide in the flat cache directory
+            prefix = "" if filepath.split("/")[-2] == "exchange" else filepath.split("/")[-2]
+            yield Request(url="http://www.czce.com.cn/" + filepath,
+                          meta={'filename': os.path.join(the_dir, prefix + filepath.split("/")[-1])},
+                          callback=self.download_czce_history_data_file)
+
+    def download_czce_history_data_file(self, response):
+        content_type_header = response.headers.get('content-type', None)
+        the_path = response.meta['filename']
+
+        if content_type_header is not None and content_type_header.decode("utf-8") in (
+                'application/zip', 'text/csv', 'application/x-zip-compressed'):
+            with open(the_path, "wb") as f:
+                f.write(response.body)
+        else:
+            self.logger.error(
+                "get czce history data failed: the_path={} url={} content type={}".format(
+                    the_path, response.url, content_type_header))
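Both spiders so far (and the two below) end in the same download callback shape: check the Content-Type, write response.body to the cache path, otherwise log an error. If the duplication becomes a maintenance burden, it could be hoisted into a single helper. A sketch of what that might look like, assuming only Scrapy's standard Response API; save_if_expected_type is a hypothetical name, not an existing fooltrader function:

    def save_if_expected_type(response, the_path, expected_types, logger):
        # Persist the payload only when the server returned one of the
        # expected content types; otherwise log and report failure.
        content_type = response.headers.get('content-type', None)
        if content_type is not None and content_type.decode("utf-8") in expected_types:
            with open(the_path, "wb") as f:
                f.write(response.body)
            return True
        logger.error("unexpected content type: path={} url={} content_type={}".format(
            the_path, response.url, content_type))
        return False

Each callback would then reduce to one call with its own tuple of accepted types.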
diff --git a/fooltrader/spiders/future/future_dce_spider.py b/fooltrader/spiders/future/future_dce_spider.py
new file mode 100644
index 0000000..f30fbaf
--- /dev/null
+++ b/fooltrader/spiders/future/future_dce_spider.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+import os
+
+import pandas as pd
+import scrapy
+from scrapy import FormRequest, Request
+
+from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path
+from fooltrader.utils.utils import to_timestamp
+
+
+class FutureDceSpider(scrapy.Spider):
+    name = "future_dce_spider"
+
+    custom_settings = {
+        # 'DOWNLOAD_DELAY': 2,
+        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
+    }
+
+    def __init__(self, name=None, **kwargs):
+        super().__init__(name, **kwargs)
+        # dataType is passed on the command line via -a dataType=...
+        self.dataType = getattr(self, 'dataType', None)
+
+    def start_requests(self):
+        if self.dataType == 'historyk':
+            return self.request_history_kdata()
+        elif self.dataType == 'inventory':
+            return self.request_inventory_data()
+        else:
+            # default (dataType omitted or unrecognized): current year's kline
+            return self.request_currentyear_kdata()
+
+    def request_inventory_data(self):
+        today = pd.Timestamp.today()
+        requests = []
+        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
+            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
+                                              the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
+            if date.dayofweek < 5 and not os.path.exists(the_dir):
+                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
+                                            formdata={
+                                                'batchExportFlag': 'batch',
+                                                'contract.contract_id': 'all',
+                                                'contract.variety_id': 'a',
+                                                'year': str(date.year),
+                                                'month': str(date.month - 1),  # the form appears to expect a zero-based month
+                                                'day': str(date.day),
+                                                'memberDealPosiQuotes.trade_type': '0',
+                                                'memberDealPosiQuotes.variety': 'all'
+                                            },
+                                            callback=self.download_dce_kline_data,
+                                            meta={'filename': the_dir}))
+        return requests
+
+    def request_currentyear_kdata(self):
+        today = pd.Timestamp.today()
+        requests = []
+        for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
+            the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
+                                              the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
+            if date.dayofweek < 5 and not os.path.exists(the_dir):
+                requests.append(FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
+                                            formdata={
+                                                'year': str(date.year),
+                                                'month': str(date.month - 1),  # zero-based month again
+                                                'day': str(date.day),
+                                                'dayQuotes.trade_type': '0',
+                                                'dayQuotes.variety': 'all',
+                                                'exportType': 'excel'
+                                            },
+                                            callback=self.download_dce_kline_data,
+                                            meta={'filename': the_dir}))
+        return requests
+
+    def request_history_kdata(self):
+        # index page listing the yearly history archives, linked via <input rel=...>
+        return [Request(url="http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html",
+                        callback=self.download_dce_history_data)]
+
+    def download_dce_history_data(self, response):
+        the_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
+        for filepath in response.css('input').xpath('@rel').extract():
+            yield Request(url="http://www.dce.com.cn/" + filepath,
+                          meta={'filename': os.path.join(the_dir, filepath.split("/")[-1])},
+                          callback=self.download_dce_history_data_file)
+
+    def download_dce_kline_data(self, response):
+        # Scrapy header lookup is case-insensitive, so one get() covers both
+        # 'content-type' and 'Content-Type'
+        content_type_header = response.headers.get('content-type', None)
+        the_path = response.meta['filename']
+
+        if content_type_header is not None and content_type_header.decode("utf-8") in (
+                'application/zip', 'text/csv', 'application/octet-stream;charset=utf-8'):
+            with open(the_path, "wb") as f:
+                f.write(response.body)
+        else:
+            self.logger.error(
+                "get dce kline data failed: the_path={} url={} content type={}".format(
+                    the_path, response.url, content_type_header))
+
+    def download_dce_history_data_file(self, response):
+        content_type_header = response.headers.get('content-type', None)
+        the_path = response.meta['filename']
+
+        if content_type_header is not None and content_type_header.decode("utf-8") in ('application/zip', 'text/csv'):
+            with open(the_path, "wb") as f:
+                f.write(response.body)
+        else:
+            self.logger.error(
+                "get dce history data failed: the_path={} url={} content type={}".format(
+                    the_path, response.url, content_type_header))
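The month field in both DCE form payloads above is date.month - 1. That looks like an off-by-one, but the DCE export form appears to expect a zero-based month, consistent with a JavaScript Date-backed form; treat that as an assumption to verify against the live site before changing it. A small sketch that makes the convention explicit (dce_day_quotes_formdata is illustrative, not part of the diff):

    import pandas as pd

    def dce_day_quotes_formdata(date):
        # Mirrors the payload built in request_currentyear_kdata above.
        return {
            'year': str(date.year),
            'month': str(date.month - 1),  # zero-based month, as posted by the spider
            'day': str(date.day),
            'dayQuotes.trade_type': '0',
            'dayQuotes.variety': 'all',
            'exportType': 'excel',
        }

    print(dce_day_quotes_formdata(pd.Timestamp('2018-03-01'))['month'])  # prints "2"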
diff --git a/fooltrader/spiders/future/future_shfe_spider.py b/fooltrader/spiders/future/future_shfe_spider.py
index 79a1c10..ac2661c 100644
--- a/fooltrader/spiders/future/future_shfe_spider.py
+++ b/fooltrader/spiders/future/future_shfe_spider.py
@@ -2,6 +2,7 @@
 
 import os
 from datetime import datetime
+import pandas as pd
 
 import scrapy
 from scrapy import Request
@@ -27,6 +28,15 @@ def __init__(self, name=None, **kwargs):
 
     def start_requests(self):
         self.trading_dates = self.settings.get("trading_dates")
+        # -a dataType=inventory; getattr guards the case where the argument is omitted
+        if getattr(self, 'dataType', None) == 'inventory':
+            today = pd.Timestamp.today()
+            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
+                the_dir = get_exchange_cache_path(security_type='future', exchange='shfe', the_date=to_timestamp(date), data_type='inventory') + '.json'
+                if date.dayofweek < 5 and not os.path.exists(the_dir):
+                    yield Request(url=self.get_day_inventory_url(the_date=date.strftime('%Y%m%d')),
+                                  meta={'the_date': date,
+                                        'the_path': the_dir},
+                                  callback=self.download_shfe_data_by_date)
 
         if self.trading_dates:
             # daily data
@@ -96,5 +106,8 @@ def get_year_k_data_url(self, the_year):
     def get_day_kdata_url(self, the_date):
         return 'http://www.shfe.com.cn/data/dailydata/kx/kx{}.dat'.format(the_date)
 
+    def get_day_inventory_url(self, the_date):
+        return 'http://www.shfe.com.cn/data/dailydata/kx/pm{}.dat'.format(the_date)
+
     def get_trading_date_url(self):
         return 'http://www.shfe.com.cn/bourseService/businessdata/calendar/20171201all.dat'
diff --git a/fooltrader/spiders/stock_finance_report_event_spider.py b/fooltrader/spiders/stock_finance_report_event_spider.py
index b02a916..a319db8 100644
--- a/fooltrader/spiders/stock_finance_report_event_spider.py
+++ b/fooltrader/spiders/stock_finance_report_event_spider.py
@@ -7,6 +7,7 @@
 from scrapy import Request
 from scrapy import Selector
 from scrapy import signals
+import traceback
 
 from fooltrader.api import event
 from fooltrader.api.quote import get_security_list
@@ -89,7 +90,7 @@ def download_fi_report_event_data(self, response):
                 if i == 0:
                     if not df.empty:
                         latest = pd.Timestamp(report_event_dates[0]).date()
-                        if df.index.contains(latest) and (df.loc[latest, 'title'] == title):
+                        # df.loc returns a scalar for a unique date index but a
+                        # Series when the date is duplicated; handle both shapes
+                        if df.index.contains(latest) and ((isinstance(df.loc[latest, 'title'], str) and df.loc[latest, 'title'] == title) or (isinstance(df.loc[latest, 'title'], pd.Series) and (df.loc[latest, 'title'] == title).any())):
                             self.logger.info(
                                 "{} {} report has been the latest".format(security_item['code'], report_period))
                             return
@@ -106,6 +107,7 @@ def download_fi_report_event_data(self, response):
             df = index_df_with_time(df, index='reportEventDate')
             df.to_csv(path, index=False)
         except Exception as e:
+            traceback.print_exc()
             self.logger.error('error when getting k data url={} error={}'.format(response.url, e))
 
     @classmethod
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..9397d42
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = fooltrader.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = fooltrader
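On the stock_finance_report_event_spider change: the widened guard exists because pandas .loc returns a scalar for a unique index label but a Series when the label is duplicated; it never returns a plain list, which is why the rewritten condition checks str and pd.Series. A self-contained demonstration:

    import pandas as pd

    day = pd.Timestamp('2018-03-01').date()

    # Unique index label: .loc returns the scalar cell value.
    df1 = pd.DataFrame({'title': ['annual report']}, index=[day])
    print(type(df1.loc[day, 'title']))  # <class 'str'>

    # Duplicated index label: .loc returns a Series, so compare with .any().
    df2 = pd.DataFrame({'title': ['annual report', 'q1 report']}, index=[day, day])
    print(type(df2.loc[day, 'title']))  # <class 'pandas.core.series.Series'>
    print((df2.loc[day, 'title'] == 'q1 report').any())  # True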