-
Notifications
You must be signed in to change notification settings - Fork 328
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Yangguang
authored and
Yangguang
committed
Jul 9, 2018
1 parent
57ef708
commit 6b292ee
Showing
6 changed files
with
295 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
from datetime import datetime | ||
|
||
import scrapy | ||
from scrapy import Request | ||
from scrapy import signals | ||
import pandas as pd | ||
|
||
from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data | ||
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path | ||
from fooltrader.utils.utils import to_timestamp | ||
|
||
|
||
class FutureCffexSpider(scrapy.Spider):
    """Spider that downloads CFFEX (China Financial Futures Exchange) files.

    Driven by the optional spider argument ``dataType``
    (``scrapy crawl future_cffex_spider -a dataType=...``):

    * ``None`` / ``'dayk'`` -- daily k-data CSV files, one per trading day
    * ``'inventory'``       -- daily position CSV files per product code

    Downloads are cached on disk; dates whose cache file already exists
    are skipped.
    """
    name = "future_cffex_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.trading_dates = None

    def start_requests(self):
        """Yield one Request per missing cache file.

        NOTE: Scrapy only sets ``self.dataType`` when the argument is passed
        on the command line, so use ``getattr`` to avoid an AttributeError
        when it is omitted (the original ``self.dataType`` access crashed).
        """
        data_type = getattr(self, 'dataType', None)
        if data_type is None or data_type == 'dayk':
            # CFFEX opened 2006-06-30; weekdays only (dayofweek 0-4).
            date_range = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            date_range = date_range[date_range.dayofweek < 5]
            for the_date in date_range:
                the_path = get_exchange_cache_path(
                    security_type='future', exchange='cffex',
                    data_type='day_kdata', the_date=to_timestamp(the_date)) + ".csv"
                if not os.path.exists(the_path):
                    yield Request(
                        url="http://www.cffex.com.cn/sj/hqsj/rtj/"
                            + the_date.strftime("%Y%m/%d/%Y%m%d") + "_1.csv",
                        callback=self.download_cffex_history_data_file,
                        meta={'filename': the_path})
        elif data_type == 'inventory':
            date_range = pd.date_range(start='2006-06-30', end=pd.Timestamp.today())
            date_range = date_range[date_range.dayofweek < 5]
            # Product codes with per-product position files.
            products = ['IF', 'IC', 'IH', 'T', 'TF']
            for the_date in date_range:
                for product in products:
                    the_path = get_exchange_cache_path(
                        security_type='future', exchange='cffex',
                        data_type='inventory',
                        the_date=to_timestamp(the_date)) + product + ".csv"
                    if not os.path.exists(the_path):
                        yield Request(
                            url="http://www.cffex.com.cn/sj/ccpm/"
                                + the_date.strftime("%Y%m/%d/") + product + "_1.csv",
                            callback=self.download_cffex_history_data_file,
                            meta={'filename': the_path})

    def download_cffex_history_data_file(self, response):
        """Persist the response body when the server returned a real data file.

        The exchange answers error pages with an HTML content type, so only
        'application/zip' and 'text/csv' responses are written to disk;
        anything else is logged and dropped.
        """
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # Guard against a missing header: decode(None) would crash.
        content_type = content_type_header.decode("utf-8") if content_type_header else ''
        if content_type in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                "get cffex year data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
from datetime import datetime | ||
import pandas as pd | ||
|
||
import scrapy | ||
from scrapy import Request | ||
from scrapy import signals | ||
|
||
from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data | ||
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path | ||
from fooltrader.utils.utils import to_timestamp | ||
|
||
|
||
class FutureCzceSpider(scrapy.Spider):
    """Spider that downloads CZCE (Zhengzhou Commodity Exchange) files.

    Driven by the optional spider argument ``dataType``:

    * ``None``        -- current-year daily k-data .xls files
    * ``'historyk'``  -- yearly history .zip archives from the history page
    * ``'inventory'`` -- daily holding/position .xls files (~450 weeks back)

    Downloads are cached on disk; dates whose cache file already exists
    are skipped.
    """
    name = "future_czce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.trading_dates = None

    def start_requests(self):
        """Yield one Request per missing cache file.

        NOTE: Scrapy only sets ``self.dataType`` when the argument is passed
        on the command line, so use ``getattr`` to avoid an AttributeError
        when it is omitted (the original ``self.dataType`` access crashed).
        """
        data_type = getattr(self, 'dataType', None)
        if data_type is None:
            today = pd.Timestamp.today()
            # From Jan 1st of the current year (dayofyear-1 days back) to today.
            for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1),
                                      end=today):
                the_path = get_exchange_cache_path(
                    security_type='future', exchange='czce',
                    the_date=to_timestamp(date), data_type='day_kdata') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_path):
                    yield Request(
                        url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/"
                            + date.strftime("%Y/%Y%m%d") + "/FutureDataDaily.xls",
                        callback=self.download_czce_kline_data,
                        meta={'filename': the_path})
        elif data_type == 'historyk':
            yield Request(
                url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
                callback=self.download_czce_history_data)
        elif data_type == 'inventory':
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=450), end=today):
                the_path = get_exchange_cache_path(
                    security_type='future', exchange='czce',
                    the_date=to_timestamp(date), data_type='inventory') + '.xls'
                if date.dayofweek < 5 and not os.path.exists(the_path):
                    yield Request(
                        url="http://www.czce.com.cn/portal/DFSStaticFiles/Future/"
                            + date.strftime("%Y/%Y%m%d") + "/FutureDataHolding.xls",
                        callback=self.download_czce_kline_data,
                        meta={'filename': the_path})

    def download_czce_kline_data(self, response):
        """Save a daily k-data/holding file when the content type is a data file."""
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # Guard against a missing header: decode(None) would crash.
        content_type = content_type_header.decode("utf-8") if content_type_header else ''
        if content_type in ('application/zip', 'text/csv',
                            'application/x-zip-compressed', 'application/excel'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                "get czce year data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_czce_history_data(self, response):
        """Follow every zip link on the history index page."""
        the_dir = get_exchange_cache_dir(security_type='future', exchange='czce')
        for filepath in response.xpath('//a[contains(@href,"zip")]').xpath('@href').extract():
            # Prefix the filename with its parent directory (usually the year)
            # so files from different years don't collide; the generic
            # "exchange" directory adds no information and is dropped.
            parent = filepath.split("/")[-2]
            prefix = "" if parent == "exchange" else parent
            yield Request(url="http://www.czce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, prefix + filepath.split("/")[-1])},
                          callback=self.download_czce_history_data_file)

    def download_czce_history_data_file(self, response):
        """Save a yearly history archive when the content type is a data file."""
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # Guard against a missing header: decode(None) would crash.
        content_type = content_type_header.decode("utf-8") if content_type_header else ''
        if content_type in ('application/zip', 'text/csv', 'application/x-zip-compressed'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            # Fixed copy-paste bug: the original message said "shfe" in the
            # CZCE spider, which made log triage misleading.
            self.logger.error(
                "get czce history data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import os | ||
from datetime import datetime | ||
import pandas as pd | ||
|
||
import scrapy | ||
from scrapy import Request,FormRequest | ||
from scrapy import signals | ||
|
||
from fooltrader.api.quote import parse_shfe_data, parse_shfe_day_data | ||
from fooltrader.contract.files_contract import get_exchange_cache_dir, get_exchange_cache_path | ||
from fooltrader.utils.utils import to_timestamp | ||
|
||
|
||
class FutureDceSpider(scrapy.Spider):
    """Spider that downloads DCE (Dalian Commodity Exchange) files.

    Driven by the optional spider argument ``dataType``:

    * ``None`` / anything else -- current-year daily k-data .xls exports
    * ``'historyk'``           -- history archives linked from the history page
    * ``'inventory'``          -- daily member position exports (~520 weeks back)

    Downloads are cached on disk; dates whose cache file already exists
    are skipped.
    """
    name = "future_dce_spider"

    custom_settings = {
        # 'DOWNLOAD_DELAY': 2,
        # 'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
    }

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)

    def start_requests(self):
        """Dispatch to the request builder selected by the ``dataType`` argument.

        NOTE: Scrapy only sets ``self.dataType`` when the argument is passed
        on the command line, so use ``getattr`` to avoid an AttributeError
        when it is omitted (the original ``self.dataType`` access crashed).
        """
        data_type = getattr(self, 'dataType', None)
        if data_type == 'historyk':
            return self.request_history_kdata()
        if data_type == 'inventory':
            return self.request_inventory_data()
        # Default (None or unrecognized value): current-year day k-data.
        return self.request_currentyear_kdata()

    def request_inventory_data(self):
        """Build batch-export POST requests for daily member position data."""
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_path = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
            if date.dayofweek < 5 and not os.path.exists(the_path):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                    formdata={
                        'batchExportFlag': 'batch',
                        'contract.contract_id': 'all',
                        'contract.variety_id': 'a',
                        'year': str(date.year),
                        # The DCE form expects zero-based months -- TODO confirm
                        # against the site; kept as in the original.
                        'month': str(date.month - 1),
                        'day': str(date.day),
                        'memberDealPosiQuotes.trade_type': '0',
                        'memberDealPosiQuotes.variety': 'all'
                    },
                    callback=self.download_dce_kline_data,
                    meta={'filename': the_path}))
        return requests

    def request_currentyear_kdata(self):
        """Build export POST requests for this year's daily k-data."""
        today = pd.Timestamp.today()
        requests = []
        # From Jan 1st of the current year (dayofyear-1 days back) to today.
        for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1),
                                  end=today):
            the_path = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
            if date.dayofweek < 5 and not os.path.exists(the_path):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                    formdata={
                        'year': str(date.year),
                        # Zero-based month, matching the site's form -- TODO confirm.
                        'month': str(date.month - 1),
                        'day': str(date.day),
                        'dayQuotes.trade_type': '0',
                        'dayQuotes.variety': 'all',
                        'exportType': 'excel'
                    },
                    callback=self.download_dce_kline_data,
                    meta={'filename': the_path}))
        return requests

    def request_history_kdata(self):
        """Build the single request for the history-data index page."""
        return [Request(url="http://www.dce.com.cn/dalianshangpin/xqsj/lssj/index.html",
                        callback=self.download_dce_history_data)]

    def download_dce_history_data(self, response):
        """Follow every archive link (carried in input@rel) on the history page."""
        the_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
        for filepath in response.css('input').xpath('@rel').extract():
            yield Request(url="http://www.dce.com.cn/" + filepath,
                          meta={'filename': os.path.join(the_dir, filepath.split("/")[-1])},
                          callback=self.download_dce_history_data_file)

    def download_dce_kline_data(self, response):
        """Save an exported k-data/position file when the content type is a data file."""
        content_type_header = response.headers.get('content-type', None)
        if content_type_header is None:
            # Fallback lookup kept from the original implementation.
            content_type_header = response.headers.get('Content-Type', None)
        the_path = response.meta['filename']

        # Guard against a missing header: decode(None) would crash.
        content_type = content_type_header.decode("utf-8") if content_type_header else ''
        if content_type in ('application/zip', 'text/csv',
                            'application/octet-stream;charset=utf-8'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            self.logger.error(
                "get dce year kline data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))

    def download_dce_history_data_file(self, response):
        """Save a history archive when the content type is a data file."""
        content_type_header = response.headers.get('content-type', None)
        the_path = response.meta['filename']

        # Guard against a missing header: decode(None) would crash.
        content_type = content_type_header.decode("utf-8") if content_type_header else ''
        if content_type in ('application/zip', 'text/csv'):
            with open(the_path, "wb") as f:
                f.write(response.body)
        else:
            # Fixed copy-paste bug: the original message said "shfe" in the
            # DCE spider, which made log triage misleading.
            self.logger.error(
                "get dce history data failed:the_path={} url={} content type={} ".format(
                    the_path,
                    response.url,
                    content_type_header))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Automatically created by: scrapy startproject | ||
# | ||
# For more information about the [deploy] section see: | ||
# https://scrapyd.readthedocs.org/en/latest/deploy.html | ||
|
||
[settings] | ||
default = fooltrader.settings | ||
|
||
[deploy] | ||
#url = http://localhost:6800/ | ||
project = fooltrader |