Repository: shidenggui/easyhistory Branch: master Commit: 46b64db25e56 Files: 12 Total size: 24.1 KB Directory structure: gitextract_zhjr4gnz/ ├── .gitignore ├── README.md ├── easyhistory/ │ ├── __init__.py │ ├── api.py │ ├── day.py │ ├── helpers.py │ ├── history.py │ └── store.py ├── requirements.txt ├── setup.py ├── test.py └── test_history.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ #Ipython Notebook .ipynb_checkpoints ================================================ FILE: README.md ================================================ # easyhistory 用于获取维护股票的历史数据 ### 引入 ```python import easyhistory ``` ### 读取 rqalpha 保存的历史数据 读取平安银行 ```python easyhistory.history('000001') return pandas dataframe: open close high low volume total_turnover \ datetime 20050104000000 1.6668 1.6491 1.6668 1.6338 6961738 11465602 20050105000000 1.6491 1.6338 1.6567 1.6061 12739274 20718558 20050106000000 1.6440 1.6491 1.6668 1.6314 10542101 17333840 20050107000000 1.6643 1.6466 1.6693 1.6338 7457207 12302853 20050110000000 1.6466 1.6668 1.6668 1.6112 10406261 17111498 limit_up limit_down datetime 20050104000000 1.8337 1.4999 20050105000000 1.8135 1.4846 20050106000000 1.7983 1.4695 20050107000000 1.8135 1.4846 20050110000000 1.8110 1.4822 ``` 读取上证指数 ```python easyhistory.history('000001', market='sh') ``` ### 初始化日线历史数据 ```python easyhistory.init('D', export='csv', path='history') ``` 注1: 下载后的原始数据在 `path/day/raw_data` 下, 复权后数据在 `path/day/data` 下 注2: 下载所有股票的历史数据需要很长时间,推荐直接从[百度盘](http://pan.baidu.com/s/1o7rwH0e)(数据到 20160318 )下载, ### 更新 ```python easyhistory.update('D', export='csv', path='history') ``` ### 指标系统 目前还在测试中,指标计算使用了 `talib` 和 `pandas`, 可以直接调用 `talib` 计算一百多种指标,包括 `MACD, EMA, MA` 等 * tablib 安装: https://github.com/mrjbq7/ta-lib * pandas: pip install pandas #### 使用 ```python his = easyhistory.History(dtype='D', path='行情目录') # MA 计算, 直接调用的 talib 的对应函数 res = his['000001'].MA(5) # 返回的是 pandas 的 dataframe 格式 open high close low volume amount factor MA5 date 2016-03-10 10.24 10.35 10.15 10.13 506112.94 5193459.68 93.659 10.268 2016-03-11 10.10 10.22 10.16 10.04 409716.87 4160186.89 93.659 10.220 ``` 注: [talib 可用指标以及相关参数](https://github.com/mrjbq7/ta-lib) 以及 [pandas 相关](https://github.com/pydata/pandas) ### Q&A Q:安装 `talib` 提示找不到 `vcvarsall.bat` ? A: 去 `http://www.lfd.uci.edu/~gohlke/pythonlibs` 下载 `wheels`版本的包使用 `pip install xxx.whl` 安装 ================================================ FILE: easyhistory/__init__.py ================================================ # coding:utf-8 from .api import * __version__ = '0.0.1' ================================================ FILE: easyhistory/api.py ================================================ # coding:utf-8 from rqalpha.data.base_data_source import BaseDataSource import pandas as pd import easyutils import datetime import os from .day import Day def init(dtype='D', export='csv', path='history'): return Day(path=path, export=export).init() def update_single_code(dtype='D', stock_code=None, path='history', export='csv'): if stock_code is None: raise Exception('stock code is None') return Day(path=path, export=export).update_single_code(stock_code) def update(dtype='D', export='csv', path='history'): return Day(path=path, export=export).update() def history(stock_code, market=None, bundle_path='~/.rqalpha/bundle'): d = BaseDataSource(os.path.expanduser(bundle_path)) instruments = d._instruments.get_all_instruments() stock_map = {i.order_book_id: i for i in instruments} if not market: market = easyutils.get_stock_type(stock_code) if market == 'sh': stock_code += '.XSHG' else: stock_code += '.XSHE' raw = d._all_day_bars_of(stock_map[stock_code]) df = pd.DataFrame.from_dict(raw) df.set_index('datetime', inplace=True) return df # ================================================ FILE: easyhistory/day.py ================================================ # coding: utf-8 import math import re import time from datetime import datetime from datetime import timedelta from multiprocessing.pool import ThreadPool import requests from pyquery import PyQuery from . import helpers from . import store class Day: SINA_API = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/{stock_code}.phtml' SINA_API_HOSTNAME = 'vip.stock.finance.sina.com.cn' STOCK_CODE_API = 'http://218.244.146.57/static/all.csv' def __init__(self, path='history', export='csv'): self.store = store.use(export=export, path=path, dtype='D') def init(self): stock_codes = self.store.init_stock_codes pool = ThreadPool(10) pool.map(self.init_stock_history, stock_codes) def update(self): """ 更新已经下载的历史数据 """ stock_codes = self.store.update_stock_codes pool = ThreadPool(2) pool.map(self.update_single_code, stock_codes) def update_single_code(self, stock_code): """ 更新对应的股票文件历史行情 :param stock_code: 股票代码 :return: """ latest_date = self.store.get_his_stock_date(stock_code) updated_data = self.get_update_day_history(stock_code, latest_date) if len(updated_data) == 0 or len(updated_data[0]) == 0: return self.store.write(stock_code, updated_data) def get_update_day_history(self, stock_code, latest_date): data_year = latest_date.year data_quarter = helpers.get_quarter(latest_date.month) now_year = datetime.now().year # 使用下一天的日期作为更新起始日,避免季度末时多更新上一季度的内容 tomorrow = datetime.now() + timedelta(days=1) now_quarter = helpers.get_quarter(tomorrow.month) updated_data = list() for year in range(data_year, now_year + 1): for quarter in range(1, 5): if year == data_year: if quarter < data_quarter: continue if year == now_year: if quarter > now_quarter: continue # if year == now_year: # if quarter > now_quarter: # continue # elif year == data_year: # if quarter < data_quarter: # continue updated_data += self.get_quarter_history(stock_code, year, quarter) updated_data.sort(key=lambda day: day[0]) return updated_data def init_stock_history(self, stock_code): all_history = self.get_all_history(stock_code) if len(all_history) <= 0: return self.store.write(stock_code, all_history) def get_all_history(self, stock_code): years = self.get_stock_time(stock_code) all_history = [] for year in years: year_history = self.get_year_history(stock_code, year) all_history += year_history all_history.sort(key=lambda day: day[0]) return all_history def get_year_history(self, stock_code, year): year_history = [] now_year = datetime.now().year now_month = datetime.now().month end_quarter = 5 if str(year) != str(now_year) else math.ceil(now_month / 3) + 1 for quarter in range(1, end_quarter): quarter_data = self.get_quarter_history(stock_code, year, quarter) if quarter_data is None: continue year_history += quarter_data return year_history def get_stock_time(self, stock_code): # 获取年月日 url = self.SINA_API.format(stock_code=stock_code) try: dom = PyQuery(url) except requests.ConnectionError: return [] year_options = dom('select[name=year] option') years = [o.text for o in year_options][::-1] return years def get_quarter_history(self, stock_code, year, quarter): year = int(year) if year < 1990: return list() params = dict( year=year, jidu=quarter ) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko' } print('request {},{},{}'.format(stock_code, year, quarter)) url = self.SINA_API.format(stock_code=stock_code) rep = list() loop_nums = 10 for i in range(loop_nums): try: rep = requests.get(url, params, timeout=3, headers=headers) break except requests.ConnectionError: time.sleep(60) except Exception as e: with open('error.log', 'a+') as f: f.write(str(e)) print('end request {}, {}, {}'.format(stock_code, year, quarter)) if rep is None: with open('error.txt', 'a+') as f: f.write('{},{},{}'.format(stock_code, year, quarter)) return list() res = self.handle_quarter_history(rep.text) return res def handle_quarter_history(self, rep_html): dom = PyQuery(rep_html) raw_trows = dom('#FundHoldSharesTable tr') empty_history_nodes = 2 if len(raw_trows) <= empty_history_nodes: return list() unused_head_index_end = 2 trows = raw_trows[unused_head_index_end:] res = list() for row_td_list in trows: td_list = row_td_list.getchildren() day_history = [] for i, td in enumerate(td_list): td_content = td.text_content() date_index = 0 if i == date_index: td_content = re.sub(r'\r|\n|\t', '', td_content) day_history.append(td_content) self.convert_stock_data_type(day_history) res.append(day_history) return res def convert_stock_data_type(self, day_data): """将获取的对应日期股票数据除了日期之外,转换为正确的 float / int 类型 :param day_data: ['2016-02-19', '945.019', '949.701', '940.336', '935.653', '31889824.000', '320939648.000', '93.659'] :return: ['2016-02-19', 945.019, 949.701, 940.336, 935.653, 31889824.000, 320939648.000, 93.659] """ date_index = 0 for i, val in enumerate(day_data): if i == date_index: continue day_data[i] = float(val) ================================================ FILE: easyhistory/helpers.py ================================================ # coding:utf-8 import math def get_quarter(month): return math.ceil(int(month) / 3) ================================================ FILE: easyhistory/history.py ================================================ # coding:utf-8 import os import pandas as pd import talib class Indicator(object): def __init__(self, stock_code, history): self.stock_code = stock_code self.history = history self.hisarg = {} def load_csv_files(self, path): file_list = [f for f in os.listdir(path) if f.endswith('.csv')] for stock_csv in file_list: csv_ext_index_start = -4 stock_code = stock_csv[:csv_ext_index_start] self.market[stock_code] = pd.read_csv(stock_csv, index_col='date') def __getattr__(self, item): def talib_func(*args, **kwargs): str_args = ''.join(map(str, args)) index = item + str_args if index in self.hisarg and self.hisarg[index] is not None: return self.hisarg[index] func = getattr(talib, item) res_arr = func(self.history['close'].values, *args, **kwargs) self.hisarg[index] = res_arr return self.hisarg[index] return talib_func class History(object): def __init__(self, dtype='D', path='history', stock=None): self.market = dict() data_path = os.path.join(path, 'day', 'data') self.load_csv_files(data_path, stock) def load_csv_files(self, path, stock=None): if stock and os.path.exists( os.path.join(path, stock+'.csv') ): stock_csv = stock+'.csv' stock_code = stock csv_path = os.path.join(path, stock_csv) self.market[stock_code] = Indicator(stock_code, pd.read_csv(csv_path, index_col='date')) return file_list = [f for f in os.listdir(path) if f.endswith('.csv')] for stock_csv in file_list: csv_ext_index_start = -4 stock_code = stock_csv[:csv_ext_index_start] csv_path = os.path.join(path, stock_csv) self.market[stock_code] = Indicator(stock_code, pd.read_csv(csv_path, index_col='date')) def __getitem__(self, item): return self.market[item] ================================================ FILE: easyhistory/store.py ================================================ # coding: utf8 import json import os from datetime import datetime import easyutils import pandas as pd def use(export='csv', **kwargs): if export.lower() in ['csv']: return CSVStore(**kwargs) class Store: def load(self, stock_data): pass def write(self, stock_code, data): pass class CSVStore(Store): def __init__(self, path, dtype): if dtype.lower() in ['d']: self.path = os.path.join(path, 'day') self.result_path = os.path.join(self.path, 'data') self.raw_path = os.path.join(self.path, 'raw_data') def write(self, stock_code, updated_data): if not os.path.exists(self.result_path): os.makedirs(self.result_path) if not os.path.exists(self.raw_path): os.makedirs(self.raw_path) csv_file_path = os.path.join(self.raw_path, '{}.csv'.format(stock_code)) if os.path.exists(csv_file_path): try: his = pd.read_csv(csv_file_path) except ValueError: return updated_data_start_date = updated_data[0][0] old_his = his[his.date < updated_data_start_date] updated_his = pd.DataFrame(updated_data, columns=his.columns) his = old_his.append(updated_his) else: his = pd.DataFrame(updated_data, columns=['date', 'open', 'high', 'close', 'low', 'volume', 'amount', 'factor']) his.to_csv(csv_file_path, index=False) date = his.iloc[-1].date self.write_summary(stock_code, date) self.write_factor_his(stock_code, his) def get_his_stock_date(self, stock_code): summary_path = os.path.join(self.raw_path, '{}_summary.json'.format(stock_code)) with open(summary_path) as f: summary = json.load(f) latest_date = datetime.strptime(summary['date'], '%Y-%m-%d') return latest_date def write_summary(self, stock_code, date): file_path = os.path.join(self.raw_path, '{}_summary.json'.format(stock_code)) with open(file_path, 'w') as f: latest_day = datetime.strptime(date, '%Y-%m-%d') summary = dict( year=latest_day.year, month=latest_day.month, day=latest_day.day, date=date ) json.dump(summary, f) def write_factor_his(self, stock_code, his): result_file_path = os.path.join(self.result_path, '{}.csv'.format(stock_code)) factor_cols = his.columns.difference(['date']) his[factor_cols] = his[factor_cols] / his.factor.max() his.to_csv(result_file_path, index=False) @property def init_stock_codes(self): stock_codes = easyutils.stock.get_all_stock_codes() exists_codes = set() if os.path.exists(self.raw_path): code_slice = slice(-4) exists_codes = {code[code_slice] for code in os.listdir(self.raw_path) if code.endswith('.csv')} return set(stock_codes).difference(exists_codes) @property def update_stock_codes(self): code_slice = slice(6) return [f[code_slice] for f in os.listdir(self.raw_path) if f.endswith('.json')] ================================================ FILE: requirements.txt ================================================ requests pandas easyutils ================================================ FILE: setup.py ================================================ # coding:utf8 from setuptools import setup import easyhistory long_desc = """ easyhistory =============== * easy to use rqalpha history data Installation -------------- pip install easytrader Upgrade --------------- pip install easytrader --upgrade """ setup( name='easyhistory', version=easyhistory.__version__, description='A utility for rqalpha history', long_description=long_desc, author='shidenggui', author_email='longlyshidenggui@gmail.com', license='BSD', url='https://github.com/shidenggui/easyhistory', keywords='China stock trade', install_requires=[ 'rqalpha', 'requests', 'six', 'easyutils', ], classifiers=['Development Status :: 4 - Beta', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'License :: OSI Approved :: BSD License'], packages=['easyhistory'], package_data={}, ) ================================================ FILE: test.py ================================================ import easyhistory easyhistory.update() ================================================ FILE: test_history.py ================================================ import unittest from datetime import datetime import easyhistory class TestHistory(unittest.TestCase): def test_get_history(self): test_date = '000001' normal_data = [str(y) for y in range(1991, datetime.now().year + 1)] res = easyhistory.Day().get_stock_time(test_date) self.assertListEqual(res, normal_data) def test_get_quarter_history(self): test_data = ['000001', 2016, 1] normal_data = [['2016-03-31', 1003.087, 1006.833, 996.53, 996.53, 41838792.0, 447266272.0, 93.659], ['2016-03-30', 981.545, 1002.15, 1002.15, 980.608, 53970000.0, 572627392.0, 93.659], ['2016-03-29', 984.354, 985.291, 976.862, 972.179, 31831788.0, 332422400.0, 93.659], ['2016-03-28', 994.657, 997.466, 981.544, 978.735, 35862100.0, 378973312.0, 93.659], ['2016-03-25', 984.354, 992.783, 991.847, 983.417, 23707048.0, 250338496.0, 93.659], ['2016-03-24', 993.72, 995.593, 985.291, 983.417, 37240624.0, 393411552.0, 93.659], ['2016-03-23', 1003.085, 1008.705, 1002.149, 993.72, 43027816.0, 458963264.0, 93.659], ['2016-03-22', 1008.705, 1024.627, 1004.022, 1001.213, 62548248.0, 675406592.0, 93.659], ['2016-03-21', 988.1, 1017.134, 1011.515, 988.1, 92043280.0, 987764480.0, 93.659], ['2016-03-18', 975.924, 989.973, 987.163, 974.051, 79721584.0, 836565568.0, 93.659], ['2016-03-17', 970.305, 981.544, 975.925, 964.685, 61099640.0, 635181312.0, 93.659], ['2016-03-16', 961.876, 978.735, 969.369, 960.003, 66488620.0, 690087744.0, 93.659], ['2016-03-15', 962.813, 970.305, 966.559, 951.573, 41792036.0, 428786688.0, 93.659], ['2016-03-14', 956.256, 979.671, 960.939, 956.256, 65515824.0, 679161280.0, 93.659], ['2016-03-11', 945.953, 957.192, 951.573, 940.334, 38373672.0, 389638944.0, 93.659], ['2016-03-10', 959.066, 969.368, 950.637, 948.763, 47402032.0, 486414240.0, 93.659], ['2016-03-09', 949.7, 957.193, 952.51, 940.334, 32590064.0, 330124896.0, 93.659], ['2016-03-08', 970.305, 970.305, 962.812, 930.968, 64315648.0, 650471616.0, 93.659], ['2016-03-07', 969.369, 982.481, 968.432, 964.686, 60635296.0, 630155584.0, 93.659], ['2016-03-04', 945.017, 983.417, 974.051, 943.144, 138124912.0, 1429273088.0, 93.659], ['2016-03-03', 945.018, 953.447, 946.891, 940.335, 55308940.0, 559045184.0, 93.659], ['2016-03-02', 913.174, 948.764, 945.954, 910.364, 67661376.0, 673626240.0, 93.659], ['2016-03-01', 900.998, 915.047, 908.491, 897.252, 37791080.0, 365149024.0, 93.659], ['2016-02-29', 916.92, 918.794, 895.379, 882.267, 56689640.0, 542184000.0, 93.659], ['2016-02-26', 915.047, 920.666, 916.92, 904.744, 39215440.0, 382634656.0, 93.659], ['2016-02-25', 947.827, 948.764, 905.681, 899.124, 62207284.0, 615004736.0, 93.659], ['2016-02-24', 942.208, 950.637, 950.637, 937.525, 30010360.0, 302498016.0, 93.659], ['2016-02-23', 963.75, 963.75, 947.828, 941.272, 42587436.0, 432315296.0, 93.659], ['2016-02-22', 948.764, 965.623, 963.75, 942.208, 61773944.0, 630251520.0, 93.659], ['2016-02-19', 945.019, 949.701, 940.336, 935.653, 31889824.0, 320939648.0, 93.659], ['2016-02-18', 953.448, 957.194, 945.018, 945.018, 40617824.0, 412337568.0, 93.659], ['2016-02-17', 939.398, 957.194, 950.637, 935.652, 58516704.0, 590538944.0, 93.659], ['2016-02-16', 921.603, 939.398, 937.525, 920.667, 42838640.0, 427507776.0, 93.659], ['2016-02-15', 904.744, 922.54, 916.92, 903.808, 27849946.0, 271173376.0, 93.659], ['2016-02-05', 932.842, 933.779, 929.096, 928.159, 27089334.0, 269184384.0, 93.659], ['2016-02-04', 926.286, 936.589, 931.906, 925.35, 37309948.0, 370586176.0, 93.659], ['2016-02-03', 922.54, 926.286, 922.54, 915.047, 27457216.0, 269997824.0, 93.659], ['2016-02-02', 917.857, 939.398, 931.906, 915.984, 36910416.0, 367360512.0, 93.659], ['2016-02-01', 934.716, 937.525, 917.857, 912.237, 41773216.0, 412635648.0, 93.659], ['2016-01-29', 912.237, 944.081, 936.589, 907.554, 54443576.0, 540544448.0, 93.659], ['2016-01-28', 919.73, 926.286, 907.554, 903.808, 30254078.0, 296055328.0, 93.659], ['2016-01-27', 930.033, 934.716, 925.35, 899.125, 56903704.0, 558510656.0, 93.659], ['2016-01-26', 966.56, 966.56, 924.413, 923.477, 64790112.0, 653561600.0, 93.659], ['2016-01-25', 974.052, 977.799, 971.243, 967.496, 37643172.0, 390734880.0, 93.659], ['2016-01-22', 974.053, 978.736, 974.053, 957.194, 46675216.0, 482984448.0, 93.659], ['2016-01-21', 981.545, 1006.833, 966.56, 966.56, 60614512.0, 638127872.0, 93.659], ['2016-01-20', 1002.151, 1011.516, 987.165, 977.799, 60375248.0, 640968960.0, 93.659], ['2016-01-19', 978.736, 1009.643, 1003.087, 974.989, 50110908.0, 532074688.0, 93.659], ['2016-01-18', 968.434, 989.039, 974.99, 964.687, 42104088.0, 439917824.0, 93.659], ['2016-01-15', 998.404, 1011.517, 979.673, 975.926, 44820216.0, 474908128.0, 93.659], ['2016-01-14', 991.849, 1011.517, 1008.707, 981.546, 66631456.0, 708534976.0, 93.659], ['2016-01-13', 1019.947, 1024.63, 1003.088, 1002.152, 39170948.0, 424371712.0, 93.659], ['2016-01-12', 1014.327, 1021.82, 1012.454, 996.532, 56164232.0, 605970816.0, 93.659], ['2016-01-11', 1030.249, 1037.742, 1007.771, 1000.278, 73201400.0, 800683648.0, 93.659], ['2016-01-08', 1049.918, 1057.41, 1041.488, 1020.883, 74752760.0, 831334528.0, 93.659], ['2016-01-07', 1068.65, 1068.65, 1024.63, 1021.82, 17476110.0, 194869488.0, 93.659], ['2016-01-06', 1069.587, 1082.699, 1079.889, 1066.777, 51570644.0, 591698496.0, 93.659], ['2016-01-05', 1055.537, 1083.635, 1067.713, 1044.298, 66326996.0, 755531328.0, 93.659], ['2016-01-04', 1123.909, 1126.718, 1061.157, 1051.791, 56349788.0, 660376128.0, 93.659]] res = easyhistory.Day().get_quarter_history(*test_data) self.assertListEqual(res, normal_data) def test_day_data_type_convert(self): test_data = ['2016-02-19', '945.019', '949.701', '940.336', '935.653', '31889824.000', '320939648.000', '93.659'] normal_data = ['2016-02-19', 945.019, 949.701, 940.336, 935.653, 31889824.000, 320939648.000, 93.659] easyhistory.Day().convert_stock_data_type(test_data) self.assertListEqual(test_data, normal_data) if __name__ == '__main__': unittest.main()