Repository: DemonDamon/Listed-company-news-crawl-and-text-analysis
Branch: main
Commit: d7a20a1f7ee8
Files: 293
Total size: 2.5 MB
Directory structure:
gitextract_w0u594fz/
├── .deepsource.toml
├── .gitignore
├── LICENSE
├── README.md
├── README_zn.md
├── backend/
│ ├── .gitignore
│ ├── README.md
│ ├── README_zn.md
│ ├── add_raw_html_column.py
│ ├── app/
│ │ ├── __init__.py
│ │ ├── agents/
│ │ │ ├── __init__.py
│ │ │ ├── data_collector.py
│ │ │ ├── data_collector_v2.py
│ │ │ ├── debate_agents.py
│ │ │ ├── news_analyst.py
│ │ │ ├── orchestrator.py
│ │ │ ├── quantitative_agent.py
│ │ │ └── search_analyst.py
│ │ ├── alpha_mining/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── backtest/
│ │ │ │ ├── __init__.py
│ │ │ │ └── evaluator.py
│ │ │ ├── config.py
│ │ │ ├── dsl/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ops.py
│ │ │ │ └── vocab.py
│ │ │ ├── features/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── market.py
│ │ │ │ └── sentiment.py
│ │ │ ├── model/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── alpha_generator.py
│ │ │ │ └── trainer.py
│ │ │ ├── tools/
│ │ │ │ ├── __init__.py
│ │ │ │ └── alpha_mining_tool.py
│ │ │ ├── utils.py
│ │ │ └── vm/
│ │ │ ├── __init__.py
│ │ │ └── factor_vm.py
│ │ ├── api/
│ │ │ ├── __init__.py
│ │ │ └── v1/
│ │ │ ├── __init__.py
│ │ │ ├── agents.py
│ │ │ ├── alpha_mining.py
│ │ │ ├── analysis.py
│ │ │ ├── debug.py
│ │ │ ├── knowledge_graph.py
│ │ │ ├── llm_config.py
│ │ │ ├── news.py
│ │ │ ├── news_v2.py
│ │ │ ├── stocks.py
│ │ │ └── tasks.py
│ │ ├── config/
│ │ │ ├── __init__.py
│ │ │ └── debate_modes.yaml
│ │ ├── core/
│ │ │ ├── __init__.py
│ │ │ ├── celery_app.py
│ │ │ ├── config.py
│ │ │ ├── database.py
│ │ │ ├── neo4j_client.py
│ │ │ └── redis_client.py
│ │ ├── financial/
│ │ │ ├── __init__.py
│ │ │ ├── models/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── news.py
│ │ │ │ └── stock.py
│ │ │ ├── providers/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base.py
│ │ │ │ ├── eastmoney/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fetchers/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── news.py
│ │ │ │ │ └── provider.py
│ │ │ │ ├── nbd/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fetchers/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── news.py
│ │ │ │ │ └── provider.py
│ │ │ │ ├── netease/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fetchers/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── news.py
│ │ │ │ │ └── provider.py
│ │ │ │ ├── sina/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fetchers/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── news.py
│ │ │ │ │ └── provider.py
│ │ │ │ ├── tencent/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fetchers/
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── news.py
│ │ │ │ │ └── provider.py
│ │ │ │ └── yicai/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── fetchers/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── news.py
│ │ │ │ └── provider.py
│ │ │ ├── registry.py
│ │ │ └── tools.py
│ │ ├── knowledge/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── graph_models.py
│ │ │ ├── graph_service.py
│ │ │ ├── knowledge_extractor.py
│ │ │ └── parallel_search.py
│ │ ├── main.py
│ │ ├── models/
│ │ │ ├── __init__.py
│ │ │ ├── analysis.py
│ │ │ ├── crawl_task.py
│ │ │ ├── database.py
│ │ │ ├── debate_history.py
│ │ │ ├── news.py
│ │ │ └── stock.py
│ │ ├── scripts/
│ │ │ └── init_stocks.py
│ │ ├── services/
│ │ │ ├── __init__.py
│ │ │ ├── analysis_service.py
│ │ │ ├── embedding_service.py
│ │ │ ├── llm_service.py
│ │ │ └── stock_data_service.py
│ │ ├── storage/
│ │ │ ├── __init__.py
│ │ │ └── vector_storage.py
│ │ ├── tasks/
│ │ │ ├── __init__.py
│ │ │ └── crawl_tasks.py
│ │ └── tools/
│ │ ├── __init__.py
│ │ ├── bochaai_search.py
│ │ ├── caijing_crawler.py
│ │ ├── crawler_base.py
│ │ ├── crawler_enhanced.py
│ │ ├── dynamic_crawler_example.py
│ │ ├── eastmoney_crawler.py
│ │ ├── eeo_crawler.py
│ │ ├── interactive_crawler.py
│ │ ├── jingji21_crawler.py
│ │ ├── jwview_crawler.py
│ │ ├── nbd_crawler.py
│ │ ├── netease163_crawler.py
│ │ ├── search_engine_crawler.py
│ │ ├── sina_crawler.py
│ │ ├── tencent_crawler.py
│ │ ├── text_cleaner.py
│ │ └── yicai_crawler.py
│ ├── clear_news_data.py
│ ├── env.example
│ ├── init_db.py
│ ├── init_knowledge_graph.py
│ ├── requirements.txt
│ ├── reset_database.py
│ ├── setup_env.sh
│ ├── start.sh
│ ├── start_celery.sh
│ └── tests/
│ ├── __init__.py
│ ├── check_milvus_data.py
│ ├── check_news_embedding_status.py
│ ├── financial/
│ │ ├── __init__.py
│ │ ├── test_smoke_openbb_models.py
│ │ ├── test_smoke_openbb_provider.py
│ │ └── test_smoke_openbb_tools.py
│ ├── manual_vectorize.py
│ ├── test_alpha_mining/
│ │ ├── __init__.py
│ │ ├── test_integration_p2.py
│ │ ├── test_smoke_p0.py
│ │ └── test_smoke_p1.py
│ └── test_smoke_alpha_mining.py
├── deploy/
│ ├── Dockerfile.celery
│ ├── celery-entrypoint.sh
│ └── docker-compose.dev.yml
├── docs/
│ ├── BochaAI_Web_Search_API_20251222_121535.md
│ └── 天眼查MCP服务_20260104_171528.md
├── frontend/
│ ├── .gitignore
│ ├── QUICKSTART.md
│ ├── README.md
│ ├── index.html
│ ├── package.json
│ ├── postcss.config.js
│ ├── src/
│ │ ├── App.tsx
│ │ ├── components/
│ │ │ ├── DebateChatRoom.tsx
│ │ │ ├── DebateConfig.tsx
│ │ │ ├── DebateHistorySidebar.tsx
│ │ │ ├── HighlightText.tsx
│ │ │ ├── KLineChart.tsx
│ │ │ ├── MentionInput.tsx
│ │ │ ├── ModelSelector.tsx
│ │ │ ├── NewsDetailDrawer.tsx
│ │ │ ├── StockSearch.tsx
│ │ │ ├── alpha-mining/
│ │ │ │ ├── AgentDemo.tsx
│ │ │ │ ├── MetricsDashboard.tsx
│ │ │ │ ├── OperatorGrid.tsx
│ │ │ │ ├── SentimentCompare.tsx
│ │ │ │ ├── TrainingMonitor.tsx
│ │ │ │ └── index.ts
│ │ │ └── ui/
│ │ │ ├── badge.tsx
│ │ │ ├── button.tsx
│ │ │ ├── card.tsx
│ │ │ ├── dropdown-menu.tsx
│ │ │ ├── sheet.tsx
│ │ │ └── tabs.tsx
│ │ ├── context/
│ │ │ └── NewsToolbarContext.tsx
│ │ ├── hooks/
│ │ │ └── useDebounce.ts
│ │ ├── index.css
│ │ ├── layout/
│ │ │ └── MainLayout.tsx
│ │ ├── lib/
│ │ │ ├── api-client.ts
│ │ │ └── utils.ts
│ │ ├── main.tsx
│ │ ├── pages/
│ │ │ ├── AgentMonitorPage.tsx
│ │ │ ├── AlphaMiningPage.tsx
│ │ │ ├── Dashboard.tsx
│ │ │ ├── NewsListPage.tsx
│ │ │ ├── StockAnalysisPage.tsx
│ │ │ ├── StockSearchPage.tsx
│ │ │ └── TaskManagerPage.tsx
│ │ ├── store/
│ │ │ ├── useDebateStore.ts
│ │ │ ├── useLanguageStore.ts
│ │ │ ├── useNewsStore.ts
│ │ │ └── useTaskStore.ts
│ │ └── types/
│ │ └── api.ts
│ ├── tailwind.config.js
│ ├── tsconfig.json
│ ├── tsconfig.node.json
│ └── vite.config.ts
├── legacy_v1/
│ ├── .deepsource.toml
│ ├── Chinese_Stop_Words.txt
│ ├── Crawler/
│ │ ├── __init__.py
│ │ ├── crawler_cnstock.py
│ │ ├── crawler_jrj.py
│ │ ├── crawler_nbd.py
│ │ ├── crawler_sina.py
│ │ ├── crawler_stcn.py
│ │ └── crawler_tushare.py
│ ├── README_OLD.md
│ ├── Text_Analysis/
│ │ ├── __init__.py
│ │ ├── text_mining.py
│ │ └── text_processing.py
│ ├── finance_dict.txt
│ ├── run_crawler_cnstock.py
│ ├── run_crawler_jrj.py
│ ├── run_crawler_nbd.py
│ ├── run_crawler_sina.py
│ ├── run_crawler_stcn.py
│ ├── run_crawler_tushare.py
│ ├── run_main.py
│ └── src/
│ ├── Gon/
│ │ ├── __init__.py
│ │ ├── cnstockspyder.py
│ │ ├── history_starter_cnstock.py
│ │ ├── history_starter_jrj.py
│ │ ├── history_starter_nbd.py
│ │ ├── history_starter_stock_price.py
│ │ ├── ifengspyder.py
│ │ ├── jrjspyder.py
│ │ ├── kill_realtime_spyder_tasks.py
│ │ ├── money163spyder.py
│ │ ├── nbdspyder.py
│ │ ├── realtime_starter_cnstock.py
│ │ ├── realtime_starter_jrj.py
│ │ ├── realtime_starter_nbd.py
│ │ ├── realtime_starter_redis_queue.py
│ │ ├── realtime_starter_stock_price.py
│ │ ├── sinaspyder.py
│ │ ├── spyder.py
│ │ └── stockinfospyder.py
│ ├── Hisoka/
│ │ └── classifier.py
│ ├── Killua/
│ │ ├── __init__.py
│ │ ├── buildstocknewsdb.py
│ │ ├── deduplication.py
│ │ └── denull.py
│ ├── Kite/
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── database.py
│ │ ├── log.py
│ │ ├── utils.py
│ │ └── webserver.py
│ ├── Leorio/
│ │ ├── __init__.py
│ │ ├── chnstopwords.txt
│ │ ├── financedict.txt
│ │ ├── tokenization.py
│ │ └── topicmodelling.py
│ ├── __init__.py
│ ├── history_spyder_startup.bat
│ ├── main.py
│ ├── realtime_spyder_startup.bat
│ └── realtime_spyder_stopall.bat
├── reset_all_data.sh
└── thirdparty/
├── DISC-FinLLM.md
├── ElegantRL.md
├── FinCast-fts.md
├── FinGPT.md
├── FinGenius.md
├── FinRL-Meta.md
├── FinRL.md
├── FinRobot.md
├── FinceptTerminal.md
├── Kronos.md
├── Lean.md
├── README.md
├── TradingAgents-CN.md
├── TradingAgents.md
├── TrendRadar.md
├── agentic-trading.md
├── awesome-quant.md
├── backtrader.md
├── investor-agent.md
├── panda_quantflow.md
├── qlib.md
└── vnpy.md
================================================
FILE CONTENTS
================================================
================================================
FILE: .deepsource.toml
================================================
version = 1
[[analyzers]]
name = "python"
[analyzers.meta]
runtime_version = "3.x.x"
================================================
FILE: .gitignore
================================================
# Development documentation (local only, not for Git)
devlogs/
conclusions/
researches/
# Python
__pycache__/
*.py[cod]
*$py.class
# Virtual environments
venv/
env/
ENV/
# IDE
.vscode/
.idea/
*.swp
# OS
.DS_Store
node_modules/
**/node_modules/backend/celerybeat-schedule*
backend/.crawl_cache/
backend/celerybeat-schedule
backend/reproduce_sina.py
backend/checkpoints/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2025 Ziran Li
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# FinnewsHunter: Multi-Agent Investment Decision Platform Driven by Financial News
)
}
================================================
FILE: frontend/src/store/useDebateStore.ts
================================================
import { create } from 'zustand'
import { persist } from 'zustand/middleware'
// 聊天消息类型(与 DebateChatRoom 一致)
export type ChatRole = 'user' | 'bull' | 'bear' | 'manager' | 'system' | 'data_collector' | 'search'
export interface ChatMessage {
id: string
role: ChatRole
content: string
timestamp: Date
round?: number
isStreaming?: boolean
mentions?: string[] // 消息中的 @ 提及
searchPlan?: any // 搜索计划
searchStatus?: 'pending' | 'executing' | 'completed' | 'cancelled'
}
// 分析结果(用于保存并行/快速分析模式的结果)
export interface AnalysisResult {
bull?: string
bear?: string
manager?: string
quick?: string
finalDecision?: {
rating?: string
decision?: string
}
executionTime?: number
}
// 辩论会话
export interface DebateSession {
id: string
stockCode: string
stockName: string
messages: ChatMessage[]
mode: string
createdAt: Date
updatedAt: Date
// 新增:并行/快速分析模式的结果
analysisResult?: AnalysisResult
// 新增:会话状态
status?: 'in_progress' | 'completed' | 'interrupted'
}
// 本地存储的会话格式(日期需要序列化)
interface SerializedSession {
id: string
stockCode: string
stockName: string
messages: Array & { timestamp: string }>
mode: string
createdAt: string
updatedAt: string
}
interface DebateStore {
// 当前会话
currentSession: DebateSession | null
// 历史会话列表(按股票代码索引)
sessions: Record
// 操作方法
startSession: (stockCode: string, stockName: string, mode: string) => string
addMessage: (message: ChatMessage) => void
updateMessage: (messageId: string, updates: Partial) => void
clearCurrentSession: () => void
// 批量同步消息(用于辩论完成时一次性同步所有消息)
syncMessages: (messages: ChatMessage[]) => void
// 新增:保存分析结果(用于并行/快速分析模式)
saveAnalysisResult: (result: AnalysisResult) => void
// 新增:更新会话状态
updateSessionStatus: (status: 'in_progress' | 'completed' | 'interrupted') => void
// 新增:恢复会话到页面状态
restoreSession: (sessionId: string) => DebateSession | null
// 新增:获取最近未完成的会话
getLatestInProgressSession: (stockCode: string) => DebateSession | null
// 历史管理
loadSession: (stockCode: string, sessionId?: string) => DebateSession | null
getStockSessions: (stockCode: string) => DebateSession[]
deleteSession: (stockCode: string, sessionId: string) => void
clearStockHistory: (stockCode: string) => Promise
// 同步到后端(可选)
syncToBackend: (stockCode: string) => Promise
loadFromBackend: (stockCode: string) => Promise
}
// 序列化会话(用于持久化)
// Convert a session's Date fields to ISO-8601 strings so it can be persisted.
const serializeSession = (session: DebateSession): SerializedSession => {
  const toIso = (d: Date) => d.toISOString()
  return {
    ...session,
    messages: session.messages.map(msg => ({
      ...msg,
      timestamp: toIso(msg.timestamp)
    })),
    createdAt: toIso(session.createdAt),
    updatedAt: toIso(session.updatedAt)
  }
}
// 反序列化会话(从持久化恢复)
// Revive a persisted session: parse ISO timestamp strings back into Date objects.
const deserializeSession = (session: SerializedSession): DebateSession => {
  const revive = (iso: string) => new Date(iso)
  return {
    ...session,
    messages: session.messages.map(msg => ({
      ...msg,
      timestamp: revive(msg.timestamp)
    })),
    createdAt: revive(session.createdAt),
    updatedAt: revive(session.updatedAt)
  }
}
export const useDebateStore = create()(
persist(
(set, get) => ({
currentSession: null,
sessions: {},
startSession: (stockCode, stockName, mode) => {
const sessionId = `debate-${stockCode}-${Date.now()}`
const newSession: DebateSession = {
id: sessionId,
stockCode,
stockName,
messages: [],
mode,
createdAt: new Date(),
updatedAt: new Date(),
status: 'in_progress'
}
set(state => ({
currentSession: newSession,
sessions: {
...state.sessions,
[stockCode]: [
newSession,
...(state.sessions[stockCode] || []).slice(0, 9) // 最多保留10个历史会话
]
}
}))
return sessionId
},
addMessage: (message) => {
set(state => {
if (!state.currentSession) return state
const updatedSession = {
...state.currentSession,
messages: [...state.currentSession.messages, message],
updatedAt: new Date()
}
// 同时更新 sessions 中的记录
const stockCode = updatedSession.stockCode
const updatedSessions = (state.sessions[stockCode] || []).map(s =>
s.id === updatedSession.id ? updatedSession : s
)
return {
currentSession: updatedSession,
sessions: {
...state.sessions,
[stockCode]: updatedSessions
}
}
})
},
// 批量同步消息(替换当前会话的所有消息)
syncMessages: (messages) => {
set(state => {
if (!state.currentSession) return state
// 优化过滤逻辑:只要有内容就保存,并强制标记为非流式
const validMessages = messages
.filter(m => m.content || m.searchPlan || m.role === 'system')
.map(m => ({
...m,
isStreaming: false // 强制标记为已完成
}))
const updatedSession = {
...state.currentSession,
messages: validMessages,
updatedAt: new Date()
}
const stockCode = updatedSession.stockCode
const updatedSessions = (state.sessions[stockCode] || []).map(s =>
s.id === updatedSession.id ? updatedSession : s
)
return {
currentSession: updatedSession,
sessions: {
...state.sessions,
[stockCode]: updatedSessions
}
}
})
},
updateMessage: (messageId, updates) => {
set(state => {
if (!state.currentSession) return state
const updatedMessages = state.currentSession.messages.map(m =>
m.id === messageId ? { ...m, ...updates } : m
)
const updatedSession = {
...state.currentSession,
messages: updatedMessages,
updatedAt: new Date()
}
const stockCode = updatedSession.stockCode
const updatedSessions = (state.sessions[stockCode] || []).map(s =>
s.id === updatedSession.id ? updatedSession : s
)
return {
currentSession: updatedSession,
sessions: {
...state.sessions,
[stockCode]: updatedSessions
}
}
})
},
clearCurrentSession: () => {
set({ currentSession: null })
},
// 保存分析结果(用于并行/快速分析模式)
saveAnalysisResult: (result) => {
set(state => {
if (!state.currentSession) return state
const updatedSession = {
...state.currentSession,
analysisResult: result,
updatedAt: new Date()
}
const stockCode = updatedSession.stockCode
const updatedSessions = (state.sessions[stockCode] || []).map(s =>
s.id === updatedSession.id ? updatedSession : s
)
return {
currentSession: updatedSession,
sessions: {
...state.sessions,
[stockCode]: updatedSessions
}
}
})
},
// 更新会话状态
updateSessionStatus: (status) => {
set(state => {
if (!state.currentSession) return state
const updatedSession = {
...state.currentSession,
status,
updatedAt: new Date()
}
const stockCode = updatedSession.stockCode
const updatedSessions = (state.sessions[stockCode] || []).map(s =>
s.id === updatedSession.id ? updatedSession : s
)
return {
currentSession: updatedSession,
sessions: {
...state.sessions,
[stockCode]: updatedSessions
}
}
})
},
// 恢复会话
restoreSession: (sessionId) => {
const state = get()
for (const stockCode of Object.keys(state.sessions)) {
const session = state.sessions[stockCode].find(s => s.id === sessionId)
if (session) {
set({ currentSession: session })
return session
}
}
return null
},
// 获取最近未完成的会话
getLatestInProgressSession: (stockCode) => {
const state = get()
const stockSessions = state.sessions[stockCode] || []
return stockSessions.find(s => s.status === 'in_progress') || null
},
loadSession: (stockCode, sessionId) => {
const state = get()
const stockSessions = state.sessions[stockCode] || []
if (sessionId) {
const session = stockSessions.find(s => s.id === sessionId)
if (session) {
set({ currentSession: session })
return session
}
}
// 如果没有指定 sessionId,返回最新的会话
if (stockSessions.length > 0) {
const latestSession = stockSessions[0]
set({ currentSession: latestSession })
return latestSession
}
return null
},
getStockSessions: (stockCode) => {
return get().sessions[stockCode] || []
},
deleteSession: (stockCode, sessionId) => {
set(state => {
const updatedSessions = (state.sessions[stockCode] || []).filter(
s => s.id !== sessionId
)
return {
sessions: {
...state.sessions,
[stockCode]: updatedSessions
},
// 如果删除的是当前会话,清空当前会话
currentSession: state.currentSession?.id === sessionId
? null
: state.currentSession
}
})
},
clearStockHistory: async (stockCode) => {
// 1. 先清除本地 Store
set(state => {
const { [stockCode]: _, ...rest } = state.sessions
return {
sessions: rest,
currentSession: state.currentSession?.stockCode === stockCode
? null
: state.currentSession
}
})
// 2. 同时清除后端数据库中的历史
try {
const response = await fetch(`/api/v1/agents/debate/history/${stockCode}`, {
method: 'DELETE'
})
if (response.ok) {
console.log('✅ 已清除后端历史记录')
} else {
console.error('❌ 清除后端历史失败')
}
} catch (error) {
console.error('❌ 清除后端历史出错:', error)
}
},
// 同步到后端
syncToBackend: async (stockCode) => {
const state = get()
const sessions = state.sessions[stockCode]
console.log('💾 syncToBackend called for:', stockCode)
console.log('💾 Sessions count:', sessions?.length || 0)
if (!sessions || sessions.length === 0) {
console.warn('⚠️ syncToBackend: no sessions to sync')
return
}
// 打印每个会话的消息数量
sessions.forEach((s, i) => {
console.log(`💾 Session ${i}: ${s.id}, messages: ${s.messages.length}`)
console.log(`💾 Session ${i} roles:`, s.messages.map(m => m.role))
})
try {
const serialized = sessions.map(serializeSession)
console.log('💾 Sending to backend:', JSON.stringify(serialized).slice(0, 500) + '...')
const response = await fetch(`/api/v1/agents/debate/history`, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
stock_code: stockCode,
sessions: serialized
})
})
if (!response.ok) {
console.error('Failed to sync debate history to backend')
} else {
console.log('✅ Synced to backend successfully')
}
} catch (error) {
console.error('Error syncing debate history:', error)
}
},
// 从后端加载
loadFromBackend: async (stockCode) => {
console.log('📥 loadFromBackend called for:', stockCode)
try {
const response = await fetch(`/api/v1/agents/debate/history/${stockCode}`)
if (response.ok) {
const data = await response.json()
console.log('📥 Loaded from backend:', data)
if (data.sessions && data.sessions.length > 0) {
const sessions = data.sessions.map(deserializeSession)
console.log('📥 Deserialized sessions:', sessions.length)
sessions.forEach((s: any, i: number) => {
console.log(`📥 Session ${i}: ${s.id}, messages: ${s.messages.length}`)
console.log(`📥 Session ${i} roles:`, s.messages.map((m: any) => m.role))
})
set(state => ({
sessions: {
...state.sessions,
[stockCode]: sessions
}
}))
} else {
console.log('📥 No sessions in response')
}
} else {
console.error('📥 Failed to load:', response.status)
}
} catch (error) {
console.error('Error loading debate history from backend:', error)
}
}
}),
{
name: 'finnews-debate-history',
// 自定义序列化
serialize: (state) => {
const serialized = {
...state,
state: {
...state.state,
currentSession: state.state.currentSession
? serializeSession(state.state.currentSession)
: null,
sessions: Object.fromEntries(
Object.entries(state.state.sessions).map(([k, v]) => [
k,
(v as DebateSession[]).map(serializeSession)
])
)
}
}
return JSON.stringify(serialized)
},
// 自定义反序列化
deserialize: (str) => {
const parsed = JSON.parse(str)
return {
...parsed,
state: {
...parsed.state,
currentSession: parsed.state.currentSession
? deserializeSession(parsed.state.currentSession)
: null,
sessions: Object.fromEntries(
Object.entries(parsed.state.sessions).map(([k, v]) => [
k,
(v as SerializedSession[]).map(deserializeSession)
])
)
}
}
}
}
)
)
================================================
FILE: frontend/src/store/useLanguageStore.ts
================================================
/**
* 全局语言状态管理
*/
import { create } from 'zustand';
import { persist } from 'zustand/middleware';
// Supported UI languages.
export type Lang = 'zh' | 'en';

// Language slice: current language plus its mutation helpers.
interface LanguageState {
  lang: Lang;
  setLang: (lang: Lang) => void;
  toggleLang: () => void;
}

/**
 * Global language store, persisted to localStorage under the key
 * "finnews-language". Defaults to Chinese ('zh').
 */
export const useLanguageStore = create<LanguageState>()(
  persist(
    (set, get) => ({
      lang: 'zh', // default language
      setLang: (lang) => set({ lang }),
      // Flip between Chinese and English.
      toggleLang: () => set({ lang: get().lang === 'zh' ? 'en' : 'zh' }),
    }),
    {
      name: 'finnews-language', // localStorage key
    }
  )
);
// 全局国际化文案
export const globalI18n = {
zh: {
nav: {
home: '首页',
news: '新闻流',
stock: '个股分析',
alphaMining: 'Alpha因子挖掘',
agents: '智能体监控',
tasks: '任务管理',
},
header: {
title: 'FinnewsHunter',
poweredBy: 'Powered by',
},
dashboard: {
title: '仪表盘',
subtitle: '金融新闻智能分析平台 - Powered by AgenticX',
totalNews: '总新闻数',
savedToDb: '已保存到数据库',
totalTasks: '总任务数',
recentCompleted: '最近完成',
units: '个',
crawlRate: '爬取成功率',
liveMonitor: '实时监控',
running: '运行中',
autoInterval: '每1分钟自动爬取',
newsStats: '新闻来源统计',
newsStatsDesc: '各新闻源的内容数量分布',
latestNews: '最新新闻',
latestNewsDesc: '最近爬取的新闻动态',
allSources: '全部来源',
noNews: '暂无新闻数据,请先爬取新闻',
noNewsFrom: '暂无来自该来源的新闻',
},
news: {
  search: '搜索新闻、股票代码...',
  all: '全部',
  pending: '待分析',
  positive: '利好',
  negative: '利空',
  neutral: '中性',
  items: '条新闻',
  source: '来源',
  analyzing: '分析中...',
  reanalyze: '重新分析',
  analyze: '分析',
  analysisFailed: '分析失败',
  crawling: '正在爬取中,请稍候...',
  refreshNow: '立即刷新',
  crawlingProgress: '爬取中...(约2分钟)',
  collapse: '收起',
  expandMore: '展开更多',
  stocks: '只股票',
  noNews: '暂无新闻',
  noNewsFound: '没有找到与',
  relatedNews: '相关的新闻',
  tryOtherKeywords: '试试其他关键词,如股票代码或公司名称',
  pleaseCrawl: '请先爬取新闻',
  selectedItems: '已选择 {count} 项',
  cancelSelection: '取消选择',
  deleteNews: '删除新闻',
  deleteSelected: '删除选中',
  confirmDelete: '确定要删除选中的 {count} 条新闻吗?此操作不可恢复。',
  selectAll: '全选',
  deselectAll: '取消全选',
  analyzeAll: '全部分析',
  reanalyzeAll: '重新分析',
  analyzingSelected: '正在分析选中的 {count} 条新闻...',
  // NOTE: `analysisComplete` was declared twice in this object; the
  // earlier plain entry ('分析完成!') was silently shadowed at runtime,
  // so it has been removed and this batch-format string (the value that
  // actually took effect) is kept.
  analysisComplete: '分析完成!成功 {success} 条,失败 {failed} 条',
},
stock: {
title: '个股智能分析',
subtitle: '输入股票代码或名称,开启 AI 驱动的投资洞察',
searchPlaceholder: '搜索股票代码或名称...',
searching: '搜索中...',
notFound: '未找到匹配的股票',
tryInput: '尝试输入股票代码或名称',
emptyDb: '股票数据库为空',
initTip: '点击下方按钮初始化股票数据',
initBtn: '初始化股票数据',
importing: '正在导入股票数据...',
hotStocks: '热门股票',
kline: 'K线分析',
klineDesc: '多周期行情数据',
aiSentiment: 'AI 情感分析',
aiSentimentDesc: '新闻舆情智能解读',
debate: '多空辩论',
debateDesc: 'Bull vs Bear 对决',
nav: '导航',
select: '选择',
close: '关闭',
},
agents: {
title: '智能体监控台',
subtitle: '实时查看智能体执行状态、性能指标和思考链',
autoRefreshing: '自动刷新中',
refresh: '手动刷新',
clearLogs: '清空日志',
totalExec: '总执行次数',
successExec: '成功执行',
successRate: '成功率',
failedExec: '失败次数',
avgTime: '平均耗时',
availableAgents: '可用智能体',
availableAgentsDesc: '系统中已注册的智能体和工作流',
agents: '智能体',
workflows: '工作流',
active: '活跃',
inactive: '未激活',
execLogs: '执行日志',
execLogsDesc: '实时智能体执行日志和状态追踪',
records: '条记录',
noLogs: '暂无执行日志',
noLogsHint: '执行分析任务或辩论后,日志将在此显示',
execTimes: '执行',
times: '次',
success: '成功',
avg: '平均',
recentActivity: '最近活动',
confirmClearLogs: '确定要清空所有执行日志吗?此操作不可恢复。',
},
tasks: {
title: '任务管理',
subtitle: '爬取任务监控和管理',
task: '任务',
completed: '已完成',
running: '运行中',
pending: '待执行',
failed: '失败',
realtime: '实时',
coldStart: '冷启动',
crawled: '爬取数',
saved: '保存数',
duration: '耗时',
createdAt: '创建时间',
progress: '进度',
noTasks: '暂无任务记录',
loading: '加载中...',
},
common: {
loading: '加载中...',
noData: '暂无数据',
confirm: '确定',
cancel: '取消',
},
time: {
justNow: '刚刚',
minutesAgo: '分钟前',
hoursAgo: '小时前',
daysAgo: '天前',
},
model: {
loading: '加载中...',
notConfigured: '未配置LLM',
selectModel: '选择模型',
selectTip: '选择模型 · 兼顾质量与成本',
noApiKey: '未配置API Key',
current: '当前',
},
debateRoom: {
title: '投资辩论',
titlePlaceholder: '多空辩论室',
subtitle: '多方 vs 空方 · 投资经理主持',
roundPrefix: '第',
roundSuffix: '轮',
typing: '正在输入...',
thinking: '思考中...',
noMessages: '尚无消息',
clickStartDebate: '点击「开始辩论」启动多空对决',
canSpeakDuringDebate: '您也可以在辩论过程中发言提问',
debateInProgress: '辩论进行中,输入 @提及智能体...',
mentionTip: '提示:使用@多方辩手@空方辩手可以指定角色回答',
roundStarted: '轮辩论开始',
debateEnded: '辩论结束,投资经理已做出最终决策',
debateStarted: '辩论开始,数据专员正在准备资料...',
searchPlanConfirm: '搜索计划确认',
searchPlanExecuting: '正在搜索中...',
searchPlanCompleted: '执行完成',
searchPlanCancel: '取消',
searchPlanConfirmBtn: '确认执行',
estimatedTime: '预计耗时',
seconds: '秒',
},
mentionInput: {
placeholder: '输入消息,使用 @ 提及智能体或数据源...',
agents: '智能体',
sources: '数据源',
stocks: '股票',
},
debateHistory: {
history: '历史',
noMessages: '尚无消息',
messages: '条消息',
justNow: '刚刚',
minutesAgo: '分钟前',
hoursAgo: '小时前',
daysAgo: '天前',
today: '今天',
yesterday: '昨天',
thisWeek: '本周',
older: '更早',
expandHistory: '展开历史记录',
continueDebate: '继续辩论',
delete: '删除',
searchPlaceholder: '搜索历史记录...',
noMatchingRecords: '未找到匹配的记录',
noHistoryYet: '暂无历史记录',
tryOtherKeywords: '尝试其他关键词',
historyAutoSave: '开始辩论后会自动保存',
roleNames: {
user: '我',
bull: '多方',
bear: '空方',
manager: '经理',
data_collector: '数据专员',
},
},
stockDetail: {
title: '个股分析 · 智能体驱动的投资决策',
relatedNews: '关联新闻',
analyzed: '已分析',
items: '条',
overallSentiment: '整体情感',
recent7d: '近7天情感',
unknown: '未知',
trend: '趋势',
up: '上升',
down: '下降',
stable: '稳定',
latestNews: '最新新闻',
none: '暂无',
kline: 'K线图 · 真实行情',
dataSource: '数据来源',
supportZoom: '支持缩放拖拽',
close: '收盘',
change: '涨跌',
volume: '成交额',
billion: '亿',
period: '周期',
adjust: '复权',
daily: '日K',
dailyK: '日K',
min60: '60分',
min30: '30分',
min15: '15分',
min5: '5分',
min1: '1分',
qfq: '前复权',
qfqTip: '消除除权缺口,保持走势连续(推荐)',
noAdjust: '不复权',
noAdjustTip: '显示真实交易价格,会有除权缺口',
hfq: '后复权',
hfqTip: '以上市首日为基准,价格可能很高',
recommendLabel: 'Recommend',
timeLabel: '时间',
openLabel: '开',
highLabel: '高',
lowLabel: '低',
closeLabel: '收',
volumeLabel: '量',
parallelAnalysis: '并行分析',
parallelAnalysisDesc: 'Bull/Bear并行分析,投资经理汇总决策',
realtimeDebate: '实时辩论',
realtimeDebateDesc: '四人实时对话,投资经理主持,多空双方交替发言',
quickAnalysis: '快速分析',
quickAnalysisDesc: '单一分析师快速给出建议,适合时间紧迫场景',
result: '结果',
historySessionLoaded: '已加载历史会话',
detectIncompleteSession: '检测到有未完成的',
session: '会话',
messages: '条消息',
restore: '是否恢复',
analysis: '分析',
analysisModeConfig: '分析模式配置',
default: '默认',
parallelExecution: '并行执行',
about2to3min: '约2-3分钟',
realtimeDialogue: '实时对话',
fourAgents: '4位智能体',
about5to10min: '约5-10分钟',
singleAgent: '单智能体',
about1min: '约1分钟',
advancedConfig: '高级配置',
maxExecutionTime: '最大执行时间',
seconds: '秒',
maxDebateRounds: '最大辩论回合数',
rounds: '轮',
managerCanInterrupt: '投资经理可打断辩论',
collectDataBeforeDebate: '辩论前搜集数据',
executionTime: '耗时',
news: '关联新闻',
newsContain: '包含',
newsTotal: '条',
fold: '折叠',
expand: '展开',
clearData: '清除数据',
clearing: '清除中...',
crawlComplete: '爬取完成',
crawlFailed: '爬取失败',
crawling: '爬取中...',
stop: '停止',
updateCrawl: '更新爬取',
targetCrawl: '定向爬取',
noRelatedNews: '暂无关联新闻',
clickCrawl: '点击「定向爬取」获取该股票的相关新闻',
loadMore: '继续扩展',
remaining: '还有',
showAll: '已显示全部',
newsFolded: '新闻已折叠,点击"展开"查看',
sentimentTrend: '新闻情感趋势',
sentimentDesc: '近30天新闻情感分布与平均值',
positive: '利好',
negative: '利空',
neutral: '中性',
avgSentiment: '平均情感',
bullBear: 'Bull vs Bear 智能体辩论',
bullBearDesc: '看多研究员 vs 看空研究员,投资经理综合裁决',
startDebate: '开始辩论',
debating: '辩论中...',
analysisMode: '分析模式',
bullView: '看多观点',
bearView: '看空观点',
managerDecision: '投资经理决策',
waitingAnalysis: '等待分析...',
waitingDecision: '等待多空分析完成后进行决策...',
clickDebate: '点击"开始辩论"启动智能体分析',
debateDesc: '系统将自动调用 Bull/Bear 研究员进行多角度分析,并由投资经理给出综合决策',
backToSearch: '返回搜索',
history: '历史',
copy: '复制',
export: '导出',
regenerate: '重新生成',
stronglyRec: '强烈推荐',
recommend: '推荐',
avoid: '回避',
caution: '谨慎',
strongBull: '强烈利好',
strongBear: '强烈利空',
noKline: '暂无K线数据',
checkCode: '请检查股票代码是否正确',
sessionRestored: '已恢复上次会话',
debateComplete: '辩论分析完成!',
outputting: '输出中...',
deciding: '决策中...',
analysisComplete: '分析完成',
analysisGenerating: '分析生成中...',
decisionGenerating: '决策生成中...',
debateFailed: '辩论分析失败',
sessionDeleted: '已删除会话',
allHistoryCleared: '已清除所有历史记录',
searchCancelled: '已取消搜索任务',
crawlTaskStarted: '定向爬取任务已启动',
crawlingInProgress: '正在爬取中...',
crawlTaskExists: '该股票已有正在进行的爬取任务,正在同步状态...',
crawlTaskStopped: '已停止爬取任务',
crawlTaskStopFailed: '停止任务失败',
newsCleared: '已清除',
newsItems: '条新闻',
clearNewsConfirm: '确定要清除「',
clearNewsConfirmEnd: '」的所有新闻吗?此操作不可恢复!',
stopCrawlConfirm: '确定要停止当前的爬取任务吗?',
knowledgeGraph: '知识图谱 · 智能检索',
knowledgeGraphDesc: '基于多维度关键词并发检索,提升召回率',
nameVariants: '名称变体',
mainBusiness: '主营业务',
relatedConcepts: '关联概念',
concurrentQueries: '并发检索查询',
bullResearcher: '看多研究员',
bearResearcher: '看空研究员',
investmentManager: '投资经理',
generatingSearchPlan: '正在生成搜索计划...',
deleteSessionConfirm: '确定要删除这条记录吗?',
clearAllHistoryConfirm: '确定要清除所有历史记录吗?此操作不可恢复!',
clearAllRecords: '清除所有记录',
crawlSuccess: '定向爬取完成!新增',
unknownError: '未知错误',
taskCreated: '任务已创建,等待执行...',
},
alphaMining: {
training: {
title: 'RL 训练监控',
desc: 'Transformer + REINFORCE 算法实时训练进度',
ready: '就绪',
running: '训练中',
completed: '完成',
error: '错误',
steps: '训练步数',
useSentiment: '使用情感特征',
stop: '停止',
start: '开始训练',
progress: '训练进度',
bestFactor: '当前最优因子',
convergence: '收敛曲线',
trainingFailed: '训练失败',
},
metrics: {
noData: '暂无评估数据',
hint: '请先评估一个因子表达式',
currentFactor: '当前因子',
multiDim: '多维度评估',
riskMetrics: '风险指标',
maxDrawdown: '最大回撤',
safe: '安全',
danger: '危险',
dailyTurnover: '日均换手率',
winRate: '胜率',
totalReturn: '累计收益',
returnsCurve: '收益曲线',
returnsDesc: '策略累计收益 vs 基准',
strategy: '策略',
benchmark: '基准',
metricDesc: '指标说明',
sortinoDesc: 'Sortino: 越高越好,>1优秀',
sharpeDesc: 'Sharpe: 越高越好,>0.5良好',
icDesc: 'IC: 绝对值>0.03有效',
maxDDDesc: 'Max DD: <20%安全',
excellent: '优秀',
good: '良好',
average: '一般',
poor: '较差',
lowTurnover: '低换手',
},
sentiment: {
title: '情感融合效果对比',
desc: '对比纯技术因子 vs 情感增强因子的挖掘效果',
steps: '训练步数',
comparing: '对比中...',
start: '开始对比',
techOnly: '纯技术因子',
techDesc: '个特征(RET, VOL, VOLUME_CHG, TURNOVER)',
enhanced: '情感增强因子',
enhancedDesc: '个特征(+SENTIMENT, NEWS_COUNT)',
bestFactor: '最优因子',
none: '无',
improvement: '改进幅度',
improved: '情感特征提升了因子效果',
degraded: '情感特征降低了因子效果',
scoreDiff: 'Score 差异',
comparison: 'Score 对比',
techOnlyBar: '纯技术',
enhancedBar: '情感增强',
conclusion: '结论:',
conclusionPositive: '情感特征(SENTIMENT, NEWS_COUNT)对因子挖掘有正向贡献,建议在实际应用中开启情感融合功能。',
conclusionNegative: '本次实验中情感特征未能提升效果,可能原因包括:样本量不足、情感数据噪音、训练步数过少等。建议增加训练步数后重试。',
comparingText: '正在进行对比实验...',
comparingHint: '分别训练纯技术因子和情感增强因子,每种',
stepsText: '步',
startHint: '点击"开始对比"运行情感融合实验',
startDesc: '将分别训练纯技术因子和情感增强因子进行效果对比',
comparisonFailed: '对比失败',
},
agent: {
title: 'AgenticX Agent 调用演示',
desc: '展示 Agent 如何调用 AlphaMiningTool 进行因子挖掘',
success: '成功',
failed: '失败',
toolParams: 'Tool 参数',
stockCode: '股票代码(可选)',
stockPlaceholder: '如 SH600519',
steps: '训练步数',
useSentiment: '使用情感特征',
executing: '执行中...',
execute: '执行 Agent 调用',
inputParams: '输入参数',
output: '输出结果',
executionTime: '耗时',
bestFactor: '最优因子',
logs: '执行日志',
codeExample: 'Python 调用示例',
executeFailed: '执行失败',
startHint: '配置参数后点击"执行 Agent 调用"',
startDesc: '将演示 QuantitativeAgent 如何通过 AlphaMiningTool 进行因子挖掘',
miningTask: '为 {code} 挖掘量化因子',
createAgent: '创建 Agent',
registerTool: '注册 Tool',
executeMining: '执行因子挖掘',
},
operators: {
all: '全部',
availableFeatures: '可用特征',
techFeature: '技术特征',
sentimentFeature: '情感特征',
totalOperators: '共 {count} 个操作符',
totalFeatures: '{count} 个特征',
params: '参',
categoryArithmetic: '算术运算',
categoryUnary: '一元运算',
categoryTimeseries: '时序运算',
categoryConditional: '条件运算',
categorySpecial: '特殊运算',
add: '加法',
sub: '减法',
mul: '乘法',
div: '除法(安全)',
neg: '取负',
abs: '绝对值',
sign: '符号函数',
gate: '条件选择',
max: '取最大',
min: '取最小',
delay1: '延迟1期',
delay5: '延迟5期',
delta1: '1期差分',
delta5: '5期差分',
ma5: '5期均线',
ma10: '10期均线',
std5: '5期标准差',
std10: '10期标准差',
jump: '跳跃检测',
jumpExample: '检测>3σ异常值',
decay: '衰减加权',
max3: '3期最大',
},
},
},
en: {
nav: {
home: 'Home',
news: 'News Feed',
stock: 'Stock Analysis',
alphaMining: 'Alpha Mining',
agents: 'Agent Monitor',
tasks: 'Task Manager',
},
header: {
title: 'FinnewsHunter',
poweredBy: 'Powered by',
},
dashboard: {
title: 'Dashboard',
subtitle: 'Financial News AI Analytics Platform - Powered by AgenticX',
totalNews: 'Total News',
savedToDb: 'Saved to database',
totalTasks: 'Total Tasks',
recentCompleted: 'Recently completed',
units: '',
crawlRate: 'Crawl Success Rate',
liveMonitor: 'Live Monitor',
running: 'Running',
autoInterval: 'Auto crawl every minute',
newsStats: 'News Source Stats',
newsStatsDesc: 'Content distribution by news source',
latestNews: 'Latest News',
latestNewsDesc: 'Recently crawled news',
allSources: 'All Sources',
noNews: 'No news data, please crawl news first',
noNewsFrom: 'No news from this source',
},
news: {
  search: 'Search news, stock codes...',
  all: 'All',
  pending: 'Pending',
  positive: 'Positive',
  negative: 'Negative',
  neutral: 'Neutral',
  items: 'items',
  source: 'Source',
  analyzing: 'Analyzing...',
  reanalyze: 'Re-analyze',
  analyze: 'Analyze',
  analysisFailed: 'Analysis failed',
  crawling: 'Crawling in progress, please wait...',
  refreshNow: 'Refresh Now',
  crawlingProgress: 'Crawling... (~2 min)',
  collapse: 'Collapse',
  expandMore: 'Expand More',
  stocks: 'stocks',
  noNews: 'No news',
  noNewsFound: 'No news found for',
  relatedNews: '',
  tryOtherKeywords: 'Try other keywords like stock codes or company names',
  pleaseCrawl: 'Please crawl news first',
  selectedItems: 'Selected {count} items',
  cancelSelection: 'Cancel Selection',
  deleteNews: 'Delete News',
  deleteSelected: 'Delete Selected',
  confirmDelete: 'Are you sure you want to delete {count} selected news? This action cannot be undone.',
  selectAll: 'Select All',
  deselectAll: 'Deselect All',
  analyzeAll: 'Analyze All',
  reanalyzeAll: 'Re-analyze All',
  analyzingSelected: 'Analyzing {count} selected news...',
  // NOTE: `analysisComplete` was declared twice in this object; the
  // earlier plain entry ('Analysis complete!') was silently shadowed at
  // runtime, so it has been removed and this batch-format string (the
  // value that actually took effect) is kept.
  analysisComplete: 'Analysis complete! {success} succeeded, {failed} failed',
},
stock: {
title: 'Stock Intelligence',
subtitle: 'Enter stock code or name for AI-powered investment insights',
searchPlaceholder: 'Search stock code or name...',
searching: 'Searching...',
notFound: 'No matching stocks found',
tryInput: 'Try entering stock code or name',
emptyDb: 'Stock database is empty',
initTip: 'Click below to initialize stock data',
initBtn: 'Initialize Stock Data',
importing: 'Importing stock data...',
hotStocks: 'Popular Stocks',
kline: 'K-Line Analysis',
klineDesc: 'Multi-period market data',
aiSentiment: 'AI Sentiment',
aiSentimentDesc: 'News sentiment analysis',
debate: 'Bull vs Bear',
debateDesc: 'Bull vs Bear debate',
nav: 'Navigate',
select: 'Select',
close: 'Close',
},
agents: {
title: 'Agent Monitor',
subtitle: 'Real-time agent execution status, metrics and reasoning chain',
autoRefreshing: 'Auto-refreshing',
refresh: 'Refresh',
clearLogs: 'Clear Logs',
totalExec: 'Total Executions',
successExec: 'Successful',
successRate: 'Success Rate',
failedExec: 'Failed',
avgTime: 'Avg Time',
availableAgents: 'Available Agents',
availableAgentsDesc: 'Registered agents and workflows',
agents: 'Agents',
workflows: 'Workflows',
active: 'Active',
inactive: 'Inactive',
execLogs: 'Execution Logs',
execLogsDesc: 'Real-time agent execution logs and status',
records: 'records',
noLogs: 'No execution logs',
noLogsHint: 'Logs will appear here after running analysis or debates',
execTimes: 'Executions',
times: '',
success: 'Success',
avg: 'Avg',
recentActivity: 'Recent Activity',
confirmClearLogs: 'Are you sure you want to clear all execution logs? This action cannot be undone.',
},
tasks: {
title: 'Task Manager',
subtitle: 'Crawl task monitoring and management',
task: 'Task',
completed: 'Completed',
running: 'Running',
pending: 'Pending',
failed: 'Failed',
realtime: 'Realtime',
coldStart: 'Cold Start',
crawled: 'Crawled',
saved: 'Saved',
duration: 'Duration',
createdAt: 'Created',
progress: 'Progress',
noTasks: 'No tasks',
loading: 'Loading...',
},
common: {
loading: 'Loading...',
noData: 'No data',
confirm: 'Confirm',
cancel: 'Cancel',
},
time: {
justNow: 'just now',
minutesAgo: ' min ago',
hoursAgo: ' hours ago',
daysAgo: ' days ago',
},
model: {
loading: 'Loading...',
notConfigured: 'LLM not configured',
selectModel: 'Select Model',
selectTip: 'Select Model - Balance quality & cost',
noApiKey: 'API Key not configured',
current: 'Current',
},
debateRoom: {
title: 'Investment Debate',
titlePlaceholder: 'Bull vs Bear Debate Room',
subtitle: 'Bull vs Bear · Investment Manager moderates',
roundPrefix: 'Round',
roundSuffix: '',
typing: 'is typing...',
thinking: 'Thinking...',
noMessages: 'No messages yet',
clickStartDebate: 'Click "Start Debate" to initiate bull-bear confrontation',
canSpeakDuringDebate: 'You can also speak and ask questions during the debate',
debateInProgress: 'Debate in progress, enter @ to mention agents...',
mentionTip: 'Tip: Use @BullDebater @BearDebater to specify a role for replies',
roundStarted: 'round debate started',
debateEnded: 'Debate ended, Investment Manager has made final decision',
debateStarted: 'Debate started, Data Collector is preparing materials...',
searchPlanConfirm: 'Search Plan Confirmation',
searchPlanExecuting: 'Searching...',
searchPlanCompleted: 'Execution completed',
searchPlanCancel: 'Cancel',
searchPlanConfirmBtn: 'Confirm Execution',
estimatedTime: 'Estimated time',
seconds: 's',
},
mentionInput: {
placeholder: 'Enter message, use @ to mention agents or data sources...',
agents: 'Agents',
sources: 'Data Sources',
stocks: 'Stocks',
},
debateHistory: {
history: 'History',
noMessages: 'No messages yet',
messages: 'messages',
justNow: 'just now',
minutesAgo: 'min ago',
hoursAgo: 'hours ago',
daysAgo: 'days ago',
today: 'Today',
yesterday: 'Yesterday',
thisWeek: 'This Week',
older: 'Older',
expandHistory: 'Expand history',
continueDebate: 'Continue debate',
delete: 'Delete',
searchPlaceholder: 'Search history...',
noMatchingRecords: 'No matching records',
noHistoryYet: 'No history yet',
tryOtherKeywords: 'Try other keywords',
historyAutoSave: 'History will be saved after starting debate',
roleNames: {
user: 'Me',
bull: 'Bull',
bear: 'Bear',
manager: 'Manager',
data_collector: 'Data Collector',
},
},
stockDetail: {
title: 'Stock Analysis - Agent-driven Investment Decisions',
relatedNews: 'Related News',
analyzed: 'Analyzed',
items: '',
overallSentiment: 'Overall Sentiment',
recent7d: '7-Day Sentiment',
unknown: 'Unknown',
trend: 'Trend',
up: 'Rising',
down: 'Falling',
stable: 'Stable',
latestNews: 'Latest News',
none: 'None',
kline: 'K-Line Chart - Real Market Data',
dataSource: 'Data source',
supportZoom: 'Supports zoom & drag',
close: 'Close',
change: 'Change',
volume: 'Volume',
billion: 'B',
period: 'Period',
adjust: 'Adjust',
daily: 'Daily',
dailyK: 'Daily',
min60: '60min',
min30: '30min',
min15: '15min',
min5: '5min',
min1: '1min',
qfq: 'Forward Adjusted',
qfqTip: 'Eliminates ex-dividend gaps, maintains continuity (Recommended)',
noAdjust: 'No Adjustment',
noAdjustTip: 'Shows actual trading prices, may have ex-dividend gaps',
hfq: 'Backward Adjusted',
hfqTip: 'Based on IPO date, prices may be very high',
recommendLabel: 'Recommend',
timeLabel: 'Time',
openLabel: 'Open',
highLabel: 'High',
lowLabel: 'Low',
closeLabel: 'Close',
volumeLabel: 'Volume',
parallelAnalysis: 'Parallel Analysis',
parallelAnalysisDesc: 'Bull/Bear parallel analysis, Investment Manager summarizes decision',
realtimeDebate: 'Real-time Debate',
realtimeDebateDesc: 'Four agents real-time dialogue, Investment Manager moderates, Bull/Bear alternate',
quickAnalysis: 'Quick Analysis',
quickAnalysisDesc: 'Single analyst quick recommendation, suitable for time-sensitive scenarios',
result: 'Result',
historySessionLoaded: 'Loaded history session',
detectIncompleteSession: 'Detected incomplete',
session: 'session',
messages: 'messages',
restore: 'Restore?',
analysis: 'Analysis',
analysisModeConfig: 'Analysis Mode Config',
default: 'Default',
parallelExecution: 'Parallel Execution',
about2to3min: '~2-3 min',
realtimeDialogue: 'Real-time Dialogue',
fourAgents: '4 Agents',
about5to10min: '~5-10 min',
singleAgent: 'Single Agent',
about1min: '~1 min',
advancedConfig: 'Advanced Config',
maxExecutionTime: 'Max Execution Time',
seconds: 's',
maxDebateRounds: 'Max Debate Rounds',
rounds: 'rounds',
managerCanInterrupt: 'Manager Can Interrupt',
collectDataBeforeDebate: 'Collect Data Before Debate',
executionTime: 'Time',
news: 'Related News',
newsContain: 'Contains',
newsTotal: '',
fold: 'Collapse',
expand: 'Expand',
clearData: 'Clear Data',
clearing: 'Clearing...',
crawlComplete: 'Crawl Complete',
crawlFailed: 'Crawl Failed',
crawling: 'Crawling...',
stop: 'Stop',
updateCrawl: 'Update Crawl',
targetCrawl: 'Target Crawl',
noRelatedNews: 'No related news',
clickCrawl: 'Click "Target Crawl" to fetch news for this stock',
loadMore: 'Load More',
remaining: '',
showAll: 'Showing all',
newsFolded: 'News collapsed, click "Expand" to view',
sentimentTrend: 'News Sentiment Trend',
sentimentDesc: '30-day sentiment distribution and average',
positive: 'Positive',
negative: 'Negative',
neutral: 'Neutral',
avgSentiment: 'Avg Sentiment',
bullBear: 'Bull vs Bear Agent Debate',
bullBearDesc: 'Bull Researcher vs Bear Researcher, Investment Manager decides',
startDebate: 'Start Debate',
debating: 'Debating...',
analysisMode: 'Analysis Mode',
bullView: 'Bull View',
bearView: 'Bear View',
managerDecision: 'Manager Decision',
waitingAnalysis: 'Waiting for analysis...',
waitingDecision: 'Waiting for bull/bear analysis to complete...',
clickDebate: 'Click "Start Debate" to begin agent analysis',
debateDesc: 'System will call Bull/Bear researchers for multi-angle analysis, with Investment Manager making final decision',
backToSearch: 'Back to Search',
history: 'History',
copy: 'Copy',
export: 'Export',
regenerate: 'Regenerate',
stronglyRec: 'Strongly Recommend',
recommend: 'Recommend',
avoid: 'Avoid',
caution: 'Caution',
strongBull: 'Strong Positive',
strongBear: 'Strong Negative',
noKline: 'No K-line data',
checkCode: 'Please check if the stock code is correct',
sessionRestored: 'Session restored',
debateComplete: 'Debate analysis complete!',
outputting: 'Outputting...',
deciding: 'Deciding...',
analysisComplete: 'Analysis complete',
analysisGenerating: 'Analysis generating...',
decisionGenerating: 'Decision generating...',
debateFailed: 'Debate analysis failed',
sessionDeleted: 'Session deleted',
allHistoryCleared: 'All history cleared',
searchCancelled: 'Search task cancelled',
crawlTaskStarted: 'Targeted crawl task started',
crawlingInProgress: 'Crawling in progress...',
crawlTaskExists: 'This stock already has a crawl task in progress, syncing status...',
crawlTaskStopped: 'Crawl task stopped',
crawlTaskStopFailed: 'Failed to stop task',
newsCleared: 'Cleared',
newsItems: 'news items',
clearNewsConfirm: 'Are you sure you want to clear all news for "',
clearNewsConfirmEnd: '"? This action cannot be undone!',
stopCrawlConfirm: 'Are you sure you want to stop the current crawl task?',
knowledgeGraph: 'Knowledge Graph · Intelligent Retrieval',
knowledgeGraphDesc: 'Concurrent retrieval based on multi-dimensional keywords to improve recall',
nameVariants: 'Name Variants',
mainBusiness: 'Main Business',
relatedConcepts: 'Related Concepts',
concurrentQueries: 'Concurrent Retrieval Queries',
bullResearcher: 'Bull Researcher',
bearResearcher: 'Bear Researcher',
investmentManager: 'Investment Manager',
generatingSearchPlan: 'Generating search plan...',
deleteSessionConfirm: 'Are you sure you want to delete this record?',
clearAllHistoryConfirm: 'Are you sure you want to clear all history? This action cannot be undone!',
clearAllRecords: 'Clear All Records',
crawlSuccess: 'Targeted crawl complete! Added',
unknownError: 'Unknown error',
taskCreated: 'Task created, waiting for execution...',
},
alphaMining: {
training: {
title: 'RL Training Monitor',
desc: 'Transformer + REINFORCE algorithm real-time training progress',
ready: 'Ready',
running: 'Training',
completed: 'Completed',
error: 'Error',
steps: 'Training Steps',
useSentiment: 'Use Sentiment Features',
stop: 'Stop',
start: 'Start Training',
progress: 'Training Progress',
bestFactor: 'Current Best Factor',
convergence: 'Convergence Curve',
trainingFailed: 'Training failed',
},
metrics: {
noData: 'No evaluation data',
hint: 'Please evaluate a factor expression first',
currentFactor: 'Current Factor',
multiDim: 'Multi-dimensional Evaluation',
riskMetrics: 'Risk Metrics',
maxDrawdown: 'Max Drawdown',
safe: 'Safe',
danger: 'Danger',
dailyTurnover: 'Daily Turnover',
winRate: 'Win Rate',
totalReturn: 'Total Return',
returnsCurve: 'Returns Curve',
returnsDesc: 'Strategy cumulative returns vs benchmark',
strategy: 'Strategy',
benchmark: 'Benchmark',
metricDesc: 'Metric Description',
sortinoDesc: 'Sortino: Higher is better, >1 excellent',
sharpeDesc: 'Sharpe: Higher is better, >0.5 good',
icDesc: 'IC: |value|>0.03 effective',
maxDDDesc: 'Max DD: <20% safe',
excellent: 'Excellent',
good: 'Good',
average: 'Average',
poor: 'Poor',
lowTurnover: 'Low Turnover',
},
sentiment: {
title: 'Sentiment Fusion Comparison',
desc: 'Compare pure technical factors vs sentiment-enhanced factors',
steps: 'Training Steps',
comparing: 'Comparing...',
start: 'Start Comparison',
techOnly: 'Pure Technical Factors',
techDesc: ' features (RET, VOL, VOLUME_CHG, TURNOVER)',
enhanced: 'Sentiment-Enhanced Factors',
enhancedDesc: ' features (+SENTIMENT, NEWS_COUNT)',
bestFactor: 'Best Factor',
none: 'None',
improvement: 'Improvement',
improved: 'Sentiment features improved factor performance',
degraded: 'Sentiment features degraded factor performance',
scoreDiff: 'Score Difference',
comparison: 'Score Comparison',
techOnlyBar: 'Technical Only',
enhancedBar: 'With Sentiment',
conclusion: 'Conclusion:',
conclusionPositive: 'Sentiment features (SENTIMENT, NEWS_COUNT) contribute positively to factor mining. It is recommended to enable sentiment fusion in practical applications.',
conclusionNegative: 'In this experiment, sentiment features did not improve performance. Possible reasons include insufficient sample size, sentiment data noise, or too few training steps. It is recommended to increase training steps and retry.',
comparingText: 'Comparison experiment in progress...',
comparingHint: 'Training pure technical factors and sentiment-enhanced factors separately,',
stepsText: ' steps each',
startHint: 'Click "Start Comparison" to run sentiment fusion experiment',
startDesc: 'Will train pure technical factors and sentiment-enhanced factors separately for comparison',
comparisonFailed: 'Comparison failed',
},
agent: {
title: 'AgenticX Agent Call Demo',
desc: 'Demonstrates how Agent calls AlphaMiningTool for factor mining',
success: 'Success',
failed: 'Failed',
toolParams: 'Tool Parameters',
stockCode: 'Stock Code (Optional)',
stockPlaceholder: 'e.g. SH600519',
steps: 'Training Steps',
useSentiment: 'Use Sentiment Features',
executing: 'Executing...',
execute: 'Execute Agent Call',
inputParams: 'Input Parameters',
output: 'Output Result',
executionTime: 'Execution Time',
bestFactor: 'Best Factor',
logs: 'Execution Logs',
codeExample: 'Python Call Example',
executeFailed: 'Execution failed',
startHint: 'Configure parameters and click "Execute Agent Call"',
startDesc: 'Will demonstrate how QuantitativeAgent performs factor mining through AlphaMiningTool',
miningTask: 'Mine quantitative factors for {code}',
createAgent: 'Create Agent',
registerTool: 'Register Tool',
executeMining: 'Execute factor mining',
},
operators: {
all: 'All',
availableFeatures: 'Available Features',
techFeature: 'Technical Feature',
sentimentFeature: 'Sentiment Feature',
totalOperators: '{count} Operators',
totalFeatures: '{count} Features',
params: ' params',
categoryArithmetic: 'Arithmetic',
categoryUnary: 'Unary',
categoryTimeseries: 'Time Series',
categoryConditional: 'Conditional',
categorySpecial: 'Special',
add: 'Addition',
sub: 'Subtraction',
mul: 'Multiplication',
div: 'Division (Safe)',
neg: 'Negate',
abs: 'Absolute Value',
sign: 'Sign Function',
gate: 'Conditional Select',
max: 'Maximum',
min: 'Minimum',
delay1: 'Delay 1 Period',
delay5: 'Delay 5 Periods',
delta1: '1-Period Difference',
delta5: '5-Period Difference',
ma5: '5-Period Moving Average',
ma10: '10-Period Moving Average',
std5: '5-Period Standard Deviation',
std10: '10-Period Standard Deviation',
jump: 'Jump Detection',
jumpExample: 'Detect >3σ outliers',
decay: 'Decay Weighted',
max3: '3-Period Maximum',
},
},
},
};
// Hook: resolve the i18n message table for the currently active language.
export const useGlobalI18n = () => globalI18n[useLanguageStore().lang];
================================================
FILE: frontend/src/store/useNewsStore.ts
================================================
import { create } from 'zustand'
import type { News } from '@/types/api'
// Store contract for the news feed.
interface NewsStore {
  newsList: News[]
  selectedNews: News | null
  setNewsList: (news: News[]) => void
  setSelectedNews: (news: News | null) => void
  // Bug fix: bare `Partial` is invalid TS (the utility type requires a
  // type argument); Partial<News> lets callers patch any subset of fields.
  updateNews: (newsId: number, updates: Partial<News>) => void
}
export const useNewsStore = create((set) => ({
newsList: [],
selectedNews: null,
setNewsList: (news) => set({ newsList: news }),
setSelectedNews: (news) => set({ selectedNews: news }),
updateNews: (newsId, updates) =>
set((state) => ({
newsList: state.newsList.map((news) =>
news.id === newsId ? { ...news, ...updates } : news
),
})),
}))
================================================
FILE: frontend/src/store/useTaskStore.ts
================================================
import { create } from 'zustand'
import type { CrawlTask, TaskStats } from '@/types/api'
// Store contract for crawl-task monitoring.
interface TaskStore {
  tasks: CrawlTask[]
  taskStats: TaskStats | null
  setTasks: (tasks: CrawlTask[]) => void
  setTaskStats: (stats: TaskStats) => void
  addTask: (task: CrawlTask) => void
  // Bug fix: bare `Partial` is invalid TS (requires a type argument);
  // Partial<CrawlTask> lets callers patch any subset of task fields.
  updateTask: (taskId: number, updates: Partial<CrawlTask>) => void
}
export const useTaskStore = create((set) => ({
tasks: [],
taskStats: null,
setTasks: (tasks) => set({ tasks }),
setTaskStats: (stats) => set({ taskStats: stats }),
addTask: (task) =>
set((state) => ({
tasks: [task, ...state.tasks],
})),
updateTask: (taskId, updates) =>
set((state) => ({
tasks: state.tasks.map((task) =>
task.id === taskId ? { ...task, ...updates } : task
),
})),
}))
================================================
FILE: frontend/src/types/api.ts
================================================
/**
 * API type definitions.
 * Kept in sync with the backend API response structures.
 */
// A news article record as returned by the backend news API.
export interface News {
  id: number
  title: string
  content: string
  url: string
  source: string
  publish_time: string | null
  created_at: string
  stock_codes: string[] | null
  sentiment_score: number | null
  author: string | null
  keywords: string[] | null
}
// One agent's analysis result for a single news article.
export interface Analysis {
  id: number
  news_id: number
  agent_name: string
  agent_role: string | null
  analysis_result: string
  summary: string | null
  sentiment: 'positive' | 'negative' | 'neutral' | null
  sentiment_score: number | null
  confidence: number | null
  execution_time: number | null
  created_at: string
}
// A crawl task record with its status, progress and result counters.
export interface CrawlTask {
  id: number
  celery_task_id: string | null
  mode: 'cold_start' | 'realtime' | 'targeted'
  status: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled'
  source: string
  // Bug fix: bare `Record` is invalid TS (requires two type arguments).
  // The config/result shapes are backend-defined, so they stay open.
  config: Record<string, any> | null
  progress: {
    current_page?: number
    total_pages?: number
    percentage?: number
  } | null
  current_page: number | null
  total_pages: number | null
  result: Record<string, any> | null
  crawled_count: number
  saved_count: number
  error_message: string | null
  execution_time: number | null
  created_at: string
  started_at: string | null
  completed_at: string | null
}
// Aggregated task statistics.
export interface TaskStats {
  total: number
  // Bug fix: bare `Record` is invalid TS (requires two type arguments).
  // Keyed by status/mode name; value shape is backend-defined, so it is
  // kept open — narrow to Record<string, number> once confirmed against
  // the API payload.
  by_status: Record<string, any>
  by_mode: Record<string, any>
  recent_completed: number
  total_news_crawled: number
  total_news_saved: number
}
// Request body for launching a crawl over a page range.
export interface CrawlRequest {
  source: string
  start_page: number
  end_page: number
}
// Response of a crawl request with result counters.
export interface CrawlResponse {
  success: boolean
  message: string
  crawled_count: number
  saved_count: number
  source: string
}
// Response of a single-news analysis call; optional fields depend on
// the outcome (`error` is present on failure).
export interface AnalysisResponse {
  success: boolean
  analysis_id?: number
  news_id: number
  sentiment?: string
  sentiment_score?: number
  confidence?: number
  summary?: string
  execution_time?: number
  error?: string
}
// ============ Phase 2: Stock analysis types ============
// Aggregated news/sentiment overview for one stock.
export interface StockOverview {
  code: string
  name: string | null
  total_news: number
  analyzed_news: number
  avg_sentiment: number | null
  recent_sentiment: number | null
  sentiment_trend: 'up' | 'down' | 'stable'
  last_news_time: string | null
}
// A news item in a stock's related-news list.
export interface StockNewsItem {
  id: number
  title: string
  content: string
  url: string
  source: string
  publish_time: string | null
  sentiment_score: number | null
  has_analysis: boolean
}
// One date bucket of aggregated sentiment counts for the trend chart.
export interface SentimentTrendPoint {
  date: string
  avg_sentiment: number
  news_count: number
  positive_count: number
  negative_count: number
  neutral_count: number
}
// One OHLCV candle for the K-line chart.
export interface KLineDataPoint {
  timestamp: number // timestamp in milliseconds
  date: string
  open: number
  high: number
  low: number
  close: number
  volume: number
  turnover?: number // turnover amount
  change_percent?: number // price change percentage
  change_amount?: number // price change amount
  amplitude?: number // amplitude
  turnover_rate?: number // turnover rate
}
export interface RealtimeQuote {
code: string
name: string
price: number
change_percent: number
change_amount: number
volume: number
turnover: number
high: number
low: number
open: number
prev_close: number
}
// ============ Phase 2: agent-debate types ============

/** Request to start a bull/bear agent debate about one stock. */
export interface DebateRequest {
  stock_code: string
  stock_name?: string
  context?: string
  provider?: string
  model?: string
  mode?: 'parallel' | 'realtime_debate' | 'quick_analysis' // debate mode
  language?: 'zh' | 'en' // language the AI should answer in
}

/** One side's analysis produced by a debate agent. */
export interface AgentAnalysis {
  success: boolean
  agent_name: string
  agent_role?: string
  stance: 'bull' | 'bear'
  analysis?: string
  error?: string
  timestamp?: string
}

/** Final verdict issued by the deciding agent. */
export interface FinalDecision {
  success: boolean
  agent_name: string
  agent_role?: string
  decision?: string
  rating?: string
  error?: string
  timestamp?: string
}

/** One recorded step of the debate workflow. */
export interface TrajectoryStep {
  step: string
  timestamp: string
  // FIX: bare `Record` does not compile (Record<K, V> needs type args);
  // the payload schema varies per step, so keep values as `unknown`.
  data: Record<string, unknown>
}

/** Result of the lightweight single-agent analysis mode. */
export interface QuickAnalysisResult {
  success: boolean
  analysis?: string
  timestamp?: string
  error?: string
}

/** One utterance in the debate transcript. */
export interface DebateHistoryItem {
  round: number
  agent: string
  type: string
  content: string
}

/** Full response for a debate request; populated fields depend on `mode`. */
export interface DebateResponse {
  success: boolean
  debate_id?: string
  stock_code: string
  stock_name?: string
  mode?: 'parallel' | 'realtime_debate' | 'quick_analysis'
  bull_analysis?: AgentAnalysis
  bear_analysis?: AgentAnalysis
  final_decision?: FinalDecision
  quick_analysis?: QuickAnalysisResult
  debate_history?: DebateHistoryItem[]
  trajectory?: TrajectoryStep[]
  execution_time?: number
  error?: string
}
// ============ Phase 2: agent-monitoring types ============

/** Single entry in the agent activity log. */
export interface AgentLogEntry {
  id: string
  timestamp: string
  agent_name: string
  agent_role?: string
  action: string
  status: 'started' | 'completed' | 'failed'
  // FIX: bare `Record` does not compile; the details payload is free-form.
  details?: Record<string, unknown>
  execution_time?: number
}

/** Aggregated execution metrics across all agents. */
export interface AgentMetrics {
  total_executions: number
  successful_executions: number
  failed_executions: number
  avg_execution_time: number
  // FIX: bare `Record` does not compile. Keyed by agent name; the per-agent
  // stats schema is not visible here, so values stay `unknown`.
  agent_stats: Record<string, unknown>
  recent_activity: Array<{
    timestamp: string
    agent_name: string
    action: string
    status: string
  }>
}

/** Static description of a registered agent. */
export interface AgentInfo {
  name: string
  role: string
  description: string
  status: 'active' | 'inactive'
}

/** Static description of a multi-agent workflow. */
export interface WorkflowInfo {
  name: string
  description: string
  agents: string[]
  status: 'active' | 'inactive'
}
================================================
FILE: frontend/tailwind.config.js
================================================
/** @type {import('tailwindcss').Config} */
// shadcn/ui-style Tailwind setup: theme colors resolve through CSS custom
// properties (hsl(var(--...))), so palettes are swapped by toggling the
// `dark` class on the document root.
export default {
  darkMode: ["class"],
  // Files scanned for class names during the JIT build.
  content: [
    './pages/**/*.{ts,tsx}',
    './components/**/*.{ts,tsx}',
    './app/**/*.{ts,tsx}',
    './src/**/*.{ts,tsx}',
  ],
  prefix: "",
  theme: {
    container: {
      center: true,
      padding: "2rem",
      screens: {
        "2xl": "1400px",
      },
    },
    extend: {
      // Semantic color tokens backed by CSS variables defined in the app's
      // global stylesheet.
      colors: {
        border: "hsl(var(--border))",
        input: "hsl(var(--input))",
        ring: "hsl(var(--ring))",
        background: "hsl(var(--background))",
        foreground: "hsl(var(--foreground))",
        primary: {
          DEFAULT: "hsl(var(--primary))",
          foreground: "hsl(var(--primary-foreground))",
        },
        secondary: {
          DEFAULT: "hsl(var(--secondary))",
          foreground: "hsl(var(--secondary-foreground))",
        },
        destructive: {
          DEFAULT: "hsl(var(--destructive))",
          foreground: "hsl(var(--destructive-foreground))",
        },
        muted: {
          DEFAULT: "hsl(var(--muted))",
          foreground: "hsl(var(--muted-foreground))",
        },
        accent: {
          DEFAULT: "hsl(var(--accent))",
          foreground: "hsl(var(--accent-foreground))",
        },
        popover: {
          DEFAULT: "hsl(var(--popover))",
          foreground: "hsl(var(--popover-foreground))",
        },
        card: {
          DEFAULT: "hsl(var(--card))",
          foreground: "hsl(var(--card-foreground))",
        },
      },
      // Radii derived from a single --radius variable for consistent corners.
      borderRadius: {
        lg: "var(--radius)",
        md: "calc(var(--radius) - 2px)",
        sm: "calc(var(--radius) - 4px)",
      },
      // Animations used by the Radix accordion primitives.
      keyframes: {
        "accordion-down": {
          from: { height: "0" },
          to: { height: "var(--radix-accordion-content-height)" },
        },
        "accordion-up": {
          from: { height: "var(--radix-accordion-content-height)" },
          to: { height: "0" },
        },
      },
      animation: {
        "accordion-down": "accordion-down 0.2s ease-out",
        "accordion-up": "accordion-up 0.2s ease-out",
      },
    },
  },
  plugins: [require("tailwindcss-animate")],
}
================================================
FILE: frontend/tsconfig.json
================================================
{
  "compilerOptions": {
    /* Language and environment */
    "target": "ES2020",
    "useDefineForClassFields": true,
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,
    /* Bundler mode */
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx",
    /* Linting */
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true,
    /* Path mapping (mirrors the '@' alias in vite.config.ts) */
    "baseUrl": ".",
    "paths": {
      "@/*": ["./src/*"]
    }
  },
  "include": ["src"],
  "references": [{ "path": "./tsconfig.node.json" }]
}
================================================
FILE: frontend/tsconfig.node.json
================================================
{
  /* Separate TS project for files executed by Node (the Vite config). */
  "compilerOptions": {
    "composite": true,
    "skipLibCheck": true,
    "module": "ESNext",
    "moduleResolution": "bundler",
    "allowSyntheticDefaultImports": true
  },
  "include": ["vite.config.ts"]
}
================================================
FILE: frontend/vite.config.ts
================================================
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react-swc'
import path from 'path'

// https://vitejs.dev/config/
export default defineConfig({
  plugins: [react()],
  resolve: {
    alias: {
      // '@' -> ./src, kept in sync with the "paths" entry in tsconfig.json.
      '@': path.resolve(__dirname, './src'),
    },
  },
  server: {
    port: 3000,
    proxy: {
      // Forward /api/* to the backend server during development.
      '/api': {
        target: 'http://localhost:8000',
        changeOrigin: true,
      },
    },
  },
})
================================================
FILE: legacy_v1/.deepsource.toml
================================================
# DeepSource static-analysis configuration.
version = 1
[[analyzers]]
name = "python"
[analyzers.meta]
# Accept any Python 3.x runtime.
runtime_version = "3.x.x"
================================================
FILE: legacy_v1/Chinese_Stop_Words.txt
================================================
ÿ
ǰ
ת
λ
֤ȯ
ο
Υ߱ؾ
£
:
&
*
һһ
~~~~
.
.һ
./
--
ۣ
ۢݣݣ
ۢ٣ģ
P
//
ۢڣ
ۢڣ
}
Ҳ
ۢ٢ޣ
ۢڣ£
ۢ٣
ۢܣ
ۢ٢ۣ
ۣۢ
ۣ
ۢڣ
ۢ٢
ۢݣ
ۢڣ
ۢܣ
ۢڢۣ
ۣۢ
ۢܣ
ۢ٢ݣ
ۢ٢ߣ
ۢ٣
ʣ
ۢ٢
ۢ٢ܣ
ۢ٣
ۢڣ
ۢڢ
ۢڢ٣
ۢ٣ã
ۣۢ
ۣۢ
ۢڢݣ
ۢڢڣ
һ.
ۢ٣
.
ۣ
ۢ٣£
/
ۢ٣
ۣۢ
ۢ٢٣
ۢܣ
ۢܣ
ۣۢ
ۢݣ
ۢ٣
ۢڢ
ۢڢߣ
ۢ٣
ۢڣ
ݣ
://
ۢڢ
ۢݣ
...
...................
ڣأƣɣԣ
ۣۢƣ
ۢ٣
ݡġ䣽
Ȧա
ڣ
ۢۢ٣
ң̣
ۢ٣ţ
ۣݣ
.
ۢڣ
ۢ
ۢڢߣ
ۢڢڣ
ۣۢ
ۢ٣
ۢ٣£
ۢ٣
ۢ٣
ۢ٣
ۢ٢ڣ
ۢڣ
ۢ
ۢ٣
ۢڣ
ۢڢޣ
ۣۢ
ۢڢ
Ԫ
ۢڢ
ۢ٣
::
ۢڣ
ۣۢ
ۢܣ
ۢݣ
ۢޣ
ۢߣ
ۢ
ۢ
?
,
'
?
?
<
>
[
]
(
)
-
+
/
"
;
#
@
գ
sub
exp
sup
sub
Lex
=
ۢݣ
ۢݣ
ۢڣ
ۢڣǣ
ۢ٣
̣
ۣ
......
ʵϰ
ѽ
Ӵ
ȷ
˴
˵
Ȼ
Ω
ֻ
֮
˼
Ӷ
Ļ
ȵ
˵
֮
ǵ
ͽ
µ
λ
ʴ
Ȼ
Ȼ
δ
ο
ʱ
仰˵
֮
ʹ
ʱ
Ȼ
̶
֮
ʹ
֮
˵
˵
˵
ʼ
ɼ
ͬ
һ
˵
˵
ð
ô
ÿ
ÿ
Ī
ij
ij
ijЩ
ı
Ķ
ĸ
Щ
DZ
Ƕ
Ǹ
ǻ
ô
ôЩ
ô
ʱ
Щ
Ը
Ŷ
Ż
ž
ƾ
ƾ
һ
ǡǡ෴
ǰ
ǰ
Ȼ
Ȼ
Ȼ
˼
κ
ƾ
ɶ
ʹ
ô
ʡ
ʱ
ʲô
ʲô
ʹ
ǵ
˭
˭֪
˳
˳
Ƶ
Ȼ
˵
Ȼ
Ȼ
ʹ
ͨ
ͬ
ͬʱ
һ
Ϊ
Ϊ
Ϊ
Ϊʲô
Ϊ
ι
غ
ں
Զ
ѽ
Ҫ
Ҫ
ҪȻ
Ҫ
Ҫô
Ҫ
Ҳ
Ҳ
Ҳ
һ
һ
һ
һ
һ
һ
һ
һ
Ա
Լ
ֻ
Ϊ
Ӵ
ɴ˿ɼ
е
й
Щ
Ǻ
ͬʱ
Խ
˵
ô
ô
ô
զ
˵
ô
ô
ôЩ
ô
ʱ
Щ
֨
֮
֮
֮
֮һ
ֻ
ֻ
ֻҪ
ֻ
λ
Դ
Ը
Ը
Լ
Լ
ܵ
ܵ˵
ܵ˵
֮ܶ
֮
Ȼ
ʹ
Ϊ
ѽ
Ӵ
Ұ
Ű
ʱ
˵
Ȼ
˳
װ
˵
Ͼ
ض
ؽ
û
û
Ȼ
ò
ɿ
ɿ
ܲ
ȻĻ
ʤ
ʱ
ͬ
Ҫ
ֺ
ɵ
ֶ
ô
֪
ֹ
ֹһ
Ե
һ
Ե
˵
˵ú
ȥ
˵
ҹ
ñ
û
˻
ʤ
϶
Ȼ
伫
ȥ
˶
ȥ
ȴ
Ϣ
˵
˺
ε
Ҵ
Ӳ
Ӵ
ӴԺ
ӹŵ
ӹ
ӽԺ
ӿ
ͷ
δ
С
絽
ﵩ
촰˵
Լ
Ը
ָ֮
ڶ
Ȼ
ͥ
ͷ
˵
˶
ĿǰΪֹ
ͷ
ͷ
ȷ
ȵ
Ȼ
Ȼ
ʱ
ǰ
˵
û˵
֮Ȼ
֮
dz
ǵ
ڷ
ͷ
Ȼ
¸
õ
Ͽ
粻
ղ
պ
ߵ
ҹ
ʽ
һ
Ϊ
Ȼ
Ƶ
ʶ
ֲ
߳
ޱ
α
γ
η
ο
ֶΪ
ֹ
ܶ
Ȼ
Ȼ
˵
Ȼ
Ȼ
ͬ
Ϊ
Ҵ
˵
...
֮
֮
֮
ֱ
Ҫ
ϱ
Ϊ
Կ
Ȼ
ʱ
ȥ
Ȼ
Ľ
ľ
Ȼ
ʹ
͵
Ȼ
ٷ
ݳ
ݴ
ʵ
˵
֪
Ϥ
˵
ȥ
ɺ
Ҫ
ü
ϴ
ʵʵ
۴
Ӧ
ʱ
ٵ
һ
·
Ŵ
Ŵ
ʶ
Ȼ
Լ
Ϊ
˵
û
û
ÿ
ÿÿ
ÿʱÿ
Ȼ
Ȼ
Ī
Ī
Ī
Ī
ĬĬ
ĬȻ
ĩ
ѵ
ѵ
ѹ
˵
긴һ
ż
ż
Ʃ
ƫƫ
ƹ
ƽ
ͨ
ʵ
ͷ
ֹ
ǡ
ǡ
ǡǡ
ǡ
ǡ
ǡ
ǧ
ǧ
ǧǧ
в
Ī
̼
֮
ȡ
ȥ
Ȩʱ
ȫ
ȫ
ȫ
ȫȻ
ȫ
Ȼ
Ծ
Ȼ
ոһ
ռ
ս
糣
˵ȵ
ǰ
ͷ
ɪɪ
ɳɳ
ȥ
һ.
һһ
һ
һ
һЩ
һ
һͨ
һ
һ
һʱ
һ
һƬ
һ
һֱ
һ
һ
һת
һ
һ
ȥ
һ
Ȼ
˵
ר
Ҳ˵
˵
ϸ
С
м
ḻ
Ϊ
Ϊʲ
Ϊֹ
Ϊ
Ҫ
֮ǰ
֮
֮
Ҳ˵
Ҳ
˽
ȡ
ƶ
Щ
ʲ
Ϊ
ǰ
Ժ
Թ
ͼ
ΰ
ƺ
ʹ
ʹ
ٽ
Ȼ
Ԫ
Ȳ
Ⱥ
ȫ
ȫ
ȫ
ͬ
֮
ٴ
˵
ֱ
ǰ
ǰ
ǰ
ǿ
ʮ
ȴ
ȴ
ԭ
ּ
ʱ
˫
Ӧ
ӳ
ȡ
ܵ
Ϥ
ֻ
ֻ
ֻ
ֻ
ٿ
ͬһ
ͬ
ʹ
Χ
Ǻ
Ψ
ॵ
ٺ
ô
ʧȥ
õ
ͬ
ʼ
֪
ǵ
ȫ
ȫ
ʵ
ʵ
Ӧ
Դ
Է
Ա
С
Ҫ
Ѿ
Ͱ
㷺
Ӧ
Ӧ
Ӧ
չ
ǿ
ǿ
ǰ
ʱ
γ
ʱ
ó
õ
Ȼ
Ҫ
ܽ
Ω
˼
Ը
Ϊ
ҵ
Ի
ս
ν
/
ȷ
Dz
Ƿ
Ȼ
ͨ
ձ
м
Ч
ʱ
е
е
ĩ##ĩ
˵
ijij
ӭ
ֵ
˵
˴
ʱ
˴
ÿ
ÿ
ÿ
ȼ
Ƚ
ûκ
ע
Ȼ
ر
ص
ִ
ɴ
Ŀǰ
ֱ
ֱ
෴
ͬ
Ӧ
൱
գ
Ӻ
֪
ȷ
ƶ
ͻ
ͻȻ
ڶ
ϰ
̺
ά
ϵ
ܷ
ܹ
Ժ
Դ
Χ
ĪȻ
Ϊ
ж
ʾ
Ҫ
涨
Ʃ
Ϊ
ʶ
˵
˵
˵˵
˭
˭
ת
ת
ת
ﵽ
Ѹ
ȥ
Ҫ
һ
Ӧ
ʵ
ͨ
ѭ
ǰ
ȡ
ش
Ҫ
ֹ
ʱ
ѵ˵
Ҫ
Ƕ
================================================
FILE: legacy_v1/Crawler/__init__.py
================================================
================================================
FILE: legacy_v1/Crawler/crawler_cnstock.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 3 13:41:50 2018
@author: Damon Li
"""
import time, re, requests
from concurrent import futures
from bs4 import BeautifulSoup
from pymongo import MongoClient
import Text_Analysis.text_mining as tm
import gevent
from gevent import monkey,pool
monkey.patch_all()
class WebCrawlFromcnstock(object):
    '''Crawl company news from the cnstock listing pages:
    'http://company.cnstock.com/company/scp_gsxw/1',
    'http://ggjd.cnstock.com/gglist/search/qmtbbdj/1',
    'http://ggjd.cnstock.com/gglist/search/ggkx/1'.

    # Arguments:
        ThreadsNum: Number of threads to start for history crawling.
        dbName: Name of the MongoDB database.
        collectionName: Name of the MongoDB collection.
        IP: MongoDB host address.
        PORT: MongoDB port number.
    '''

    def __init__(self, **kwarg):
        self.ThreadsNum = kwarg['ThreadsNum']
        self.dbName = kwarg['dbName']
        self.colName = kwarg['collectionName']
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        # Minimum Chinese-character ratio a <p> tag must reach to be
        # treated as article body text (relaxed on retries).
        self.Prob = .5
        # URLs already seen during realtime crawling (session-level dedup).
        self.realtimeNewsURL = []
        self.tm = tm.TextMining(IP="localhost", PORT=27017)

    def ConnDB(self):
        '''Connect to MongoDB and cache the target collection.'''
        Conn = MongoClient(self.IP, self.PORT)
        db = Conn[self.dbName]
        self._collection = db.get_collection(self.colName)

    def countchn(self, string):
        '''Count characters in the CJK-ish range and return
        (count, count / total_length).

        # Arguments:
            string: Each part of a crawled page stringified by BeautifulSoup.
        '''
        # NOTE(review): the trailing 'h' in the character class looks
        # accidental (it also counts the letter 'h'); kept as-is for
        # behavior compatibility -- confirm before changing.
        pattern = re.compile(u'[\u1100-\uFFFDh]+?')
        result = pattern.findall(string)
        chnnum = len(result)
        total = len(str(string))
        if total == 0:
            # BUGFIX: avoid ZeroDivisionError on empty input.
            return (0, 0.0)
        possible = chnnum / total
        return (chnnum, possible)

    def getUrlInfo(self, url):
        '''Fetch a news page and extract (date, article_text).'''
        respond = requests.get(url)
        # Let BeautifulSoup sniff the real encoding before re-decoding.
        respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
        bs = BeautifulSoup(respond.text, "lxml")
        span_list = bs.find_all('span')
        part = bs.find_all('p')
        article = ''
        date = ''
        # The release date lives in <span class="timer">.
        for span in span_list:
            if 'class' in span.attrs and span['class'] == ['timer']:
                date = span.text
                break
        # Keep only paragraphs that are mostly Chinese text.
        for paragraph in part:
            chnstatus = self.countchn(str(paragraph))
            possible = chnstatus[1]
            if possible > self.Prob:
                article += str(paragraph)
        # Strip residual '<...>' tag fragments and full-width spaces.
        while article.find('<') != -1 and article.find('>') != -1:
            string = article[article.find('<'):article.find('>')+1]
            article = article.replace(string, '')
        while article.find('\u3000') != -1:
            article = article.replace('\u3000', '')
        article = ' '.join(re.split(' +|\n+', article)).strip()
        return date, article

    def GenPagesLst(self, totalPages, Range, initPageID):
        '''Split page ids [initPageID, totalPages] into (start, end) chunks
        of size Range; the last chunk may be smaller.
        '''
        PageLst = []
        k = initPageID
        while k + Range - 1 <= totalPages:
            PageLst.append((k, k + Range - 1))
            k += Range
        # BUGFIX: the original tail condition (k+Range-1 < totalPages) is
        # always false after the loop, silently dropping the final partial
        # chunk; append the remainder whenever pages are left.
        if k <= totalPages:
            PageLst.append((k, totalPages))
        return PageLst

    def CrawlHistoryCompanyNews(self, startPage, endPage, url_Part_1):
        '''Crawl historical company news for pages [startPage, endPage].

        On a fresh collection every article is stored; otherwise only URLs
        absent from the stored 'Address' column are fetched.
        '''
        self.ConnDB()
        AddressLst = self.extractData(['Address'])[0]
        if AddressLst == []:
            urls = []
            for pageId in range(startPage, endPage + 1):
                urls.append(url_Part_1 + str(pageId))
            for url in urls:
                print(url)
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    if 'href' in a.attrs and 'target' in a.attrs and 'title' in a.attrs \
                            and a['href'].find('http://company.cnstock.com/company/') != -1 \
                            and a.parent.find('span'):
                        date, article = self.getUrlInfo(a['href'])
                        # Relax the Chinese-ratio threshold until body text is
                        # extracted, then restore the default.
                        while article == '' and self.Prob >= .1:
                            # BUGFIX: was '-= .193', inconsistent with every
                            # other retry loop and able to push Prob negative.
                            self.Prob -= .1
                            date, article = self.getUrlInfo(a['href'])
                        self.Prob = .5
                        if article != '':
                            data = {'Date': date,
                                    'Address': a['href'],
                                    'Title': a['title'],
                                    'Article': article}
                            self._collection.insert_one(data)
        else:
            urls = []
            for pageId in range(startPage, endPage + 1):
                urls.append(url_Part_1 + str(pageId))
            for url in urls:
                print(' ', url)
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    if 'href' in a.attrs and 'target' in a.attrs and 'title' in a.attrs \
                            and a['href'].find('http://company.cnstock.com/company/') != -1 \
                            and a.parent.find('span'):
                        # Skip articles already stored in MongoDB.
                        if a['href'] not in AddressLst:
                            date, article = self.getUrlInfo(a['href'])
                            while article == '' and self.Prob >= .1:
                                self.Prob -= .1
                                date, article = self.getUrlInfo(a['href'])
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a['title'],
                                        'Article': article}
                                self._collection.insert_one(data)

    def CrawlRealtimeCompanyNews(self, url_part_lst):
        '''Crawl the first page of each listed section, store articles not
        seen before, and return the new documents as 'title article' strings.
        '''
        doc_lst = []
        self.ConnDB()
        self._AddressLst = self.extractData(['Address'])[0]
        for url_Part in url_part_lst:
            url = url_Part + str(1)
            resp = requests.get(url)
            resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
            bs = BeautifulSoup(resp.text, "lxml")
            a_list = bs.find_all('a')
            if len(self.realtimeNewsURL) == 0:
                # First pass of this session: dedup against MongoDB only.
                for a in a_list:
                    if ('href' in a.attrs and 'target' in a.attrs and 'title' in a.attrs \
                            and a['href'].find('http://company.cnstock.com/company/') != -1 \
                            and a.parent.find('span')) or ('href' in a.attrs and 'target' in a.attrs \
                            and 'title' in a.attrs and a['href'].find('http://ggjd.cnstock.com/company/') != -1 \
                            and a.parent.find('span')):
                        if a['href'] not in self._AddressLst:
                            self.realtimeNewsURL.append(a['href'])
                            date, article = self.getUrlInfo(a['href'])
                            while article == '' and self.Prob >= .1:
                                self.Prob -= .1
                                date, article = self.getUrlInfo(a['href'])
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a['title'],
                                        'Article': article}
                                self._collection.insert_one(data)
                                doc_lst.append(a['title'] + ' ' + article)
                                print(' [' + date + '] ' + a['title'])
            else:
                # Later passes: also dedup against URLs seen this session.
                for a in a_list:
                    if ('href' in a.attrs and 'target' in a.attrs and 'title' in a.attrs \
                            and a['href'].find('http://company.cnstock.com/company/') != -1 \
                            and a.parent.find('span')) or ('href' in a.attrs and 'target' in a.attrs \
                            and 'title' in a.attrs and a['href'].find('http://ggjd.cnstock.com/company/') != -1 \
                            and a.parent.find('span')):
                        if a['href'] not in self.realtimeNewsURL and a['href'] not in self._AddressLst:
                            self.realtimeNewsURL.append(a['href'])
                            date, article = self.getUrlInfo(a['href'])
                            while article == '' and self.Prob >= .1:
                                self.Prob -= .1
                                date, article = self.getUrlInfo(a['href'])
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a['title'],
                                        'Article': article}
                                self._collection.insert_one(data)
                                doc_lst.append(a['title'] + ' ' + article)
                                print(' [' + date + '] ' + a['title'])
        return doc_lst

    def extractData(self, tag_list):
        '''Return, for each tag, the list of distinct values stored in the
        collection. (Replaces the original exec()-based implementation with
        a direct comprehension -- same results, no dynamic code execution.)
        '''
        return [self._collection.distinct(tag) for tag in tag_list]

    def coroutine_run(self, totalPages, Range, initPageID, **kwarg):
        '''Crawl history pages concurrently with gevent coroutines.'''
        jobs = []
        page_ranges_lst = self.GenPagesLst(totalPages, Range, initPageID)
        for page_range in page_ranges_lst:
            jobs.append(gevent.spawn(self.CrawlHistoryCompanyNews, page_range[0], page_range[1], kwarg['url_Part_1']))
        gevent.joinall(jobs)

    def multi_threads_run(self, **kwarg):
        '''Crawl history pages with a thread pool.

        Expects 'totalPages', 'Range', 'initPageID' and 'url_Part_1' in kwarg
        (mirroring coroutine_run).
        '''
        # BUGFIX: GenPagesLst() was called without its required arguments and
        # CrawlHistoryCompanyNews was submitted without url_Part_1, so this
        # method always raised TypeError.
        page_ranges_lst = self.GenPagesLst(kwarg['totalPages'], kwarg['Range'], kwarg['initPageID'])
        print(' Using ' + str(self.ThreadsNum) + ' threads for collecting news ... ')
        with futures.ThreadPoolExecutor(max_workers=self.ThreadsNum) as executor:
            future_to_url = {executor.submit(self.CrawlHistoryCompanyNews, page_range[0], page_range[1], kwarg['url_Part_1']): \
                             ind for ind, page_range in enumerate(page_ranges_lst)}

    def classifyRealtimeStockNews(self):
        '''Crawl and classify fresh news every 60 seconds (runs forever).'''
        while True:
            print(' * start crawling news from CNSTOCK ... ')
            doc_list = self.CrawlRealtimeCompanyNews(['http://company.cnstock.com/company/scp_gsxw/',\
                                                      'http://ggjd.cnstock.com/gglist/search/qmtbbdj/',\
                                                      'http://ggjd.cnstock.com/gglist/search/ggkx/'])
            print(' * finish crawling ... ')
            if len(doc_list) != 0:
                self.tm.classifyRealtimeStockNews(doc_list)
            time.sleep(60)
================================================
FILE: legacy_v1/Crawler/crawler_jrj.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 3 13:41:50 2018
@author: Damon Li
"""
import time, re, requests, datetime
from concurrent import futures
from bs4 import BeautifulSoup
from pymongo import MongoClient
import Text_Analysis.text_mining as tm
from bson.objectid import ObjectId
import gevent
from gevent import monkey,pool
monkey.patch_all()
class WebCrawlFromjrj(object):
    '''Crawl company news from the JRJ daily news index pages
    ('http://stock.jrj.com.cn/xwk/...').
    (The original docstring referenced a sina.com.cn URL, which this class
    never touches.)

    # Arguments:
        startDate: First calendar date (YYYY-MM-DD) to crawl.
        endDate: Last calendar date (YYYY-MM-DD) to crawl.
        Range: Number of dates handled per coroutine/thread.
        ThreadsNum: Number of threads to start for history crawling.
        dbName: Name of the MongoDB database.
        collectionName: Name of the MongoDB collection.
        IP: MongoDB host address.
        PORT: MongoDB port number.
    '''

    def __init__(self, *arg, **kwarg):
        self.startDate = arg[0]
        self.endDate = arg[1]
        self.Range = arg[2]
        self.ThreadsNum = kwarg['ThreadsNum']
        self.dbName = kwarg['dbName']
        self.colName = kwarg['collectionName']
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        # Minimum Chinese-character ratio a <p> tag must reach to be
        # treated as article body text (relaxed on retries).
        self.Prob = .5
        # URLs already seen during realtime crawling (session-level dedup).
        self.realtimeNewsURL = []
        self.tm = tm.TextMining(IP="localhost", PORT=27017)

    def getEveryDay(self, begin_date, end_date):
        '''Return every calendar date between begin_date and end_date
        (inclusive) as 'YYYY-MM-DD' strings.
        '''
        date_list = []
        begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
        end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
        while begin_date <= end_date:
            date_list.append(begin_date.strftime("%Y-%m-%d"))
            begin_date += datetime.timedelta(days=1)
        return date_list

    def countchn(self, string):
        '''Count characters in the CJK-ish range and return
        (count, count / total_length).

        # Arguments:
            string: Each part of a crawled page stringified by BeautifulSoup.
        '''
        # NOTE(review): the trailing 'h' in the character class looks
        # accidental; kept as-is for behavior compatibility.
        pattern = re.compile(u'[\u1100-\uFFFDh]+?')
        result = pattern.findall(string)
        chnnum = len(result)
        total = len(str(string))
        if total == 0:
            # BUGFIX: avoid ZeroDivisionError on empty input.
            return (0, 0.0)
        possible = chnnum / total
        return (chnnum, possible)

    def getUrlInfo(self, url, specificDate):
        '''Fetch one article page and return (date, article, NotFoundPage).

        Falls back to specificDate when no release date is found; sets
        NotFoundPage when the page is a JRJ 404 placeholder.
        (Removed an unused 'meta_list' local from the original.)
        '''
        respond = requests.get(url)
        respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
        bs = BeautifulSoup(respond.text, "lxml")
        span_list = bs.find_all('span')
        part = bs.find_all('p')
        article = ''
        date = ''
        NotFoundPage = False
        # The release date is wrapped in a span whose first child is the
        # marker string 'jrj_final_date_start'.
        # NOTE(review): only the first child of each span is inspected --
        # confirm against the live page structure before changing.
        for span in span_list:
            for child in span.children:
                if child == 'jrj_final_date_start':
                    date = span.text.replace('\r', '').replace('\n', '')
                    if date.find('年') != -1:
                        date = date.replace('年', '-').replace('月', '-').replace('日', '')
                    break
                break
        if date == '':
            date = specificDate
        # JRJ serves a soft-404 page containing this marker text.
        for p in part:
            if p.text.find('页面没有找到') != -1:
                NotFoundPage = True
                break
        if not NotFoundPage:
            # Keep only paragraphs that are mostly Chinese text, then strip
            # residual '<...>' fragments and full-width spaces.
            for paragraph in part:
                chnstatus = self.countchn(str(paragraph))
                possible = chnstatus[1]
                if possible > self.Prob:
                    article += str(paragraph)
            while article.find('<') != -1 and article.find('>') != -1:
                string = article[article.find('<'):article.find('>')+1]
                article = article.replace(string, '')
            while article.find('\u3000') != -1:
                article = article.replace('\u3000', '')
            article = ' '.join(re.split(' +|\n+', article)).strip()
        return date, article, NotFoundPage

    def GenDatesLst(self):
        '''Split the [startDate, endDate] date list into sublists of at most
        Range dates each.
        '''
        DatesLst = self.getEveryDay(self.startDate, self.endDate)
        NewDatesLst = []
        k = 0
        while k < len(DatesLst):
            if k + self.Range >= len(DatesLst):
                break
            else:
                NewDatesLst.append(DatesLst[k:k+self.Range])
                k += self.Range
        # The remainder (always non-empty here) becomes the last sublist.
        NewDatesLst.append(DatesLst[k:])
        return NewDatesLst

    def findPagesOfSpecificDate(self, firstUrl, date):
        '''Return the number of index pages JRJ lists for a given date.

        # Arguments:
            firstUrl: The first index page of the date.
            date: Designated date ('YYYY-MM-DD').
        '''
        respond = requests.get(firstUrl)
        respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
        bs = BeautifulSoup(respond.text, "lxml")
        a_list = bs.find_all('a')
        Nums = 1
        # Pagination links embed the compact date followed by '_<page>'.
        for a in a_list:
            if 'href' in a.attrs and 'target' in a.attrs:
                if a['href'].find(date.replace('-', '') + '_') != -1 and a.text.isdigit():
                    Nums += 1
        return Nums

    def CrawlRealtimeCompanyNews(self, today_Date):
        '''Crawl today's index pages, store articles not seen before, and
        return the new documents as 'title article' strings.
        '''
        doc_lst = []
        if len(self.realtimeNewsURL) == 0:
            # First pass of this session: connect and load known addresses.
            self.ConnDB()
            self._AddressLst = self.extractData(['Address'])[0]
            urlsAndDates = []
            url_Part_1 = 'http://stock.jrj.com.cn/xwk/'
            url_Part_2 = '_1.shtml'
            firstUrl = url_Part_1 + today_Date.replace('-','')[0:6] + '/' + today_Date.replace('-','') + url_Part_2
            Nums = self.findPagesOfSpecificDate(firstUrl, today_Date)
            for num in range(1, Nums+1):
                urlsAndDates.append((url_Part_1 + today_Date.replace('-','')[0:6] + '/' + today_Date.replace('-','') \
                                     + '_' + str(num) + '.shtml', today_Date))
            for url, specificDate in urlsAndDates:
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    # Article links contain '/<yyyy>/<mm>/' for the date.
                    if 'href' in a.attrs and a.string and \
                            a['href'].find('/' + specificDate.replace('-','')[0:4] + '/' + specificDate.replace('-','')[4:6] + '/') != -1:
                        if a['href'] not in self._AddressLst:
                            self.realtimeNewsURL.append(a['href'])
                            date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            while article == '' and self.Prob >= .1 and not NotFoundPage:
                                self.Prob -= .1
                                date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a.string,
                                        'Article': article}
                                self._collection.insert_one(data)
                                doc_lst.append(a.string + ' ' + article)
                                print(' [' + date + '] ' + a.string)
        else:
            # Later passes: also dedup against URLs seen this session.
            urlsAndDates = []
            url_Part_1 = 'http://stock.jrj.com.cn/xwk/'
            url_Part_2 = '_1.shtml'
            firstUrl = url_Part_1 + today_Date.replace('-','')[0:6] + '/' + today_Date.replace('-','') + url_Part_2
            Nums = self.findPagesOfSpecificDate(firstUrl, today_Date)
            for num in range(1, Nums+1):
                urlsAndDates.append((url_Part_1 + today_Date.replace('-','')[0:6] + '/' + today_Date.replace('-','') \
                                     + '_' + str(num) + '.shtml', today_Date))
            for url, specificDate in urlsAndDates:
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    if 'href' in a.attrs and a.string and \
                            a['href'].find('/' + specificDate.replace('-','')[0:4] + '/' + specificDate.replace('-','')[4:6] + '/') != -1:
                        if a['href'] not in self._AddressLst and a['href'] not in self.realtimeNewsURL:
                            self.realtimeNewsURL.append(a['href'])
                            date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            while article == '' and self.Prob >= .1 and not NotFoundPage:
                                self.Prob -= .1
                                date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a.string,
                                        'Article': article}
                                self._collection.insert_one(data)
                                doc_lst.append(a.string + ' ' + article)
                                print(' [' + date + '] ' + a.string)
        return doc_lst

    def CrawlHistoryCompanyNews(self, datelst):
        '''Crawl historical company news for each date in datelst.

        On a fresh collection every article is stored; otherwise only URLs
        absent from the stored 'Address' column are fetched.
        '''
        self.ConnDB()
        AddressLst = self.extractData(['Address'])[0]
        if AddressLst == []:
            urlsAndDates = []
            url_Part_1 = 'http://stock.jrj.com.cn/xwk/'
            url_Part_2 = '_1.shtml'
            for date in datelst:
                firstUrl = url_Part_1 + date.replace('-','')[0:6] + '/' + date.replace('-','') + url_Part_2
                Nums = self.findPagesOfSpecificDate(firstUrl, date)
                for num in range(1, Nums+1):
                    urlsAndDates.append((url_Part_1 + date.replace('-','')[0:6] + '/' + date.replace('-','') \
                                         + '_' + str(num) + '.shtml', date))
            for url, specificDate in urlsAndDates:
                print(url)
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    if 'href' in a.attrs and a.string and \
                            a['href'].find('/' + specificDate.replace('-','')[0:4] + '/' + specificDate.replace('-','')[4:6] + '/') != -1:
                        date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                        while article == '' and self.Prob >= .1 and not NotFoundPage:
                            self.Prob -= .1
                            date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                        self.Prob = .5
                        if article != '':
                            data = {'Date': date,
                                    'Address': a['href'],
                                    'Title': a.string,
                                    'Article': article}
                            self._collection.insert_one(data)
        else:
            urlsAndDates = []
            url_Part_1 = 'http://stock.jrj.com.cn/xwk/'
            url_Part_2 = '_1.shtml'
            for date in datelst:
                firstUrl = url_Part_1 + date.replace('-','')[0:6] + '/' + date.replace('-','') + url_Part_2
                Nums = self.findPagesOfSpecificDate(firstUrl, date)
                for num in range(1, Nums+1):
                    urlsAndDates.append((url_Part_1 + date.replace('-','')[0:6] + '/' + date.replace('-','') \
                                         + '_' + str(num) + '.shtml', date))
            for url, specificDate in urlsAndDates:
                print(' ', url)
                resp = requests.get(url)
                resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
                bs = BeautifulSoup(resp.text, "lxml")
                a_list = bs.find_all('a')
                for a in a_list:
                    if 'href' in a.attrs and a.string and \
                            a['href'].find('/' + specificDate.replace('-','')[0:4] + '/' + specificDate.replace('-','')[4:6] + '/') != -1:
                        if a['href'] not in AddressLst:
                            date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            while article == '' and self.Prob >= .1 and not NotFoundPage:
                                self.Prob -= .1
                                date, article, NotFoundPage = self.getUrlInfo(a['href'], specificDate)
                            self.Prob = .5
                            if article != '':
                                data = {'Date': date,
                                        'Address': a['href'],
                                        'Title': a.string,
                                        'Article': article}
                                self._collection.insert_one(data)

    def ConnDB(self):
        '''Connect to MongoDB and cache the target collection.'''
        Conn = MongoClient(self.IP, self.PORT)
        db = Conn[self.dbName]
        self._collection = db.get_collection(self.colName)

    def extractData(self, tag_list):
        '''Return, for each tag, the list of distinct values stored in the
        collection. (Replaces the original exec()-based implementation with
        a direct comprehension -- same results, no dynamic code execution.)
        '''
        return [self._collection.distinct(tag) for tag in tag_list]

    def StockCodeDuplicateRemoval(self):
        '''Discarded. One-off cleanup of duplicate stock codes.

        NOTE(review): uses the deprecated pymongo Collection.update(); kept
        verbatim since the method is explicitly discarded.
        '''
        Conn = MongoClient(self.IP, self.PORT)
        db = Conn[self.dbName]
        collection = db.get_collection(self.colName)
        idLst = collection.distinct('_id')
        relevantStockSeries = []
        for _id in idLst:
            data = collection.find_one({'_id': ObjectId(_id)})
            if 'relevantStock' in data.keys():
                relevantStock = collection.find_one({'_id': ObjectId(_id)})['relevantStock']
                if len(relevantStock) > 1:
                    relevantStockCodeDuplicateRemoval = list(set(relevantStock))
                    collection.update({"_id": _id}, {"$set": {"relevantStock": ' '.join(relevantStockCodeDuplicateRemoval)}})
                    print(relevantStockCodeDuplicateRemoval)
                    break
                if len(relevantStock) == 1:
                    print(relevantStock)
                    print(len(relevantStock))
                    break
        print('Duplicate Removal successfully ... ')

    def coroutine_run(self):
        '''Crawl history dates concurrently with gevent coroutines.'''
        jobs = []
        dateLst = self.GenDatesLst()
        for datelst in dateLst:
            jobs.append(gevent.spawn(self.CrawlHistoryCompanyNews, datelst))
        gevent.joinall(jobs)

    def multi_threads_run(self, **kwarg):
        '''Crawl history dates with a thread pool.'''
        dateLst = self.GenDatesLst()
        print(' Using ' + str(self.ThreadsNum) + ' threads for collecting news ... ')
        with futures.ThreadPoolExecutor(max_workers=self.ThreadsNum) as executor:
            future_to_url = {executor.submit(self.CrawlHistoryCompanyNews, datelst): \
                             ind for ind, datelst in enumerate(dateLst)}

    def classifyRealtimeStockNews(self):
        '''Crawl and classify fresh news every 60 seconds (runs forever).'''
        today_Date = datetime.datetime.now().strftime('%Y-%m-%d')
        while True:
            print(' * start crawling news from JRJ ... ')
            doc_list = self.CrawlRealtimeCompanyNews(today_Date)
            print(' * finish crawling ... ')
            if len(doc_list) != 0:
                self.tm.classifyRealtimeStockNews(doc_list)
            time.sleep(60)
================================================
FILE: legacy_v1/Crawler/crawler_nbd.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 23 17:19:50 2018
@author: Damon Li
"""
import re, os, time, requests
from bs4 import BeautifulSoup
import pymongo, threading, traceback
import gevent
from gevent import monkey,pool
monkey.patch_all()
class WebCrawlFromNBD(object):
'''Crawl company news from 'http://stocks.nbd.com.cn/columns/275' website.
# Arguments:
totalPages: Number of pages set to be crawled.
Range: Divide total web pages into totalPages/Range parts
for multi-threading processing.
ThreadsNum: Number of threads needed to be start.
dbName: Name of database.
colName: Name of collection.
IP: Local IP address.
PORT: Port number corresponding to IP address.
'''
def __init__(self,*arg,**kwarg):
self.totalPages = arg[0] #totalPages
self.Range = arg[1] #Range
self.ThreadsNum = kwarg['ThreadsNum']
self.dbName = kwarg['dbName']
self.colName = kwarg['collectionName']
self.IP = kwarg['IP']
self.PORT = kwarg['PORT']
self.url_lst_withoutArticles = []
self.title_lst_withoutArticles = []
self.url_lst_withoutNews = []
self.CrawledUrlsID = []
self.filePath = os.path.dirname(os.path.realpath(__file__))
def countchn(self,string):
'''Count Chinese numbers and calculate the frequency of Chinese occurrence.
# Arguments:
string: Each part of crawled website analyzed by BeautifulSoup.
'''
pattern = re.compile(u'[\u1100-\uFFFDh]+?')
result = pattern.findall(string)
chnnum = len(result)
possible = chnnum/len(str(string))
return (chnnum, possible)
def getUrlInfo(self,url):
    '''Fetch one article page and extract its body text and publish time.

    Returns (article, date) -- note the reversed order compared with the
    other crawlers in this package.
    '''
    respond = requests.get(url)
    # Let BeautifulSoup sniff the real encoding before re-decoding.
    respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
    bs = BeautifulSoup(respond.text, "lxml")
    span_list = bs.find_all('span')
    part = bs.find_all('p')
    article = ''
    date = ''
    # The publish time lives in <span class="time">; its tokens are split
    # into a date part (contains '-') and a clock part (contains ':').
    for span in span_list:
        if 'class' in span.attrs and span.text and span['class'] == ['time']:
            string = span.text.split()
            for dt in string:
                if dt.find('-') != -1:
                    date += dt + ' '
                elif dt.find(':') != -1:
                    date += dt
            break
    # Keep only paragraphs whose Chinese-character ratio exceeds 0.5
    # (hard-coded here, unlike the self.Prob threshold used elsewhere).
    for paragraph in part:
        chnstatus = self.countchn(str(paragraph))
        possible = chnstatus[1]
        if possible > 0.5:
            article += str(paragraph)
    # Strip residual '<...>' tag fragments and full-width spaces.
    while article.find('<') != -1 and article.find('>') != -1:
        string = article[article.find('<'):article.find('>')+1]
        article = article.replace(string,'')
    while article.find('\u3000') != -1:
        article = article.replace('\u3000','')
    article = ' '.join(re.split(' +|\n+', article)).strip()
    return article, date
def GenPagesLst(self):
'''Generate page number list using Range parameter.
'''
PageLst = []
k = 1
while k+self.Range-1 <= self.totalPages:
PageLst.append((k,k+self.Range-1))
k += self.Range
if k+self.Range-1 < self.totalPages:
PageLst.append((k,self.totalPages))
return PageLst
def ReCrawlNews(self,url_list):
    '''Retry crawling index pages that previously returned no links.

    Loops until url_list is drained; after 10 consecutive attempts on the
    same URL it sleeps 1s before retrying. Returns the accumulated lists
    of article URLs/titles that still lack a body.

    # Arguments:
        url_list: List of web pages that returned no values.
    '''
    try:
        nums = 1
        # Attempt history; repeated tail entries mean the same URL is
        # failing back-to-back.
        ulst = []
        while url_list != []:
            ulst.append(url_list[0])
            print(' ', url_list[0])
            if nums > 10:
                print(' wait 1s before request url again ...')
                time.sleep(1)
                nums = 1
            resp = requests.get(url_list[0])
            resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
            bs = BeautifulSoup(resp.text, "lxml")
            a_list = bs.find_all('a')
            if a_list != []:
                for a in a_list:
                    # Article anchors carry a click-statistic attribute and
                    # link into nbd.com.cn/articles/.
                    if 'click-statistic' in a.attrs and a.string \
                            and a['click-statistic'].find('Article_') != -1 \
                            and a['href'].find('http://www.nbd.com.cn/articles/') != -1:
                        article, date = self.getUrlInfo(a['href'])
                        if date == '' or article == '':
                            # Body extraction failed: queue for ReCrawlArticles.
                            self.url_lst_withoutArticles.append(a['href'])
                            self.title_lst_withoutArticles.append(a.string)
                        elif date != '' and article != '':
                            data = {'date' : date,
                                    'address' : a['href'],
                                    'title' : a.string,
                                    'Article' : article}
                            self.collection.insert_one(data)
                # NOTE(review): indentation reconstructed -- the page is
                # marked done and dequeued only after its links were parsed;
                # confirm against the original module.
                self.CrawledUrlsID.append(int(url_list[0].split('/')[-1]))
                url_list.remove(url_list[0])
            if len(ulst) >= 2 and ulst[-1] == ulst[-2]:
                nums += 1
        return self.url_lst_withoutArticles, self.title_lst_withoutArticles
    except Exception:
        traceback.print_exc()
def ReCrawlArticles(self,url_list,title_list):
    '''Retry individual article URLs whose main body was not extracted.

    # Arguments:
        url_list: List of urls without getting any articles(main body).
            Entries are removed in place as extraction succeeds.
        title_list: Titles parallel to url_list; removed in lock-step.

    Returns:
        None. Successful articles are inserted into self.collection.

    NOTE(review): a URL that never yields an article is retried
    indefinitely, throttled to a 1 s sleep every ~10 attempts —
    confirm this busy-retry is intended.
    '''
    nums = 1      # consecutive attempts of the same URL (drives back-off)
    ulst = []     # history of attempted URLs, used to detect repeats
    while url_list != []:
        ulst.append(url_list[0])
        print(' ', url_list[0])
        # Back off for one second after ~10 consecutive retries.
        if nums > 10:
            print(' wait 1s before request url again ...')
            time.sleep(1)
            nums = 1
        article, date = self.getUrlInfo(url_list[0])
        if date != '' and article != '':
            data = {'date' : date,
                    'address' : url_list[0],
                    'title' : title_list[0],
                    'Article' : article}
            print(' remove ' + url_list[0] + ' successfully ... ')
            # Drop the head of both parallel lists, then persist.
            url_list.remove(url_list[0])
            title_list.remove(title_list[0])
            self.collection.insert_one(data)
        # Same URL attempted twice in a row -> count towards back-off.
        if len(ulst) >= 2 and ulst[-1] == ulst[-2]:
            nums += 1
def CrawlCompanyNews(self,startPage,endPage):
    '''Crawl historical company news listing pages [startPage, endPage].

    Article URLs already stored in MongoDB are skipped. Pages that
    return no anchors are queued on self.url_lst_withoutNews; articles
    whose body/date could not be extracted are queued on
    self.url_lst_withoutArticles / self.title_lst_withoutArticles.

    # Arguments:
        startPage: first listing-page number (inclusive).
        endPage: last listing-page number (inclusive).
    '''
    self.ConnDB()
    AddressLst = self.extractData(['address'])[0]
    # Refactor: the original duplicated the entire crawl loop for the
    # "empty DB" and "non-empty DB" cases; with an empty AddressLst the
    # membership test below is a no-op, so one loop covers both.
    url_Part = 'http://stocks.nbd.com.cn/columns/275/page/'
    urls = [url_Part + str(pageId) for pageId in range(startPage, endPage + 1)]
    for url in urls:
        if AddressLst == []:
            print(url)
        else:
            print(' ', url)
        resp = requests.get(url)
        # Re-decode with the encoding BeautifulSoup detected from raw bytes.
        resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
        bs = BeautifulSoup(resp.text, "lxml")
        a_list = bs.find_all('a')
        if a_list == []:
            # Listing page came back empty; remember it for ReCrawlNews.
            self.url_lst_withoutNews.append(url)
            continue
        for a in a_list:
            # Keep only anchors that are nbd.com.cn article links.
            if 'click-statistic' in a.attrs and a.string \
                and a['click-statistic'].find('Article_') != -1 \
                and a['href'].find('http://www.nbd.com.cn/articles/') != -1:
                if a['href'] in AddressLst:
                    continue  # already stored in MongoDB
                article, date = self.getUrlInfo(a['href'])
                if date == '' or article == '':
                    # Extraction failed; queue for ReCrawlArticles.
                    self.url_lst_withoutArticles.append(a['href'])
                    self.title_lst_withoutArticles.append(a.string)
                else:
                    data = {'date' : date,
                            'address' : a['href'],
                            'title' : a.string,
                            'Article' : article}
                    self.collection.insert_one(data)
        self.CrawledUrlsID.append(int(url.split('/')[-1]))
def ConnDB(self):
    '''Open a MongoDB client and bind the target collection to self.collection.'''
    mongo_client = pymongo.MongoClient(self.IP, self.PORT)
    database = mongo_client[self.dbName]
    self.collection = database.get_collection(self.colName)
def extractData(self,tag_list):
    '''Return the distinct values of each named field in self.collection.

    # Arguments:
        tag_list: iterable of MongoDB field names.

    Returns:
        A list with one entry per tag: the list of distinct values of
        that field, in the same order as tag_list.
    '''
    # The original built local variable names with exec(), which is both
    # unsafe (the tag string is interpolated into source code) and
    # unnecessary -- a direct call per tag is equivalent.
    return [self.collection.distinct(tag) for tag in tag_list]
def single_run(self):
    '''Crawl every generated page range sequentially in the current thread.

    Returns the list of listing pages that produced no anchors.
    '''
    for start_page, end_page in self.GenPagesLst():
        self.CrawlCompanyNews(start_page, end_page)
    return self.url_lst_withoutNews
def multi_threads_run(self):
    '''Crawl page ranges concurrently, spawning one thread per range.

    Returns the list of listing pages that produced no anchors.
    '''
    workers = [
        threading.Thread(target=self.CrawlCompanyNews, args=page_range)
        for page_range in self.GenPagesLst()
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    return self.url_lst_withoutNews
def coroutine_run(self):
    '''Crawl page ranges concurrently using gevent greenlets.

    Returns the list of listing pages that produced no anchors.
    '''
    greenlets = [
        gevent.spawn(self.CrawlCompanyNews, page_range[0], page_range[1])
        for page_range in self.GenPagesLst()
    ]
    gevent.joinall(greenlets)
    return self.url_lst_withoutNews
================================================
FILE: legacy_v1/Crawler/crawler_sina.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 22 10:01:40 2018
@author: Damon Li
"""
import time, re, requests
from concurrent import futures
from bs4 import BeautifulSoup
from pymongo import MongoClient
import Text_Analysis.text_mining as tm
import gevent
from gevent import monkey,pool
monkey.patch_all()
class WebCrawlFromSina(object):
    '''Crawl company news from 'http://roll.finance.sina.com.cn/finance/zq1/ssgs/index.shtml' website.

    # Arguments:
        totalPages: Number of pages set to be crawled(int type).
        Range: Divide total web pages into totalPages/Range parts
               for multi-threading processing(int type).
        ThreadsNum: Number of threads needed to be start(int type).
        dbName: Name of database(string type).
        colName: Name of collection(string type).
        IP: Local IP address(string type).
        PORT: Port number corresponding to IP address(int type).
    '''
    def __init__(self,*arg,**kwarg):
        self.totalPages = arg[0]
        self.Range = arg[1]
        self.ThreadsNum = kwarg['ThreadsNum']
        self.dbName = kwarg['dbName']
        self.colName = kwarg['collectionName']
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        # Bug fix: this attribute was misspelled 'Porb' while every method
        # reads and writes 'self.Prob', so the threshold-lowering retry
        # loops raised AttributeError at runtime.
        self.Prob = .5  # min Chinese-character ratio for a <p> to count as body text
        self.realtimeNewsURL = []
        self.tm = tm.TextMining(IP="localhost",PORT=27017)

    def countchn(self,string):
        '''Count Chinese characters and their frequency within a string.

        # Arguments:
            string: Each part of crawled website analyzed by BeautifulSoup.

        Returns:
            (chnnum, possible): number of matched characters and their
            ratio to the total string length.
        '''
        pattern = re.compile(u'[\u1100-\uFFFDh]+?')
        result = pattern.findall(string)
        chnnum = len(result)
        possible = chnnum/len(str(string))
        return (chnnum, possible)

    def getUrlInfo(self,url):
        '''Fetch one article page and extract its useful information.

        Returns:
            (summary, keyWords, date, stockCodeLst, article); any part
            that could not be found is an empty string.
        '''
        respond = requests.get(url)
        # Re-decode with the encoding BeautifulSoup detected from raw bytes.
        respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
        bs = BeautifulSoup(respond.text, "lxml")
        meta_list = bs.find_all('meta')
        span_list = bs.find_all('span')
        part = bs.find_all('p')
        article = ''
        date = ''
        summary = ''
        keyWords = ''
        stockCodeLst = ''
        # Summary and keywords come from the page's <meta> tags.
        for meta in meta_list:
            if 'name' in meta.attrs and meta['name'] == 'description':
                summary = meta['content']
            elif 'name' in meta.attrs and meta['name'] == 'keywords':
                keyWords = meta['content']
            if summary != '' and keyWords != '':
                break
        # Publication date appears either in a date/time-source <span>
        # or in the span with id 'pub_date'; normalize 年/月/日 to '-'.
        for span in span_list:
            if 'class' in span.attrs:
                if span['class'] == ['date'] or span['class'] == ['time-source']:
                    string = span.text.split()
                    for dt in string:
                        if dt.find('年') != -1:
                            date += dt.replace('年','-').replace('月','-').replace('日',' ')
                        elif dt.find(':') != -1:
                            date += dt
                    break
            if 'id' in span.attrs and span['id'] == 'pub_date':
                string = span.text.split()
                for dt in string:
                    if dt.find('年') != -1:
                        date += dt.replace('年','-').replace('月','-').replace('日',' ')
                    elif dt.find(':') != -1:
                        date += dt
                break
        # Related stock codes are embedded in span ids like 'stock_sh600000'.
        for span in span_list:
            if 'id' in span.attrs and span['id'].find('stock_') != -1:
                stockCodeLst += span['id'][8:] + ' '
        # Prob is the minimum Chinese-character ratio a paragraph must
        # reach to be considered part of the article's main body.
        for paragraph in part:
            chnstatus = self.countchn(str(paragraph))
            possible = chnstatus[1]
            if possible > self.Prob:
                article += str(paragraph)
        # Strip residual HTML tags; bail out if the scan loops too long.
        time1 = time.time()
        while article.find('<') != -1 and article.find('>') != -1:
            string = article[article.find('<'):article.find('>')+1]
            article = article.replace(string,'')
            time2 = time.time()
            if time2 - time1 > 60:
                print(' [*] 循环超时60s,跳出循环 ... ')
                break
        # Remove full-width spaces, with the same 60 s safety valve.
        time1 = time.time()
        while article.find('\u3000') != -1:
            article = article.replace('\u3000','')
            time2 = time.time()
            if time2 - time1 > 60:
                print(' [*] 循环超时60s,跳出循环 ... ')
                break
        article = ' '.join(re.split(' +|\n+', article)).strip()
        return summary, keyWords, date, stockCodeLst, article

    def GenPagesLst(self):
        '''Split pages 1..totalPages into (start, end) ranges of width Range.'''
        PageLst = []
        k = 1
        while k+self.Range-1 <= self.totalPages:
            PageLst.append((k,k+self.Range-1))
            k += self.Range
        # Bug fix: the old guard `k+self.Range-1 < self.totalPages` was
        # always False after the loop, dropping the trailing partial range.
        if k <= self.totalPages:
            PageLst.append((k,self.totalPages))
        return PageLst

    def CrawlRealtimeCompanyNews(self,firstPage):
        '''Crawl the first listing page once and extract new articles.

        Extracts summary, keywords, release date, related stock codes
        and main body for each unseen article; stores them in MongoDB.

        Returns:
            List of 'title summary article' strings for the new items.
        '''
        doc_lst = []
        if len(self.realtimeNewsURL) == 0:
            # First call: connect and load already-stored article addresses.
            self.ConnDB()
            self._AddressLst = self.extractData(['Address'])[0]
        # Refactor: the original duplicated this loop for the first and
        # subsequent calls; the unified membership test below is
        # equivalent because realtimeNewsURL is empty on the first call.
        resp = requests.get(firstPage)
        resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
        bs = BeautifulSoup(resp.text, "lxml")
        a_list = bs.find_all('a')
        for a in a_list:
            if 'href' in a.attrs and a.string and \
                a['href'].find('http://finance.sina.com.cn/stock/s/') != -1:
                if a['href'] in self.realtimeNewsURL or a['href'] in self._AddressLst:
                    continue  # already seen this session or already stored
                self.realtimeNewsURL.append(a['href'])
                summary, keyWords, date, stockCodeLst, article = self.getUrlInfo(a['href'])
                # Retry with a progressively lower Chinese-ratio threshold
                # when no body text was extracted, then restore the default.
                while article == '' and self.Prob >= .1:
                    self.Prob -= .1
                    summary, keyWords, date, stockCodeLst, article = self.getUrlInfo(a['href'])
                self.Prob = .5
                if article != '':
                    data = {'Date' : date,
                            'Address' : a['href'],
                            'Title' : a.string,
                            'Keywords' : keyWords,
                            'Summary' : summary,
                            'Article' : article,
                            'RelevantStock' : stockCodeLst}
                    self._collection.insert_one(data)
                    doc_lst.append(a.string + ' ' + summary + ' ' + article)
                    print(' [' + date + '] ' + a.string)
        return doc_lst

    def CrawlHistoryCompanyNews(self,startPage,endPage):
        '''Crawl historical company news listing pages [startPage, endPage].

        Articles whose URL is already stored in MongoDB are skipped.
        '''
        self.ConnDB()
        AddressLst = self.extractData(['Address'])[0]
        # Refactor: one loop replaces the two duplicated branches; with an
        # empty AddressLst the membership test below is a no-op.
        urls = []
        url_Part_1 = 'http://roll.finance.sina.com.cn/finance/zq1/ssgs/index_'
        url_Part_2 = '.shtml'
        for pageId in range(startPage,endPage+1):
            urls.append(url_Part_1 + str(pageId) + url_Part_2)
        for url in urls:
            if AddressLst == []:
                print(url)
            else:
                print(' ', url)
            resp = requests.get(url)
            resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
            bs = BeautifulSoup(resp.text, "lxml")
            a_list = bs.find_all('a')
            for a in a_list:
                if 'href' in a.attrs and a.string and \
                    a['href'].find('http://finance.sina.com.cn/stock/s/') != -1:
                    if a['href'] in AddressLst:
                        continue  # already stored in MongoDB
                    summary, keyWords, date, stockCodeLst, article = self.getUrlInfo(a['href'])
                    # Lower the body-text threshold step by step on failure,
                    # then restore the default.
                    while article == '' and self.Prob >= .1:
                        self.Prob -= .1
                        summary, keyWords, date, stockCodeLst, article = self.getUrlInfo(a['href'])
                    self.Prob = .5
                    if article != '':
                        data = {'Date' : date,
                                'Address' : a['href'],
                                'Title' : a.string,
                                'Keywords' : keyWords,
                                'Summary' : summary,
                                'Article' : article,
                                'RelevantStock' : stockCodeLst}
                        self._collection.insert_one(data)

    def ConnDB(self):
        '''Open a MongoDB client and bind the target collection.'''
        Conn = MongoClient(self.IP, self.PORT)
        db = Conn[self.dbName]
        self._collection = db.get_collection(self.colName)

    def extractData(self,tag_list):
        '''Return the distinct values of each named field in the collection.'''
        # Replaces the original exec()-based variable construction, which
        # was unsafe and unnecessary.
        return [self._collection.distinct(tag) for tag in tag_list]

    def single_run(self):
        '''Crawl every page range sequentially in the current thread.'''
        for page_range in self.GenPagesLst():
            self.CrawlHistoryCompanyNews(page_range[0],page_range[1])

    def coroutine_run(self):
        '''Crawl page ranges concurrently using gevent greenlets.'''
        jobs = []
        page_ranges_lst = self.GenPagesLst()
        for page_range in page_ranges_lst:
            jobs.append(gevent.spawn(self.CrawlHistoryCompanyNews,page_range[0],page_range[1]))
        gevent.joinall(jobs)

    def multi_threads_run(self,**kwarg):
        '''Crawl page ranges concurrently with a thread pool of ThreadsNum workers.'''
        page_ranges_lst = self.GenPagesLst()
        print(' Using ' + str(self.ThreadsNum) + ' threads for collecting news ... ')
        # The `with` block waits for all submitted futures to finish.
        with futures.ThreadPoolExecutor(max_workers=self.ThreadsNum) as executor:
            future_to_url = {executor.submit(self.CrawlHistoryCompanyNews,page_range[0],page_range[1]) : \
                             ind for ind, page_range in enumerate(page_ranges_lst)}

    def classifyRealtimeStockNews(self):
        '''Crawl and classify fresh news every 60 seconds, forever.'''
        while True:
            print(' * start crawling news from SINA ... ')
            doc_list = self.CrawlRealtimeCompanyNews('http://roll.finance.sina.com.cn/finance/zq1/ssgs/index_1.shtml')
            print(' * finish crawling ... ')
            if len(doc_list) != 0:
                self.tm.classifyRealtimeStockNews(doc_list)
            time.sleep(60)
# Entry point: crawl up to 5000 listing pages in chunks of 100 via gevent
# coroutines and store the results in MongoDB (db "Sina_Stock",
# collection "sina_news_company") on localhost:27017.
if __name__ == '__main__':
    web_crawl_obj = WebCrawlFromSina(5000,100,ThreadsNum=4,IP="localhost",PORT=27017,\
                                     dbName="Sina_Stock",collectionName="sina_news_company")
    web_crawl_obj.coroutine_run() #web_crawl_obj.single_run() #web_crawl_obj.multi_threads_run()
================================================
FILE: legacy_v1/Crawler/crawler_stcn.py
================================================
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 3 13:41:50 2018
@author: Damon Li
"""
import time, re, requests, datetime
from concurrent import futures
from bs4 import BeautifulSoup
from pymongo import MongoClient
import Text_Analysis.text_mining as tm
import gevent
from gevent import monkey,pool
monkey.patch_all()
class WebCrawlFromstcn(object):
    '''Crawl company news from 'http://company.stcn.com/gsxw/1.shtml',
                               'http://stock.stcn.com/xingu/1.shtml',
                               'http://stock.stcn.com/zhuli/1.shtml',
                               'http://stock.stcn.com/bankuai/1.shtml',
                               'http://stock.stcn.com/dapan/1.shtml' website.

    # Arguments:
        ThreadsNum: Number of threads needed to be start.
        dbName: Name of database.
        colName: Name of collection.
        IP: Local IP address.
        PORT: Port number corresponding to IP address.
    '''
    def __init__(self,**kwarg):
        self.ThreadsNum = kwarg['ThreadsNum']
        self.dbName = kwarg['dbName']
        self.colName = kwarg['collectionName']
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        self.Prob = .5  # min Chinese-character ratio for a <p> to count as body text
        self.realtimeNewsURL = []
        self.tm = tm.TextMining(IP="localhost",PORT=27017)

    def countchn(self,string):
        '''Count Chinese characters and their frequency within a string.

        Returns:
            (chnnum, possible): number of matched characters and their
            ratio to the total string length.
        '''
        pattern = re.compile(u'[\u1100-\uFFFDh]+?')
        result = pattern.findall(string)
        chnnum = len(result)
        possible = chnnum/len(str(string))
        return (chnnum, possible)

    def getUrlInfo(self,url):
        '''Fetch one article page and extract its date and main body.

        Returns:
            (date, article); either may be '' when not found.
        '''
        respond = requests.get(url)
        # Re-decode with the encoding BeautifulSoup detected from raw bytes.
        respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
        bs = BeautifulSoup(respond.text, "lxml")
        div_list = bs.find_all('div')
        part = bs.find_all('p')
        article = ''
        date = ''
        # Publication date lives in the <div class="info"> block.
        for div in div_list:
            if 'class' in div.attrs and div['class'] == ['info']:
                date = div.text.split(' ')[0] + ' ' + div.text.split(' ')[1]
                break
        # Paragraphs whose Chinese-character ratio exceeds self.Prob are
        # treated as part of the article's main body.
        for paragraph in part:
            chnstatus = self.countchn(str(paragraph))
            possible = chnstatus[1]
            if possible > self.Prob:
                article += str(paragraph)
        # Strip residual HTML tags and full-width spaces.
        while article.find('<') != -1 and article.find('>') != -1:
            string = article[article.find('<'):article.find('>')+1]
            article = article.replace(string,'')
        while article.find('\u3000') != -1:
            article = article.replace('\u3000','')
        article = ' '.join(re.split(' +|\n+', article)).strip()
        return date, article

    def GenPagesLst(self,totalPages,Range,initPageID):
        '''Split pages initPageID..totalPages into (start, end) ranges of width Range.'''
        PageLst = []
        k = initPageID
        while k+Range-1 <= totalPages:
            PageLst.append((k,k+Range-1))
            k += Range
        # Bug fix: the old guard `k+Range-1 < totalPages` was always False
        # after the loop, dropping the trailing partial range.
        if k <= totalPages:
            PageLst.append((k,totalPages))
        return PageLst

    def _isNewsAnchor(self,a):
        '''Return truthy when anchor `a` looks like an stcn article link.

        Requires href/target/title attributes, a sibling <span> (the
        listing's date cell), and a company.stcn.com or stock.stcn.com URL.
        '''
        return ('href' in a.attrs and 'target' in a.attrs and 'title' in a.attrs
                and a.parent.find('span')
                and (a['href'].find('http://company.stcn.com/') != -1
                     or a['href'].find('http://stock.stcn.com/') != -1))

    def CrawlRealtimeCompanyNews(self,url_part_lst):
        '''Crawl the first page of each listing section and extract new articles.

        Returns:
            List of 'title article' strings for the newly stored items.
        '''
        doc_lst = []
        self.ConnDB()
        self._AddressLst = self.extractData(['Address'])[0]
        # Refactor: the original duplicated this loop for the first and
        # subsequent calls; the unified membership test below is
        # equivalent because realtimeNewsURL is empty on the first call.
        for url_Part in url_part_lst:
            url = url_Part + str(1) + '.shtml'
            resp = requests.get(url)
            resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
            bs = BeautifulSoup(resp.text, "lxml")
            a_list = bs.find_all('a')
            for a in a_list:
                if self._isNewsAnchor(a):
                    if a['href'] in self.realtimeNewsURL or a['href'] in self._AddressLst:
                        continue  # already seen this session or already stored
                    self.realtimeNewsURL.append(a['href'])
                    date, article = self.getUrlInfo(a['href'])
                    # Retry with a progressively lower threshold when no
                    # body text was extracted, then restore the default.
                    while article == '' and self.Prob >= .1:
                        self.Prob -= .1
                        date, article = self.getUrlInfo(a['href'])
                    self.Prob = .5
                    if article != '':
                        data = {'Date' : date,
                                'Address' : a['href'],
                                'Title' : a['title'],
                                'Article' : article}
                        self._collection.insert_one(data)
                        doc_lst.append(a['title'] + ' ' + article)
                        print(' [' + date + '] ' + a['title'])
        return doc_lst

    def CrawlCompanyNews(self,startPage,endPage,url_Part_1):
        '''Crawl historical listing pages [startPage, endPage] under url_Part_1.

        Articles already stored in MongoDB are skipped.
        '''
        self.ConnDB()
        AddressLst = self.extractData(['Address'])[0]
        # Refactor: one loop replaces the two duplicated branches; with an
        # empty AddressLst the membership test below is a no-op.
        # Consistency fix: the historical path previously accepted only
        # company.stcn.com links, silently dropping stock.stcn.com
        # articles that the realtime path stores; both now use the same
        # anchor filter.
        urls = []
        for pageId in range(startPage,endPage+1):
            urls.append(url_Part_1 + str(pageId) + '.shtml')
        for url in urls:
            if AddressLst == []:
                print(url)
            else:
                print(' ', url)
            resp = requests.get(url)
            resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
            bs = BeautifulSoup(resp.text, "lxml")
            a_list = bs.find_all('a')
            for a in a_list:
                if self._isNewsAnchor(a):
                    if a['href'] in AddressLst:
                        continue  # already stored in MongoDB
                    date, article = self.getUrlInfo(a['href'])
                    while article == '' and self.Prob >= .1:
                        self.Prob -= .1
                        date, article = self.getUrlInfo(a['href'])
                    self.Prob = .5
                    if article != '':
                        data = {'Date' : date,
                                'Address' : a['href'],
                                'Title' : a['title'],
                                'Article' : article}
                        self._collection.insert_one(data)

    def ConnDB(self):
        '''Open a MongoDB client and bind the target collection.'''
        Conn = MongoClient(self.IP, self.PORT)
        db = Conn[self.dbName]
        self._collection = db.get_collection(self.colName)

    def extractData(self,tag_list):
        '''Return the distinct values of each named field in the collection.'''
        # Replaces the original exec()-based variable construction, which
        # was unsafe and unnecessary.
        return [self._collection.distinct(tag) for tag in tag_list]

    def coroutine_run(self,totalPages,Range,initPageID,**kwarg):
        '''Crawl page ranges concurrently using gevent greenlets.

        kwarg must contain 'url_Part_1', the listing URL prefix.
        '''
        jobs = []
        page_ranges_lst = self.GenPagesLst(totalPages,Range,initPageID)
        for page_range in page_ranges_lst:
            jobs.append(gevent.spawn(self.CrawlCompanyNews,page_range[0],page_range[1],kwarg['url_Part_1']))
        gevent.joinall(jobs)

    def multi_threads_run(self,**kwarg):
        '''Crawl page ranges with a thread pool of ThreadsNum workers.

        kwarg must contain 'totalPages', 'Range', 'initPageID' and
        'url_Part_1' (mirroring coroutine_run).
        '''
        # Bug fix: this method previously called self.GenPagesLst() with no
        # arguments and submitted CrawlCompanyNews without url_Part_1, so
        # it always raised TypeError. It now takes the same inputs as
        # coroutine_run, passed through kwarg to keep the signature.
        page_ranges_lst = self.GenPagesLst(kwarg['totalPages'],kwarg['Range'],kwarg['initPageID'])
        print(' Using ' + str(self.ThreadsNum) + ' threads for collecting news ... ')
        # The `with` block waits for all submitted futures to finish.
        with futures.ThreadPoolExecutor(max_workers=self.ThreadsNum) as executor:
            future_to_url = {executor.submit(self.CrawlCompanyNews,page_range[0],page_range[1],kwarg['url_Part_1']) : \
                             ind for ind, page_range in enumerate(page_ranges_lst)}

    def classifyRealtimeStockNews(self):
        '''Crawl and classify fresh news every 60 seconds, forever.'''
        while True:
            print(' * start crawling news from STCN ... ')
            doc_list = self.CrawlRealtimeCompanyNews(['http://company.stcn.com/gsxw/',\
                                                      'http://stock.stcn.com/xingu/',\
                                                      'http://stock.stcn.com/zhuli/',\
                                                      'http://stock.stcn.com/bankuai/',\
                                                      'http://stock.stcn.com/dapan/'])
            print(' * finish crawling ... ')
            if len(doc_list) != 0:
                self.tm.classifyRealtimeStockNews(doc_list)
            time.sleep(60)
================================================
FILE: legacy_v1/Crawler/crawler_tushare.py
================================================
import pymongo
import tushare as ts
import datetime
import time
import math
import traceback
class CrawlStockData(object):
    '''Fetch stock basics, tick and daily data via tushare and store them in MongoDB.

    # Arguments (kwarg):
        IP: MongoDB host address.
        PORT: MongoDB port number.
        stockDailyPath: optional directory holding per-stock daily .txt
            files; defaults to the original hard-coded Windows path.
    '''
    def __init__(self,**kwarg):
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        self.ConnDB()
        # Generalized: the daily-data directory is now overridable via
        # kwarg while keeping the original default for existing callers.
        self.stockDailyPath = kwarg.get('stockDailyPath', 'D:\\stock_daliy')

    def ConnDB(self):
        '''Open the MongoDB client used by all other methods.'''
        self._Conn = pymongo.MongoClient(self.IP, self.PORT)

    def extractData(self,dbName,colName,tag_list):
        '''Return the distinct values of each named field in dbName.colName.'''
        db = self._Conn[dbName]
        collection = db.get_collection(colName)
        # Replaces the original exec()-based variable construction, which
        # was unsafe and unnecessary.
        return [collection.distinct(tag) for tag in tag_list]

    def getStockBasicFromTushare(self,dbName,colName):
        '''Store one document per stock with its basic info from tushare.'''
        db = self._Conn[dbName]
        collection = db.get_collection(colName)
        stock_basic_info = ts.get_stock_basics()
        # The 21 repeated data.update(...) calls collapsed into a field
        # list; document keys and their order are unchanged.
        fields = ['name', 'industry', 'area', 'pe', 'outstanding', 'totals',
                  'totalAssets', 'liquidAssets', 'fixedAssets', 'reserved',
                  'reservedPerShare', 'esp', 'bvps', 'pb', 'undp', 'perundp',
                  'rev', 'profit', 'gpr', 'npr', 'holders']
        for i in range(len(stock_basic_info)):
            data = {stock_basic_info.index.name : stock_basic_info.index[i]}
            for field in fields:
                data[field] = stock_basic_info[field][i]
            collection.insert_one(data)

    def renewStockBasic(self):
        # Not implemented yet.
        pass

    def getStockTickHistory(self,dbName,stockCode):
        '''Store tick-level history for stockCode, one document per tick.

        The start date is the earliest news date found in the
        NBD news collection; days with no data are skipped.
        Exceptions are printed, not re-raised.
        '''
        try:
            db = self._Conn[dbName]
            collection = db.get_collection(stockCode)
            date = self.extractData("NBD","nbd_news_company",['date'])[0]
            begin_date = min(date).split(' ')[0]
            date_list = self.getCalendar(begin_date)
            for dt in date_list:
                tickDataOfEachDate = ts.get_tick_data(stockCode,date=dt)
                # NaN price in the first row means no trades that day.
                if not math.isnan(tickDataOfEachDate['price'][0]):
                    data = {}
                    # Iterate in reverse so documents are inserted in
                    # chronological order.
                    for i in range(len(tickDataOfEachDate)-1,-1,-1):
                        data.update({'date' : dt})
                        data.update({'time' : tickDataOfEachDate['time'][i]})
                        data.update({'price' : tickDataOfEachDate['price'][i]})
                        data.update({'change' : tickDataOfEachDate['change'][i]})
                        data.update({'volume' : int(tickDataOfEachDate['volume'][i])})
                        data.update({'amount' : int(tickDataOfEachDate['amount'][i])})
                        data.update({'type' : tickDataOfEachDate['type'][i]})
                        collection.insert_one(data)
                        data = {}
                    print(dt + ' crawl finished ... ')
        except Exception:
            traceback.print_exc()

    def getStockDayHistory(self,dbName,stockCode):
        '''Load daily OHLCV rows for stockCode from a local text file into MongoDB.

        Expects whitespace-separated rows:
        date open high low close volume turnover.
        '''
        db = self._Conn[dbName]
        collection = db.get_collection(stockCode)
        Path = self.stockDailyPath + '\\' + stockCode + '.txt'
        data = []
        for row in open(Path,'r'):
            line = row.split()
            data.append(line)
        Dict = {}
        for i in range(len(data)):
            if len(data[i]) > 1:
                Dict.update({'date' : data[i][0]})
                Dict.update({'open' : data[i][1]})
                Dict.update({'high' : data[i][2]})
                Dict.update({'low' : data[i][3]})
                Dict.update({'close' : data[i][4]})
                Dict.update({'volume' : data[i][5]})
                Dict.update({'turnover' : data[i][6]})
                collection.insert_one(Dict)
                Dict = {}

    def getCalendar(self,begin_date):
        '''Return every calendar date from begin_date (YYYY-MM-DD) through today.'''
        date_list = []
        begin_date = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
        end_date = datetime.datetime.strptime(time.strftime('%Y-%m-%d',time.localtime(time.time())), "%Y-%m-%d")
        while begin_date <= end_date:
            date_str = begin_date.strftime("%Y-%m-%d")
            date_list.append(date_str)
            begin_date += datetime.timedelta(days=1)
        return date_list

    def isUnique(self, List):
        '''Return True when List contains no duplicate (hashable) items.'''
        # O(n) via a set instead of the original O(n^2) count() scan;
        # identical result for the hashable items (strings) this code uses.
        return len(set(List)) == len(List)

    def getStockTickRealtime(self):
        # Not implemented yet.
        pass
================================================
FILE: legacy_v1/README_OLD.md
================================================
# 上市公司新闻文本分析与分类预测

[![Star History Chart](https://api.star-history.com/svg?repos=DemonDamon/Listed-company-news-crawl-and-text-analysis&type=Date)](https://star-history.com/#DemonDamon/Listed-company-news-crawl-and-text-analysis&Date)
-------------------------------
## 简介
上市公司新闻文本分析与分类预测的基本步骤如下:
- 从新浪财经、每经网、金融界、中国证券网、证券时报网上,爬取上市公司(个股)的历史新闻文本数据(包括时间、网址、标题、正文)
- 从Tushare上获取沪深股票日线数据(开、高、低、收、成交量和持仓量)和基本信息(包括股票代码、股票名称、所属行业、所属地区、PE值、总资产、流动资产、固定资产、留存资产等)
- 对抓取的新闻文本按照,去停用词、加载新词、分词的顺序进行处理
- 利用前两步中所获取的股票名称和分词后的结果,抽取出每条新闻里所包含的(0支、1支或多支)股票名称,并将所对应的所有股票代码,组合成与该条新闻相关的股票代码列表,并在历史数据表中增加一列相关股票代码数据
- 从历史新闻数据库中抽取与某支股票相关的所有新闻文本,利用该支股票的日线数据(比如某一天发布的消息,在设定N天后如果价格上涨则认为是利好消息,反之则是利空消息)给每条新闻贴上“利好”和“利空”的标签,并存储到新的数据库中(或导出到CSV文件)
- 实时抓取新闻数据,判断与该新闻相关的股票有哪些,利用上一步的结果,对与某支股票相关的所有历史新闻文本(已贴标签)进行文本分析(构建新的特征集),然后利用SVM(或随机森林)分类器对文本分析结果进行训练(如果已保存训练模型,可选择重新训练或直接加载模型),最后利用训练模型对实时抓取的新闻数据进行分类预测
开发环境`Python-v3(3.6)`:
- gensim==3.2.0
- jieba==0.39
- scikit-learn==0.19.1
- pandas==0.20.0
- numpy==1.13.3+mkl
- scipy==0.19.0
- pymongo==3.6.0
- beautifulsoup4==4.6.0
- tushare==1.1.1
- requests==2.18.4
- gevent==1.2.1
## 文本处理 -> [text_processing.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Text_Analysis/text_processing.py)
- 文本处理包括去停用词处理、加载新词、中文分词、去掉出现次数少的分词
- 生成字典和Bow向量,并基于Gensim转化模型(LSI、LDA、TF-IDF)转化Bow向量
- 计算文本相似度
- 打印词云
## 文本挖掘 -> [text_mining.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Text_Analysis/text_mining.py)
- 从新闻文本中抽取特定信息,并贴上新的文本标签方便往后训练模型
- 从数据库中抽取与某支股票相关的所有新闻文本
- 将贴好标签的历史新闻进行分类训练,利用训练好的模型对实时抓取的新闻文本进行分类预测
## 新闻爬取 -> [crawler_cnstock.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_cnstock.py), [crawler_jrj.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_jrj.py), [crawler_nbd.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_nbd.py), [crawler_sina.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_sina.py), [crawler_stcn.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_stcn.py)
- 分析网站结构,多线程(或协程)爬取上市公司历史新闻数据
## Tushare数据提取 -> [crawler_tushare.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/run_crawler_tushare.py)
- 获取沪深所有股票的基本信息,包括股票代码、股票名称、所属行业、所属地区等
## 用法
- 配好运行环境以及安装MongoDB,最好再安装一个MongoDB的可视化管理工具Studio 3T
- 先运行[crawler_cnstock.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_cnstock.py), [crawler_jrj.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_jrj.py), [crawler_nbd.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_nbd.py), [crawler_sina.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_sina.py), [crawler_stcn.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_stcn.py)这5个py文件,而且可能因为对方服务器没有响应而重复多次运行这几个文件才能抓取大量的历史数据
- 接着运行[crawler_tushare.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/run_crawler_tushare.py)从Tushare获取基本信息和股票价格
- 最后运行[run_main.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/run_main.py)文件,其中有4个步骤,除了第1步初始化外,其他几步最好单独运行
- 注意:所有程序都必须在文件所在目录下运行
## 更新目标
由于之前的项目代码是在初学Python的时候写的,很多写法都是入门级别,因此为了提高整体项目的质量,除了优化代码细节和已有的功能模块之外,还加入了多个功能模块,来支撑未来更加智能化和个性化的金融分析与交易。
- 完成初步构想,重构该项目,将项目分成8大模块,分别是`数据获取模块`,`数据清洗与预处理模块`,`大数据可视化模块`,`基于机器学习的文本挖掘模块`,`金融知识图谱构建模块`,`任务导向多轮对话模块`,`金融交易模块`,`通用服务模块`
(备注:项目在完善之后会重新更名为`Finnews Hunter`,命名的来源是出于对`《全职猎人》`的喜爱,与项目本质的结合,其中`Finnews`是`Financial News`的简写。上面提到的8个模块,分别由`《全职猎人》`中的本人最喜爱的8位角色命名,分别是
- `数据获取模块` -> [Gon](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Gon) -> `网页爬虫、各种数据源API调用等`
- `数据清洗与预处理模块` -> [Killua](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Killua) -> `数据清洗、数据转换(数据采样、类型转换、归一化等)、数据描述(数据可视化)、特征选择与组合(熵增益和分支定界等)、特征抽取(主成分分析、线性判别分析等)`
- `大数据可视化模块` -> [Kurapika](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Kurapika) -> `基于多个可视化模块进行封装,包括提供Web可视化界面`
- `自然语言处理模块` -> [Leorio](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Leorio) -> `中文分词、词性标注、实体识别`
- `基于机器学习的文本挖掘模块` -> [Hisoka](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Hisoka) -> ``
- `金融知识图谱构建模块` -> [Chrollo](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Chrollo) -> ``
- `任务导向多轮对话模块` -> [Illumi](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Illumi) -> ``
- `金融交易模块` -> [Feitan](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Feitan) -> ``
- `基础与Web服务模块` -> [Kite](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src/Kite) -> `基础服务集,包括基本参数配置文件(.py)、数据库的构建与连接、日志打印与收集、多线程服务、Web服务框架搭建以及其他函数`)
## 更新日志
- 注意:
- 以下例子均需在代码根目录[src](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/tree/main/src)下执行
- 先安装好MongoDB用作存储数据库,以及Redis用做简单的消息队列
- 运行下面demo时,先要设置[config.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Kite/config.py)里面的参数
- 更新[crawler_tushare.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_tushare.py)代码为[stockinfospyder.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/stockinfospyder.py),直接运行即可获取股票历史价格数据,并在每天15:30分后更新数据(目前只采集天数据)
- example-1 调用[AkShare](https://www.akshare.xyz/zh_CN/latest/)接口获取股票历史价格数据,并开启实时更新
```
from Kite import config
from Gon.stockinfospyder import StockInfoSpyder
stock_info_spyder = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
# 指定时间段,获取历史数据,如:stock_info_spyder.get_historical_news(start_date="20150101", end_date="20201204")
# 如果没有指定时间段,且数据库已存在部分数据,则从最新的数据时间开始获取直到现在,比如数据库里已有sh600000价格数据到
# 2020-12-03号,如不设定具体时间,则自动获取sh600000自2020-12-04至当前的价格数据
stock_info_spyder.get_historical_news()
```
- example-2 开启自动化更新所有股票价格数据(目前只支持在15:30分后更新日数据)
```
from Kite import config
from Gon.stockinfospyder import StockInfoSpyder
stock_info_spyder = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
stock_info_spyder.get_realtime_news()
```
- 更新[crawler_cnstock.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_cnstock.py)代码为[cnstockspyder.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/cnstockspyder.py),直接运行即可获取中国证券网历史新闻数据,并可以实时更新采集
- example-1 爬取历史新闻数据,然后去重以及去NULL
```
import time
import logging
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.cnstockspyder import CnStockSpyder
cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
logging.info("start crawling {} ...".format(url_to_be_crawled))
cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
logging.info("finished ...")
time.sleep(30)
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
```
- example-2 实时更新新闻数据库,并且将新数据推进redis消息队列等待处理
```
import time, logging, threading
from Kite import config
from Kite.database import Database
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.cnstockspyder import CnStockSpyder
obj = Database()
df = obj.get_data(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK, keys=["Date", "Category"])
cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
# 先补充历史数据,比如已爬取数据到2020-12-01,但是启动实时爬取程序在2020-12-23,则先
# 自动补充爬取2020-12-02至2020-12-23的新闻数据
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
# 查询type_chn的最近一条数据的时间
latest_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn, start_date=latest_date_in_db)
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# 开启多线程并行实时爬取
thread_list = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
thread = threading.Thread(target=cnstock_spyder.get_realtime_news, args=(url, type_chn, 60))
thread_list.append(thread)
for thread in thread_list:
thread.start()
for thread in thread_list:
thread.join()
```
- 更新[crawler_jrj.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_jrj.py)代码为[jrjspyder.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/jrjspyder.py),直接运行即可获取金融界历史新闻数据,并可以实时更新采集
- example-1 爬取历史新闻数据,然后去重以及去NULL
```
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.jrjspyder import JrjSpyder
jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, start_date="2015-01-01")
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
```
- example-2 已爬取一定量的历史数据下,开启实时更新新闻数据库,并且将新数据推进redis消息队列等待处理
```
from Kite import config
from Gon.jrjspyder import JrjSpyder
jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ) # 补充爬虫数据到最新日期
jrj_spyder.get_realtime_news()
```
- 更新[crawler_nbd.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_nbd.py)代码为[nbdspyder.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/nbdspyder.py),直接运行即可获取每经网历史新闻数据,并可以实时更新采集
- example-1 爬取历史新闻数据,然后去重以及去NULL
```
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.nbdspyder import NbdSpyder
nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
nbd_spyder.get_historical_news(start_page=684)
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
```
- example-2 已爬取一定量的历史数据下,开启实时更新新闻数据库,并且将新数据推进redis消息队列等待处理
```
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.nbdspyder import NbdSpyder
# 如果没有历史数据从头爬取,如果已爬取历史数据,则从最新的时间开始爬取
# 如历史数据中最近的新闻时间是"2020-12-09 20:37:10",则从该时间开始爬取
nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
nbd_spyder.get_historical_news()
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
nbd_spyder.get_realtime_news()
```
- 更新[crawler_sina.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/Crawler/crawler_sina.py)代码为[sinaspyder.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/sinaspyder.py),直接运行即可获取新浪财经历史新闻数据(未更新)
- 停止`证券时报网`爬虫代码的更新(旧代码已不可用),新增`网易财经`和`凤凰财经`的爬虫代码(未更新)
- 新增[buildstocknewsdb.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Killua/buildstocknewsdb.py)如果已经在每经网、中国证券网和金融界爬取了一定量新闻文本,接下来就是针对每支股票构建对应的新闻数据库,并根据股价贴上3/5/10/15/30/60天标签,具体判断条件查看[buildstocknewsdb.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Killua/buildstocknewsdb.py)第111-116行注释
- example-1 从历史新闻数据库中抽取、构建每支股票的新闻数据库,并贴上标签
```
from Kite import config
from Killua.buildstocknewsdb import GenStockNewsDB
gen_stock_news_db = GenStockNewsDB()
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
```
- example-2 监听redis消息队列,将新的数据分别存入与该新闻相关的所有股票新闻数据库中
```
from Kite import config
from Killua.buildstocknewsdb import GenStockNewsDB
gen_stock_news_db = GenStockNewsDB()
gen_stock_news_db.listen_redis_queue()
```
- 新增[realtime_spyder_startup.bat](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/realtime_spyder_startup.bat)同时启动以下程序
- 开启多个爬虫实例,包括[realtime_starter_cnstock.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/realtime_starter_cnstock.py)、[realtime_starter_jrj.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/realtime_starter_jrj.py)、[realtime_starter_nbd.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/realtime_starter_nbd.py)等
- 全股票数据更新代码[realtime_starter_stock_price.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/realtime_starter_stock_price.py)
- 监听redis消息队列[realtime_starter_redis_queue.py](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/Gon/realtime_starter_redis_queue.py)
- 新增[realtime_spyder_stopall.bat](https://github.com/DemonDamon/Listed-company-news-crawl-and-text-analysis/blob/main/src/realtime_spyder_stopall.bat)批量终止爬虫程序
- 更新前使用jieba分词系统,在实体识别上需要不断维护新词表来提高识别精度;更新后,使用基于BERT预训练的FinBERT对金融领域实体进行识别
# FinnewsHunter (Reborn)
基于 AgenticX 框架构建的企业级多智能体金融决策平台。
## 项目状态
🚧 **重构进行中** 🚧
本项目正在经历重大重构,从单一脚本集合升级为现代化的微服务架构。
- **旧版代码**:已归档至 `legacy_v1/` 目录。
- **重构规划**:详见 [planning.md](../../planning.md)。
## 技术架构
- **后端**: Python, FastAPI, AgenticX (Orchestrator, Debate, Tools)
- **前端**: TypeScript, React
- **算法**: sklearn, PyTorch, vllm
## 快速开始
### 后端开发
1. 进入后端目录:
```bash
cd backend
```
2. 安装依赖:
```bash
pip install -r requirements.txt
```
3. 启动服务:
```bash
uvicorn app.main:app --reload
```
## 目录结构
```
FinnewsHunter/
├── backend/ # FastAPI 后端服务
│ ├── app/ # 应用代码
│ └── tests/ # 测试用例
├── frontend/ # React 前端应用 (待初始化)
├── legacy_v1/ # 旧版代码归档
├── docs/ # 项目文档
└── README.md # 项目说明
```
### 快速开始
1. 进入后端目录:
```bash
cd backend
```
2. 安装依赖:
```bash
pip install -r requirements.txt
```
3. 启动服务:
```bash
uvicorn app.main:app --reload
```
## 目录结构
```
FinnewsHunter/
├── backend/ # FastAPI 后端服务
│ ├── app/ # 应用代码
│ └── tests/ # 测试用例
├── frontend/ # React 前端应用 (待初始化)
├── legacy_v1/ # 旧版代码归档
├── docs/ # 项目文档
└── README.md # 项目说明
```
================================================
FILE: legacy_v1/Text_Analysis/__init__.py
================================================
================================================
FILE: legacy_v1/Text_Analysis/text_mining.py
================================================
# -*- coding: UTF-8 -*-
"""
Created on Sat Jan 20 10:20:33 2018
@author: Damon Li
"""
import os, re, csv, time, warnings, threading
from pymongo import MongoClient
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from bson.objectid import ObjectId
import Text_Analysis.text_processing as tp
from gensim import corpora, utils
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.exceptions
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=Warning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='gensim')
warnings.filterwarnings("ignore", category=RuntimeWarning, module='gensim')
class TextMining(object):
    '''Text analysis and prediction functions class.

    Wraps a MongoDB connection and provides helpers to extract news, match
    articles to listed companies, label them against price moves and train
    per-stock classifiers.

    # Arguments:
        IP: IP address of mongodb database.
        PORT: Port number corresponding to IP.
    '''
    def __init__(self,**kwarg):
        # Required keyword arguments: the MongoDB host and port.
        self.IP = kwarg['IP']
        self.PORT = kwarg['PORT']
        self.ConnDB()
        # Stop-word list and financial user dictionary are expected next to
        # the current working directory (Windows-style '\\' path joining).
        self.tp = tp.TextProcessing(os.getcwd() + '\\' + 'Chinese_Stop_Words.txt', \
            os.getcwd() + '\\' + 'finance_dict.txt')
        # Per-stock dictionaries/bow-vectors/models are cached under this dir.
        if not os.path.exists(os.getcwd() + '\\' + 'stock_dict_file'):
            os.makedirs(os.getcwd() + '\\' + 'stock_dict_file')
        self.DictPath = os.getcwd() + '\\' + 'stock_dict_file'
    def ConnDB(self):
        '''Connect to the mongodb and keep the client on self._Conn.
        '''
        self._Conn = MongoClient(self.IP, self.PORT)
def extractData(self,dbName,colName,tag_list):
'''Extract data from specific collection of specific database.
# Arguments:
dbName: Name of database.
colName: Name of collection.
tag_list: List of tags that need to be extracted.
'''
db = self._Conn[dbName]
collection = db.get_collection(colName)
data = []
Dict = {}
for tag in tag_list:
exec(tag + " = collection.distinct('" + tag + "')")
exec("data.append(" + tag + ")")
exec("Dict.update({'" + tag + "' : np.array(" + tag + ")})")
dataFrame = pd.DataFrame(Dict,columns=tag_list)
return dataFrame
def extractStockCodeFromArticle(self,dbName,colName):
    '''Extract the stocks mentioned by each news(articles/documents) and
    write the matched codes back into each article document.

    Title + body are tokenized; any token of length >= 3 equal to a listed
    company name counts as a mention. Matched codes are stored space-joined
    in the article's `relevantStock` field.

    # Arguments:
        dbName: Name of database.
        colName: Name of collection.
    '''
    db = self._Conn[dbName]
    collection = db.get_collection(colName)
    idLst = self.extractData(dbName,colName,['_id'])._id
    # Listed-company names/codes from the Stock/Basic_Info collection.
    data = self.extractData("Stock","Basic_Info",['name','code'])
    articles = []
    for _id in idLst:
        # NBD_Stock stores the headline under lower-case 'title'; the other
        # news databases use capitalised 'Title'.
        if dbName == 'NBD_Stock':
            title = collection.find_one({'_id':ObjectId(_id)})['title']
        else:
            title = collection.find_one({'_id':ObjectId(_id)})['Title']
        article = collection.find_one({'_id':ObjectId(_id)})['Article']
        articles.append(title + ' ' + article)
    # NOTE(review): passes only saveDict — relies on genDictionary tolerating
    # missing saveBowvec/returnValue keyword arguments; verify.
    token, _, _ = self.tp.genDictionary(articles,saveDict=False)
    j = 0
    for tk in token:
        relevantStockName = []
        relevantStockCode = []
        for k in range(len(tk)):
            # Only tokens of 3+ chars, to avoid spurious short-name matches.
            if len(tk[k]) >= 3 and tk[k] in list(data.name):
                relevantStockName.append(tk[k])
                relevantStockCode.append(list(data[(data.name == tk[k])].code)[0])
        if len(relevantStockCode) != 0:
            relevantStockCodeDuplicateRemoval = list(set(relevantStockCode))
            # NOTE(review): Collection.update() is deprecated in modern
            # PyMongo (use update_one) — confirm the installed version.
            collection.update({"_id":idLst[j]},{"$set":{"relevantStock":\
                ' '.join(relevantStockCodeDuplicateRemoval)}})
        # print(' [*] finished ' + str(j+1) + ' ... ')
        j += 1
def extractStockCodeFromRealtimeNews(self,documents):
'''Extract stocks mentioined by real-time crawled news(articles/documents),
and return the list of corresponding codes.
# Arguments:
documents: Real-time crawled news(articles/documents).
'''
stock_basic_info = self.extractData("Stock","Basic_Info",['name','code'])
token_list = self.tp.jieba_tokenize(documents)
relevant_stock_list = []
for tokens in token_list:
relevantStockCode = []
for tk in tokens:
if len(tk) >= 3 and tk in list(stock_basic_info.name):
relevantStockCode.append(list(stock_basic_info[(stock_basic_info.name == tk)].code)[0])
relevant_stock_list.append(list(set(relevantStockCode)))
return relevant_stock_list
def judgeGoodOrBadNews(self,stockCode,date,judgeTerm):
'''Label the historical news(articles/documents) with 'Bad', 'Good' or 'Neutral'.
# Arguments:
stockCode: Code of specific stock.
date: Date at which released the specific news.
judgeTerm: Interval after which compare the close price with that at the released date.
'''
db = self._Conn['Stock']
collection = db.get_collection(stockCode)
dateLst = self.extractData("Stock",stockCode,['date']).date
days = 0
CloseLst = []
for dt in dateLst:
if dt >= date:
CloseLst.append(float(collection.find_one({'date':dt})['close']))
if days >= judgeTerm:
break
days += 1
if CloseLst[-1] > CloseLst[0]:
character = '利好'
elif CloseLst[-1] < CloseLst[0]:
character = '利空'
else:
character = '中立'
return character
def getNewsOfSpecificStock(self,dbColLst,stockCode,**kwarg):
    '''Get news related to specific stock from historical database.

    Scans each (database, collection) pair, keeps documents whose
    relevantStock field contains stockCode, and exports them either to a
    CSV file or to a new MongoDB collection (with a good/bad/neutral label
    from judgeGoodOrBadNews).

    # Arguments:
        dbColLst: List of databases and collections, eg: [(db_1,col_1),(db_2,col_2),...,(db_N,col_N)].
        stockCode: Code of specific stock.
        export: List parameters deciding the ways of exporting('csv' or 'database')
            and file path of saving, eg: export=['csv','.\\file'].
            For 'database': export=['database', newDbName, newColName], and
            kwarg['judgeTerm'] is also required for labelling.
    '''
    if kwarg['export'][0] == 'csv':
        # Append mode: repeated runs will re-write the header row as well.
        with open(kwarg['export'][1] + '\\' + stockCode + '.csv', 'a+', newline='',encoding='utf-8') as file:
            fieldnames = ['date','address','title','article']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            for dbName,colName in dbColLst:
                db = self._Conn[dbName]
                collection = db.get_collection(colName)
                idLst = self.extractData(dbName,colName,['_id'])._id
                # Sina_Stock uses capitalised keys ('RelevantStock', 'Title'...),
                # NBD uses lower-case keys; other sources are skipped here.
                if dbName == 'Sina_Stock':
                    for _id in idLst:
                        # Presence check via joined key names, then substring
                        # match of the stock code in the RelevantStock field.
                        keys = ' '.join([k for k in collection.find_one({'_id':ObjectId(_id)}).keys()])
                        if keys.find('RelevantStock') != -1:
                            if collection.find_one({'_id':ObjectId(_id)})['RelevantStock'].find(stockCode) != -1:
                                print(' ' + collection.find_one({'_id':ObjectId(_id)})['Title'])
                                writer.writerow({'date':collection.find_one({'_id':ObjectId(_id)})['Date'], \
                                    'address':collection.find_one({'_id':ObjectId(_id)})['Address'], \
                                    'title':collection.find_one({'_id':ObjectId(_id)})['Title'], \
                                    'article':collection.find_one({'_id':ObjectId(_id)})['Article']})
                elif dbName == 'NBD':
                    for _id in idLst:
                        keys = ' '.join([k for k in collection.find_one({'_id':ObjectId(_id)}).keys()])
                        if keys.find('relevantStock') != -1:
                            if collection.find_one({'_id':ObjectId(_id)})['relevantStock'].find(stockCode) != -1:
                                print(' ' + collection.find_one({'_id':ObjectId(_id)})['title'])
                                writer.writerow({'date':collection.find_one({'_id':ObjectId(_id)})['date'], \
                                    'address':collection.find_one({'_id':ObjectId(_id)})['address'], \
                                    'title':collection.find_one({'_id':ObjectId(_id)})['title'], \
                                    'article':collection.find_one({'_id':ObjectId(_id)})['Article']})
                print(' [*] extracting ' + stockCode + ' news from ' + dbName + ' database to CSV file successfully ... ')
    elif kwarg['export'][0] == 'database': #export into a new database/collection
        for dbName,colName in dbColLst:
            db = self._Conn[dbName]
            collection = db.get_collection(colName)
            idLst = self.extractData(dbName,colName,['_id'])._id
            if dbName == 'NBD_Stock':
                newdb = self._Conn[kwarg['export'][1]]
                newcollection = newdb.get_collection(kwarg['export'][2])
                for _id in idLst:
                    keys = ' '.join([k for k in collection.find_one({'_id':ObjectId(_id)}).keys()])
                    if keys.find('relevantStock') != -1:
                        if collection.find_one({'_id':ObjectId(_id)})['relevantStock'].find(stockCode) != -1:
                            # Label against price move over kwarg['judgeTerm'] days;
                            # date is normalised to 'YYYYMMDD' before the lookup.
                            character = self.judgeGoodOrBadNews(stockCode,\
                                collection.find_one({'_id':ObjectId(_id)})['date'].split(' ')[0].replace('-',''),kwarg['judgeTerm'])
                            # print(' ' + collection.find_one({'_id':ObjectId(_id)})['title'] + '(' + character + ')')
                            data = {'Date' : collection.find_one({'_id':ObjectId(_id)})['date'],
                                'Address' : collection.find_one({'_id':ObjectId(_id)})['address'],
                                'Title' : collection.find_one({'_id':ObjectId(_id)})['title'],
                                'Article' : collection.find_one({'_id':ObjectId(_id)})['Article'],
                                'Character' : character}
                            newcollection.insert_one(data)
            elif dbName == 'Sina_Stock':
                newdb = self._Conn[kwarg['export'][1]]
                newcollection = newdb.get_collection(kwarg['export'][2])
                for _id in idLst:
                    keys = ' '.join([k for k in collection.find_one({'_id':ObjectId(_id)}).keys()])
                    if keys.find('RelevantStock') != -1:
                        if collection.find_one({'_id':ObjectId(_id)})['RelevantStock'].find(stockCode) != -1:
                            character = self.judgeGoodOrBadNews(stockCode,\
                                collection.find_one({'_id':ObjectId(_id)})['Date'].split(' ')[0].replace('-',''),kwarg['judgeTerm'])
                            # print(' ' + collection.find_one({'_id':ObjectId(_id)})['Title'] + '(' + character + ')')
                            data = {'Date' : collection.find_one({'_id':ObjectId(_id)})['Date'],
                                'Address' : collection.find_one({'_id':ObjectId(_id)})['Address'],
                                'Title' : collection.find_one({'_id':ObjectId(_id)})['Title'],
                                'Article' : collection.find_one({'_id':ObjectId(_id)})['Article'],
                                'Character' : character}
                            newcollection.insert_one(data)
            else:
                newdb = self._Conn[kwarg['export'][1]]
                newcollection = newdb.get_collection(kwarg['export'][2])
                for _id in idLst:
                    keys = ' '.join([k for k in collection.find_one({'_id':ObjectId(_id)}).keys()])
                    # NOTE(review): this fallback branch checks the lower-case
                    # 'relevantStock' key but then reads capitalised
                    # 'Date'/'Address'/'Title' — confirm the key casing of the
                    # remaining source collections actually matches this mix.
                    if keys.find('relevantStock') != -1:
                        if collection.find_one({'_id':ObjectId(_id)})['relevantStock'].find(stockCode) != -1:
                            character = self.judgeGoodOrBadNews(stockCode,\
                                collection.find_one({'_id':ObjectId(_id)})['Date'].split(' ')[0].replace('-',''),kwarg['judgeTerm'])
                            # print(' ' + collection.find_one({'_id':ObjectId(_id)})['Title'] + '(' + character + ')')
                            data = {'Date' : collection.find_one({'_id':ObjectId(_id)})['Date'],
                                'Address' : collection.find_one({'_id':ObjectId(_id)})['Address'],
                                'Title' : collection.find_one({'_id':ObjectId(_id)})['Title'],
                                'Article' : collection.find_one({'_id':ObjectId(_id)})['Article'],
                                'Character' : character}
                            newcollection.insert_one(data)
            print(' [' + stockCode + '] ' + dbName + ' has been extracted successfully ... ')
def classifyHistoryStockNews(self,dbName,stockCode,**kwarg):
    '''Build classifier from historical news(articles/documents) of specific stock.

    Loads (or rebuilds) the per-stock dictionary and bow-vectors, transforms
    them with a gensim model, then trains/loads an SVM or RandomForest
    classifier and stores the test precision on self._precise.

    # Arguments:
        dbName: Name of database.
        stockCode: Code of specific stock.
        renewDict: Renew the dictionary created by historical news(articles/documents) of
            specific stock or not(bool type).
        modelType: Transformation model type, including 'lsi', 'lda' and 'None', 'None' means TF-IDF model.
        tfDim: The number of topics that will be extracted from each news(articles/documents).
        renewModel: Re-train the transformation models or not(bool type).
        Classifier: The name of classifier, including 'SVM' and 'RandomForest' so far.
        Params: The parameters of classifier, detail refer to the setting of classifier parameters of scikit-learn module.

    # Returns:
        Test-set precision of the trained/loaded classifier.
    '''
    # Branch 1: caller forces a rebuild of dictionary + bow-vectors.
    if kwarg['renewDict']:
        if not os.path.exists(self.DictPath+'\\'+stockCode):
            os.makedirs(self.DictPath+'\\'+stockCode)
        db = self._Conn[dbName]
        collection = db.get_collection(stockCode)
        idLst = self.extractData(dbName,stockCode,['_id'])._id
        articles = []
        characters = []
        for _id in idLst:
            articles.append(collection.find_one({'_id':ObjectId(_id)})['Article'])
            # Map the Chinese label to a numeric class: 利好(good)=1,
            # 利空(bad)=-1, anything else (中立/neutral)=0.
            if collection.find_one({'_id':ObjectId(_id)})['Character'] == "利好":
                characters.append(1)
            elif collection.find_one({'_id':ObjectId(_id)})['Character'] == "利空":
                characters.append(-1)
            else:
                characters.append(0)
        self.tp.genDictionary(articles,saveDict=True,saveDictPath=self.DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict',\
            saveBowvec=True,saveBowvecPath=self.DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm',returnValue=False)
        print(' [*] renew the dictionary and bow-vector successfully ... ')
    # Branch 2: no rebuild requested but the cached files are missing —
    # generate them once (same work as branch 1).
    elif not os.path.exists(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict') \
        or not os.path.exists(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm'):
        if not os.path.exists(self.DictPath+'\\'+stockCode):
            os.makedirs(self.DictPath+'\\'+stockCode)
        db = self._Conn[dbName]
        collection = db.get_collection(stockCode)
        idLst = self.extractData(dbName,stockCode,['_id'])._id
        articles = []
        characters = []
        for _id in idLst:
            articles.append(collection.find_one({'_id':ObjectId(_id)})['Article'])
            if collection.find_one({'_id':ObjectId(_id)})['Character'] == "利好":
                characters.append(1)
            elif collection.find_one({'_id':ObjectId(_id)})['Character'] == "利空":
                characters.append(-1)
            else:
                characters.append(0)
        self.tp.genDictionary(articles,saveDict=True,saveDictPath=self.DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict',\
            saveBowvec=True,saveBowvecPath=self.DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm',returnValue=False)
        print(' [*] generate and save the dictionary and bow-vector successfully ... ')
    # Branch 3: cached files exist — only the labels are re-read from Mongo.
    else:
        db = self._Conn[dbName]
        collection = db.get_collection(stockCode)
        idLst = self.extractData(dbName,stockCode,['_id'])._id
        characters = []
        for _id in idLst:
            if collection.find_one({'_id':ObjectId(_id)})['Character'] == "利好":
                characters.append(1)
            elif collection.find_one({'_id':ObjectId(_id)})['Character'] == "利空":
                characters.append(-1)
            else:
                characters.append(0)
    dictionary = corpora.Dictionary.load(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict')
    bowvec = corpora.MmCorpus(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm')
    print(' [*] load dictionary and bow-vector successfully ... ')
    # Transform bow-vectors (tfidf -> lsi/lda per modelType), then convert
    # the sparse gensim vectors into a dense matrix for sklearn.
    _, modelVec = self.tp.CallTransformationModel(dictionary,bowvec,modelType=kwarg['modelType'],\
        tfDim=kwarg['tfDim'],renewModel=kwarg['renewModel'],modelPath=self.DictPath+'\\'+stockCode+'\\')
    CSRMatrix = self.ConvertToCSRMatrix(modelVec)
    train_X, train_Y, test_X, test_Y = self.genTrainingSet(CSRMatrix,characters)
    # Both classifier helpers set self._precise as a side effect.
    if kwarg['Classifier'] == 'SVM':
        self.SVMClassifier(train_X,train_Y,test_X,test_Y,kwarg['Params'],['precision'],stockCode)
    if kwarg['Classifier'] == 'RandomForest':
        self.RdForestClassifier(train_X,train_Y,test_X,test_Y,kwarg['Params'],['precision'],stockCode)
    return self._precise
def classifyRealtimeStockNews(self,doc_list):
    '''Classify real-time news(articles/documents) of specific stock.

    For every stock mentioned in the freshly crawled documents: (re)build or
    load that stock's SVM model from historical news, project the new
    documents into the stock's LDA space and print a good/bad/neutral verdict.

    # Arguments:
        doc_list: List of real-time news(articles/documents) crawled from specific websites.
    '''
    print(' * extract relevant stock codes from latest crawled news ... ')
    relevant_stock_list = self.extractStockCodeFromRealtimeNews(doc_list)
    if len(relevant_stock_list) != 0:
        tfDim = 200
        for i, code_list in enumerate(relevant_stock_list):
            for code in code_list:
                print(' * load SVM parameters (gamma & C) ... ')
                # Grid-search candidate values for the RBF-kernel SVM.
                Params_svm = {'kernel': ['rbf'], 'gamma': [10, 20, 50, 100, 150, 200], \
                    'C': [10, 15, 20, 30, 50, 100]}
                print(' * use historical news to build SVM model of ' + code + ' ... ')
                self.classifyHistoryStockNews("Stock_News",code,modelType='lda',tfDim=tfDim,renewDict=False,\
                    renewModel=False,Classifier='SVM',Params=Params_svm) #code="600740"
                print(' * load historical dictionary of ' + code + ' ...')
                dictionary = corpora.Dictionary.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_dict.dict')
                print(' * tokenize latest crawled news ... ')
                token = self.tp.jieba_tokenize(doc_list)
                print(' * create bow-vector of latest news of ' + code + ' ... ')
                bowvec_doc = [dictionary.doc2bow(text) for text in token]
                print(' * load bow-vector of historical news of ' + code + ' ... ')
                bowvec_all = list(corpora.MmCorpus(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_bowvec.mm'))
                print(' * extend latest bow-vector to historical bow-vector of ' + code + ' ... ')
                bowvec_all.extend(bowvec_doc)
                print(' * create new lda model of ' + code + ' ... ')
                _, NewmodelVec = self.tp.CallTransformationModel(dictionary,bowvec_all,modelType='lda',\
                    tfDim=200,renewModel=False,modelPath=os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\')
                print(' * convert latest lda vector to CSR matrix of ' + code + ' ... ')
                NewCSRMatrix = self.ConvertToCSRMatrix(NewmodelVec)
                print(' * load SVM model of ' + code + ' ... ')
                clf = joblib.load(os.getcwd() + '\\' + 'stock_dict_file\\' + code + '\\' + code + '_svm.pkl')
                print(' * predicting ... ')
                # NOTE(review): row index i-2 into the combined (historical +
                # new) matrix looks fragile — the intent appears to be "the
                # row of the i-th freshly appended document"; verify offset.
                if clf.predict(NewCSRMatrix[i-2,:])[0] == 1:
                    print('   《' + doc_list[i].split(' ')[0] + "》" + '对' + code + '是利好消息 ...')
                elif clf.predict(NewCSRMatrix[i-2,:])[0] == -1:
                    print('   《' + doc_list[i].split(' ')[0] + "》" + '对' + code + '是利空消息 ...')
                else:
                    print('   《' + doc_list[i].split(' ')[0] + "》" + '对' + code + '是中立消息 ...')
    else:
        print(' * not any relevant stock ... ')
def SVMClassifier(self,train_X,train_Y,test_X,test_Y,tuned_parameters,scores,stockCode):
    '''SVM Classifier.

    Trains (or loads a cached) grid-searched SVM for the given stock, prints
    a classification report and stores the test accuracy on self._precise.

    # Arguments:
        train_X: Features train data.
        train_Y: Labels train data.
        test_X: Features test data.
        test_Y: Labels test data.
        tuned_parameters: The parameters of classifier, refer to the setting of classifier parameters of scikit-learn module.
        scores: Targets of optimization, detail refer to optimal targets setting of scikit-learn module.
        stockCode: Code of specific stock.
    '''
    for score in scores:
        if not os.path.exists(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_svm.pkl'):
            # Build the GridSearch classifier with 5-fold cross-validation.
            clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5, scoring='%s_weighted' % score)
            # k-fold only on the training set, then keep the best parameters.
            clf.fit(train_X, train_Y)
            # Cache the fitted search object so later runs skip training.
            joblib.dump(clf, self.DictPath+'\\'+stockCode+'\\'+stockCode+'_svm.pkl')
            print(clf.best_params_)  # print the best model parameters
        else:
            clf = joblib.load(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_svm.pkl')
        # for params, mean_score, scores in clf.grid_scores_:
        #     print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
        train_pred = clf.predict(train_X)
        # Evaluate generalisation of the best model on the held-out test set.
        test_pred = clf.predict(test_X)
        print(classification_report(test_Y, test_pred))
        # Manual accuracy counts for train and test predictions.
        precise_train = 0
        for k in range(len(train_pred)):
            if train_pred[k] == train_Y[k]:
                precise_train += 1
        precise_test = 0
        for k in range(len(test_pred)):
            if test_pred[k] == test_Y[k]:
                precise_test += 1
        print(' [*] train_pred:', precise_train/len(train_Y), ', test_pred:', precise_test/len(test_pred))
        print(' ' + '-' * 50)
        # Test accuracy of the last score target is what callers read back.
        self._precise = precise_test/len(test_pred)
def RdForestClassifier(self,train_X,train_Y,test_X,test_Y,tuned_parameters,scores,stockCode):
    '''Random Forest Classifier.

    Trains (or loads a cached) grid-searched random forest for the given
    stock, prints a classification report and stores the test accuracy on
    self._precise. Mirrors SVMClassifier with a different estimator.

    # Arguments:
        train_X: Features train data.
        train_Y: Labels train data.
        test_X: Features test data.
        test_Y: Labels test data.
        tuned_parameters: The parameters of classifier, refer to the setting of classifier parameters of scikit-learn module.
        scores: Targets of optimization, detail refer to optimal targets setting of scikit-learn module.
        stockCode: Code of specific stock.
    '''
    for score in scores:
        if not os.path.exists(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_rdf.pkl'):
            # Build the GridSearch classifier with 5-fold cross-validation;
            # random_state fixed for reproducible forests.
            clf = GridSearchCV(RandomForestClassifier(random_state=14), tuned_parameters, cv=5, scoring='%s_weighted' % score)
            # k-fold only on the training set, then keep the best parameters.
            clf.fit(train_X, train_Y)
            # Cache the fitted search object so later runs skip training.
            joblib.dump(clf, self.DictPath+'\\'+stockCode+'\\'+stockCode+'_rdf.pkl')
            print(clf.best_params_)  # print the best model parameters
        else:
            clf = joblib.load(self.DictPath+'\\'+stockCode+'\\'+stockCode+'_rdf.pkl')
        # for params, mean_score, scores in clf.grid_scores_:
        #     print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
        train_pred = clf.predict(train_X)
        # Evaluate generalisation of the best model on the held-out test set.
        test_pred = clf.predict(test_X)
        print(classification_report(test_Y, test_pred))
        # Manual accuracy counts for train and test predictions.
        precise_train = 0
        for k in range(len(train_pred)):
            if train_pred[k] == train_Y[k]:
                precise_train += 1
        precise_test = 0
        for k in range(len(test_pred)):
            if test_pred[k] == test_Y[k]:
                precise_test += 1
        print(' [*] train_pred:', precise_train/len(train_Y), ', test_pred:', precise_test/len(test_pred))
        print(' ' + '-' * 50)
        # Test accuracy of the last score target is what callers read back.
        self._precise = precise_test/len(test_pred)
def ConvertToCSRMatrix(self, modelVec):
    '''Convert a gensim-style sparse vector sequence (LDA/LSI/tfidf model
    output) into a dense numpy matrix.

    Side effect: records the number of documents seen on self._line_count,
    which genTrainingSet later relies on.

    # Arguments:
        modelVec: Iterable of (column, value) pair lists, one per document.

    # Returns:
        Dense 2-D numpy array, one row per document.
    '''
    entries = []  # (row, col, value) triples collected from every document
    row_idx = 0
    for vector in modelVec:
        for col_idx, value in vector:
            entries.append((row_idx, col_idx, value))
        row_idx += 1
    self._line_count = row_idx
    if entries:
        rows, cols, vals = zip(*entries)
    else:
        rows, cols, vals = (), (), ()
    # Assemble via a CSR sparse matrix, then densify for sklearn.
    sparse = csr_matrix((list(vals), (list(rows), list(cols))))
    return sparse.toarray()
def genTrainingSet(self, X, Y):
    '''Randomly split features/labels into train and test sets (~80/20).

    Draws one uniform random number per row (row count taken from
    self._line_count, set by ConvertToCSRMatrix) and assigns rows below 0.8
    to the training split, the rest to the test split.

    # Arguments:
        X: Feature set (2-D array, indexed X[i, :]).
        Y: Label set (sequence indexed Y[i]).

    # Returns:
        (train_X, train_Y, test_X, test_Y) as four lists.
    '''
    draws = np.random.random(size=self._line_count)
    train_X, train_Y, test_X, test_Y = [], [], [], []
    for idx in range(self._line_count):
        if draws[idx] < 0.8:
            train_X.append(X[idx, :])
            train_Y.append(Y[idx])
        else:
            test_X.append(X[idx, :])
            test_Y.append(Y[idx])
    return train_X, train_Y, test_X, test_Y
================================================
FILE: legacy_v1/Text_Analysis/text_processing.py
================================================
# -*- coding: UTF-8 -*-
"""
Created on Fri Feb 23 12:37:46 2018
@author: Damon Li
"""
import numpy as np
import jieba, os
from gensim import corpora,similarities,models,matutils,utils
class TextProcessing(object):
    '''Text pre-processing functions class.

    Tokenization (jieba with a financial user dictionary), stop-word
    filtering, dictionary/bow-vector generation and gensim model wrappers.

    # Arguments
        chnSTWPath: chinese stop words txt file path.
        finance_dict: latest financial related words txt file path.
    '''
    def __init__(self,chnSTWPath,finance_dict):
        # Only the paths are stored; the files are read lazily by the
        # methods that need them.
        self.chnSTWPath = chnSTWPath
        self.finance_dict = finance_dict
def renewFinanceDict(self, new_Word_list):
    '''Append newly required financial terms to the user-dictionary file so
    that subsequent tokenization runs can recognise them.

    # Arguments:
        new_Word_list: New financial words list, eg: ["区块链","离岸金融"].
    '''
    with open(self.finance_dict, 'a', encoding='utf-8') as dict_file:
        dict_file.write(''.join(word + '\n' for word in new_Word_list))
def getchnSTW(self):
    '''Load the stop-words txt file.

    # Returns:
        List of stop words, one per line, stripped of whitespace.
    '''
    # Use a context manager so the file handle is closed deterministically
    # (the original left the handle open for the GC to collect).
    # NOTE(review): relies on the platform default text encoding, as the
    # original did — confirm the stop-word file matches it.
    with open(self.chnSTWPath, 'r') as stw_file:
        return [line.strip() for line in stw_file]
def jieba_tokenize(self, documents):
    '''Cut each document into a list of independent words, dropping stop
    words, tabs and single spaces.

    # Arguments:
        documents: List of news(articles).

    # Returns:
        One token list per input document.
    '''
    stop_words = self.getchnSTW()
    # Teach jieba the financial vocabulary before segmenting.
    jieba.load_userdict(self.finance_dict)
    tokenized = []
    for doc in documents:
        words = [w for w in jieba.cut(doc)
                 if w not in stop_words and w != '\t' and w != ' ']
        tokenized.append(words)
    return tokenized
def RemoveWordAppearOnce(self, corpora_documents):
    '''Remove the words that appear once among all the tokenized news(articles).

    # Arguments:
        corpora_documents: List of tokenized news(articles), i.e. token lists.

    # Returns:
        The same structure with all globally-unique tokens removed.
    '''
    # Count with a plain dict: the original referenced `defaultdict` without
    # importing it from collections, which raised NameError at runtime.
    frequency = {}
    for text in corpora_documents:
        for token in text:
            frequency[token] = frequency.get(token, 0) + 1
    return [[token for token in text if frequency[token] > 1]
            for text in corpora_documents]
def genDictionary(self, documents, **kwarg):
    '''Generate dictionary and bow-vector of all tokenized news(articles).

    # Arguments:
        documents: List of news(articles).
        saveDict: Save dictionary or not (bool, default False).
        saveDictPath: Target path, required when saveDict is True.
        saveBowvec: Save bow-vector or not (bool, default False).
        saveBowvecPath: Target path, required when saveBowvec is True.
        returnValue: Return values or not (bool, default True).

    # Returns (when returnValue is true):
        (token_lists, dictionary, bow_vectors)
    '''
    self._raw_documents = documents
    token = self.jieba_tokenize(documents)  # jieba tokenize
    # corpora_documents = self.RemoveWordAppearOnce(token)  # optionally drop words appearing once
    self._dictionary = corpora.Dictionary(token)  # generate dictionary from tokenized documents
    # Use .get() defaults so callers may omit flags: the original raised
    # KeyError when e.g. only saveDict=False was passed (as
    # extractStockCodeFromArticle does), and returnValue now defaults to
    # True so such callers still receive the 3-tuple they unpack.
    if kwarg.get('saveDict', False):
        self._dictionary.save(kwarg['saveDictPath'])  # store for future reference
    self._BowVecOfEachDoc = [self._dictionary.doc2bow(text) for text in token]  # documents -> bow vectors
    if kwarg.get('saveBowvec', False):
        corpora.MmCorpus.serialize(kwarg['saveBowvecPath'], self._BowVecOfEachDoc)  # store to disk for later use
    if kwarg.get('returnValue', True):
        return token, self._dictionary, self._BowVecOfEachDoc
def CallTransformationModel(self,Dict,Bowvec,**kwarg):
    '''Invoke specific transformation models of the Gensim module.

    # Arguments:
        Dict: Dictionary made by all tokenized news(articles/documents).
        Bowvec: Bow-vector created by all tokenized news(articles/documents).
        modelType: Transformation model type, including 'lsi', 'lda' and 'None', 'None' means TF-IDF model.
        tfDim: The number of topics that will be extracted from each news(articles/documents).
        renewModel: Re-train the transformation models or not(bool type).
        modelPath: The path of saving trained transformation models.
    '''
    if kwarg['renewModel']:
        # Re-train everything from scratch and persist to disk.
        tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
        tfidfVec = tfidf[Bowvec]  # use the model to transform whole corpus
        tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
        if kwarg['modelType'] == 'lsi':
            model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
            modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
            # NOTE(review): saved to modelPath itself here, but the cached
            # branch below loads from modelPath+"lsi_model.lsi" — confirm the
            # intended file name.
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'lda':
            model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
            # Sparse LDA vector per document; element values are the weights
            # of membership in the corresponding topic.
            modelVec = model[tfidfVec]
            model.save(kwarg['modelPath'])  # same for tfidf, lda, ...
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
        # NOTE(review): any other modelType leaves model/modelVec unbound and
        # the return below raises NameError — confirm callers only pass
        # 'lsi', 'lda' or 'None'.
    else:
        # Reuse cached models where possible; train-and-save only on a miss.
        if not os.path.exists(kwarg['modelPath']+"tfidf_model.tfidf"):
            tfidf = models.TfidfModel(Bowvec)  # initialize tfidf model
            tfidfVec = tfidf[Bowvec]
            tfidf.save(kwarg['modelPath']+"tfidf_model.tfidf")
        else:
            tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf")
            tfidfVec = tfidf[Bowvec]  # use the model to transform whole corpus
        if kwarg['modelType'] == 'lsi':
            if not os.path.exists(kwarg['modelPath']+"lsi_model.lsi"):
                tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform whole corpus
                model = models.LsiModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
                model.save(kwarg['modelPath']+"lsi_model.lsi")  # same for tfidf, lda, ...
            else:
                model = models.LsiModel.load(kwarg['modelPath']+"lsi_model.lsi")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'lda':
            if not os.path.exists(kwarg['modelPath']+"lda_model.lda"):
                tfidf = models.TfidfModel.load(kwarg['modelPath']+"tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform whole corpus
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                # Sparse LDA vector per document; element values are the
                # weights of membership in the corresponding topic.
                modelVec = model[tfidfVec]
                model.save(kwarg['modelPath']+"lda_model.lda")  # same for tfidf, lda, ...
            else:
                model = models.LdaModel.load(kwarg['modelPath']+"lda_model.lda")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    return tfidfVec, modelVec
def CalSim(self,test_document,Type,best_num):
    '''Calculate similarities between the test document and all news(articles/documents).

    # Arguments:
        test_document: a raw document (string) to compare against the corpus
            (it is fed to jieba.cut, so a single string is expected).
        Type: similarity model, 'Similarity-tfidf-index' or 'Similarity-LSI-index'.
        best_num: refer to the 'num_best' parameter in the Gensim module.

    Returns:
        (IdLst, SimTxLst, SimRltLst): matched document ids, their raw texts
        and the similarity scores, as produced by the Gensim Similarity index.
    '''
    if Type == 'Similarity-tfidf-index':
        tfidf = models.TfidfModel(self._BowVecOfEachDoc)
        tfidfVec = tfidf[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        # NOTE(review): the Type string doubles as the on-disk shard prefix
        # of the Similarity index — confirm this is intentional.
        self._similarity = similarities.Similarity(Type, tfidfVec, \
            num_features=self._num_features,num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
    elif Type == 'Similarity-LSI-index':
        lsi_model = models.LsiModel(self._BowVecOfEachDoc)
        corpus_lsi = lsi_model[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id.keys())
        self._similarity = similarities.Similarity(Type, corpus_lsi, \
            num_features=self._num_features,num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
    # NOTE(review): an unrecognized Type leaves self._similarity and
    # self._test_BowVecOfEachDoc unset (or stale from a previous call)
    # before they are used below — confirm callers only pass the two
    # supported values.
    self.Print_CalSim()
    IdLst = []
    SimRltLst = []
    SimTxLst = []
    # Collect the ids, similarity scores and raw texts of the best matches.
    for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
        IdLst.append(Id)
        SimRltLst.append(Sim)
        SimTxLst.append(self._raw_documents[Id])
    return IdLst,SimTxLst,SimRltLst
def PrintWorfCloud(self, documents, backgroundImgPath, fontPath):
    """Render the word cloud of all news(articles/documents) onto the
    current matplotlib figure (caller is responsible for plt.show()).

    # Arguments:
        documents: Overall raw documents.
        backgroundImgPath: Background image path used as the cloud mask.
        fontPath: Path of a font file (needed for CJK glyphs), e.g.
            "C:\\Windows\\Fonts\\simhei.ttf".
    """
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    corpora_documents = self.jieba_tokenize(documents)  # tokenize
    # Flatten the token lists into one big space-separated string.
    text = ' '.join(' '.join(doc) for doc in corpora_documents)
    # BUG FIX: scipy.misc.imread was removed in SciPy 1.2; plt.imread loads
    # the mask image without the extra dependency.
    color_mask = plt.imread(backgroundImgPath)
    cloud = WordCloud(font_path=fontPath, mask=color_mask, background_color='white',
                      max_words=2000, max_font_size=40)
    word_cloud = cloud.generate(text)
    plt.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
if __name__ == '__main__':
    # Build the text-processing helper from the stop-word file and the
    # financial dictionary located in the current working directory
    # (Windows-style path separators).
    tp = TextProcessing(os.getcwd() + '\\' + 'Chinese_Stop_Words.txt', \
        os.getcwd() + '\\' + 'finance_dict.txt')
    # Two sample news articles (raw Chinese text) used to exercise
    # dictionary generation below.
    doc = ['中央、地方支持政策频出,煤炭行业站上了风口 券商研报浩如烟海,投资线索眼花缭乱,第一财经推出\
《一财研选》产品,挖掘研报精华,每期梳理5条投资线索,便于您短时间内获取有价值的信息。专业团队\
每周日至每周四晚8点准时“上新”,\
助您投资顺利!1.中央、地方支持政策频出,这个行业站上了风口!(信达证券)近年来,利好住房租赁\
市场发展的政策频频发布,顶层设计趋于完善。信达证券指出,2015年以来,住建部、国务院等机构相继出\
台政策支持住房租赁市场发展,地方积极跟进,试点城市全部出台相关方案支持当地住房租赁市场发展。除\
此之外,“租购同权”保障承租人享受公共服务的权益,稳定租赁关系,利好长租公寓发展。除政策利好长租\
公寓外,需求的逐步释放对长租公寓市场形成支撑。信达证券研究发现,人口向核心一、二线城市流动趋势不\
减,高房价刺激购房需求转向租房需求、首次置业年龄抬升、高校毕业生租房需求增加等因素将刺激长租公寓\
需求进一步释放。总体而言,住房租赁市场容量逾万亿且具备区域性特征。2017年8月,国土资源部、住房和城\
乡建设部联合印发《利用集体建设用地建设租赁住房试点方案》,选择13个试点城市推进利用集体建设用地建\
设租赁住房,各地“只租不售”地块频出,彰显政府发展住房租赁市场决心。类REITs产品盘活租赁资产,解决\
长租融资痛点,上述举措能够有效增加租赁住房供给。伴随政策利好,多主体纷纷进军住房租赁市场。信达证\
券指出,截至目前,房企、房地产中介、专业租赁机构、连锁酒店、金融机构和互联网公司均已涉足住宅租赁市\
场。其中,房企多采用自持物业的重资产运营方式,中介机构及其他公司多以轻资产运营方式为主,从房源获\
取的角度看,集中与分散并行。信达证券指出,当前我国租赁住房的发展还处于初步阶段,多主体参与、多模式\
并存。参与各方均凭借自身比较优势切入住房租赁领域。未来,房企、互联网公司、金融机构存在巨大的合作空间。\
在市场细分的前提下,增值服务的提供将成为住房租赁市场发展的关键。信达证券推荐关注招商蛇口(21.100, \
-1.43, -6.35%)(001979.SZ)、万科A(31.270, -1.48, -4.52%)(000002.SZ)、世联行(8.700, -0.87,\
-9.09%)(002285.SZ)、昆百大A(7.510, -0.05, -0.66%)(000560.SZ)、天健集团(9.330, -0.56, -5.66%)\
(000090.SZ)。2.煤炭库存创八年新低,缺煤升级,高煤价仍将持续(中银国际)截至1月30日,秦皇岛5500大\
卡山西优混动力煤报755元,跳涨2%,再超预期,并创近6年新高,此轮上涨持续了10周时间,累计涨幅达13%。煤炭\
行业是本栏重点追踪的行业板块,近期的大涨验证了此前选摘的多家研究机构的观点,今天我们再来看一下中银国际\
对板块未来表现的分析观点。中银国际指出,六大电厂日耗量周均81万吨,环比增加9%,库存天数由13天下降至10.9天\
,为近8年新低,库存下降至899万吨,为近7年新低。缺煤情况非常突出。经济的强韧性叠加寒冷冰雪天气推升需求超预\
期是主因,供应侧在年关生产积极性不高、运输不畅是辅因,且短期较难明显缓解,2月初地方矿也面临陆续放假,在\
这种情况下煤价有继续攀高的可能。中银国际认为此轮煤价上涨包含着较多非季节性因素:六大电厂日耗从2017年12月\
开始同比增幅都在10%以上,这还是在有工业限产的情况下,这是非常高的数字,在2017年7~8月旺季的同比增幅也只\
有15%左右。经济较好下的需求超预期历来是煤炭股最好的催化剂。尽管2月份由于春节因素可能价格会回落,但在2018\
年缺煤明显的情况下,幅度不会太大,高煤价还会继续维持。3月初两会召开,安全形势再度紧张,煤炭的供应仍然会偏\
紧,在叠加3月15日后限产解除,限产解除前后下游补库存,高煤价可能会贯穿整个一季度。中银国际指出,2017年1月秦\
皇岛煤价均价只有602元,2018年1月的均价为726元,同比增长21%,动力煤公司一季度的业绩大概率会上调。尽管后续煤\
价调控的压力在加大,但近期效果可能不明显,中期有待观察。煤炭板块2018年市盈率15倍,估值不贵,且存在继续上调\
盈利预测和估值下行的可能,股价仍有空间。继续推荐动力煤龙头陕西煤业(8.340, -0.77, -8.45%)(601225.SH)、\
兖州煤业(15.150, -1.24, -7.57%)(600803.SH)、中国神华(24.290, -1.16, -4.56%)(601088.SH),以及优质\
的国企改革兼并重组题材股潞安环能(11.590, -1.11, -8.74%)(601699.SH)、山西焦化(12.420, -1.38, -10.00%\
)(600740.SH)、山煤国际(4.520, -0.50, -9.96%)(600546.SH)、阳泉煤业(7.780, -0.86, -9.95%)(600348.SH)\
。',\
'郭文仓到重点工程项目督导检查 2月2日,公司党委书记、董事长、总经理郭文仓,公司董事,股份公司副总经理、总工程师、\
郭毅民,股份公司副总经理张国富、柴高贵及相关单位负责人到焦化厂煤场全封闭和1#—4#干熄焦等重点工程项目建设工地\
督导检查施工进度和安全工作情况。郭文仓一行实地查看并详细了解了现场施工情况,询问了施工队伍人员状况,他说,\
煤场全封闭项目和1#—4#干熄焦项目是公司的重点环保项目,一定要力争将重点工程项目建成精品工程、一流环保标杆项目\
。近日天气寒冷,又临近春节,煤场全封闭项目进入收尾的关键阶段,施工负责人要紧绷安全弦,加强现场安全管理,从细节抓\
起,消除隐患,确保收尾工作安全稳定顺利。1#—4#干熄焦项目在大面积开工的重要时期,一定要统筹安排项目进度和质量\
管理,落实好冬季防护措施,管控好每一道施工环节,目前尤其要注重人员的思想状况,做到不安全不施工,保证施工安全和人\
员人身安全,确保项目“安全无事故、质量全达标、进度按计划、投资不超概、投产即达效、竣工不留尾、审计无问题、廉政建\
设好”,为公司打造成全国独立焦化旗舰企业奠定坚实的基础。']
    # Per-stock output directory layout: stock_dict_file/<code>/<code>_dict.dict
    # and stock_dict_file/<code>/<code>_bowvec.mm
    DictPath = os.getcwd() + '\\' + 'stock_dict_file'
    stockCode = '600740'
    print(DictPath)
    print(DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict')
    print(DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm')
    # Create the per-stock output directory on first run.
    if not os.path.exists(DictPath+'\\'+stockCode):
        os.makedirs(DictPath+'\\'+stockCode)
    # Build and persist the dictionary and bow-vectors for the sample docs.
    tp.genDictionary(doc,saveDict=True,saveDictPath=DictPath+'\\'+stockCode+'\\'+stockCode+'_dict.dict',\
        saveBowvec=True,saveBowvecPath=DictPath+'\\'+stockCode+'\\'+stockCode+'_bowvec.mm',returnValue=False)
================================================
FILE: legacy_v1/finance_dict.txt
================================================
================================================
FILE: legacy_v1/run_crawler_cnstock.py
================================================
from Crawler.crawler_cnstock import WebCrawlFromcnstock
if __name__ == '__main__':
    # Crawl three cnstock channels into MongoDB (Cnstock_Stock database).
    crawler = WebCrawlFromcnstock(IP="localhost", PORT=27017, ThreadsNum=4,
                                  dbName="Cnstock_Stock",
                                  collectionName="cnstock_news_company")
    channels = [
        (621, 10, 1, 'http://company.cnstock.com/company/scp_gsxw/'),
        (112, 10, 0, 'http://ggjd.cnstock.com/gglist/search/qmtbbdj/'),
        (116, 10, 0, 'http://ggjd.cnstock.com/gglist/search/ggkx/'),
    ]
    for pages, arg2, arg3, channel_url in channels:
        crawler.coroutine_run(pages, arg2, arg3, url_Part_1=channel_url)  # Obj.multi_threads_run()
================================================
FILE: legacy_v1/run_crawler_jrj.py
================================================
from Crawler.crawler_jrj import WebCrawlFromjrj
if __name__ == '__main__':
    # Crawl jrj.com company news for the given date range into MongoDB.
    jrj_crawler = WebCrawlFromjrj("2009-01-05", "2018-02-03", 100,
                                  ThreadsNum=4, IP="localhost", PORT=27017,
                                  dbName="Jrj_Stock",
                                  collectionName="jrj_news_company")
    jrj_crawler.coroutine_run()  # alternatives: single_run() / multi_threads_run()
================================================
FILE: legacy_v1/run_crawler_nbd.py
================================================
from Crawler.crawler_nbd import WebCrawlFromNBD
if __name__ == '__main__':
    # Crawl NBD company news; re-crawl pages that came back empty.
    nbd_crawler = WebCrawlFromNBD(2871, 10, ThreadsNum=4, IP="localhost",
                                  PORT=27017, dbName='NBD_Stock',
                                  collectionName="nbd_news_company")
    pending_news_urls = nbd_crawler.coroutine_run()  # alternatives: single_run() / multi_threads_run()
    if pending_news_urls != []:
        print(' -------------------- Re-Crawl News List Pages -------------------- ')
        pending_article_urls, pending_article_titles = nbd_crawler.ReCrawlNews(pending_news_urls)
        if pending_article_urls != [] or pending_article_titles != []:
            print(' -------------------- Re-Crawl Article Pages -------------------- ')
            nbd_crawler.ReCrawlArticles(pending_article_urls, pending_article_titles)
================================================
FILE: legacy_v1/run_crawler_sina.py
================================================
from Crawler.crawler_sina import WebCrawlFromSina
if __name__ == '__main__':
    # Crawl Sina stock news into MongoDB (Sina_Stock database).
    sina_crawler = WebCrawlFromSina(5000, 100, ThreadsNum=4,
                                    IP="localhost", PORT=27017,
                                    dbName="Sina_Stock",
                                    collectionName="sina_news_company")
    sina_crawler.coroutine_run()  # alternatives: single_run() / multi_threads_run()
================================================
FILE: legacy_v1/run_crawler_stcn.py
================================================
from Crawler.crawler_stcn import WebCrawlFromstcn
if __name__ == '__main__':
    # Crawl five stcn.com channels into MongoDB (Stcn_Stock database).
    stcn_crawler = WebCrawlFromstcn(IP="localhost", PORT=27017, ThreadsNum=4,
                                    dbName="Stcn_Stock",
                                    collectionName="stcn_news_company")
    for channel_url in ('http://company.stcn.com/gsxw/',
                        'http://stock.stcn.com/xingu/',
                        'http://stock.stcn.com/zhuli/',
                        'http://stock.stcn.com/bankuai/',
                        'http://stock.stcn.com/dapan/'):
        stcn_crawler.coroutine_run(20, 1, 1, url_Part_1=channel_url)
================================================
FILE: legacy_v1/run_crawler_tushare.py
================================================
from Crawler.crawler_tushare import CrawlStockData
if __name__ == '__main__':
    import time  # BUG FIX: time.time() was used below but time was never imported

    t1 = time.time()
    # Initiate the crawler backed by the local MongoDB instance.
    Obj = CrawlStockData(IP="localhost", PORT=27017)
    # Get basic infos of stocks into Stock/Basic_Info.
    Obj.getStockBasicFromTushare("Stock", "Basic_Info")
    # Extract stocks' code.
    Code = Obj.extractData('Stock', 'Basic_Info', ['code'])[0]
    # Get stock price history from Tushare, one collection per code.
    for stockcode in Code:
        Obj.getStockDayHistory('Stock', stockcode)
        print(' [*] ' + stockcode + ' has finished storing ... ')
    t2 = time.time()
    print(' running time:', t2 - t1)
================================================
FILE: legacy_v1/run_main.py
================================================
import time, datetime, threading
from concurrent import futures
from Crawler.crawler_sina import WebCrawlFromSina
from Crawler.crawler_jrj import WebCrawlFromjrj
from Crawler.crawler_cnstock import WebCrawlFromcnstock
from Crawler.crawler_stcn import WebCrawlFromstcn
import Text_Analysis.text_mining as tm
def crawlers(web):
    """Start the real-time news classifier for one source site.

    # Arguments:
        web: one of 'sina', 'jrj', 'cnstock', 'stcn'; anything else is a no-op.
    """
    if web == 'sina':
        spyder = WebCrawlFromSina(5000, 100, ThreadsNum=4, IP="localhost", PORT=27017,
                                  dbName="Sina_Stock", collectionName="sina_news_company")
    elif web == 'jrj':
        spyder = WebCrawlFromjrj("2009-01-05", "2018-02-03", 100, ThreadsNum=4,
                                 IP="localhost", PORT=27017,
                                 dbName="Jrj_Stock", collectionName="jrj_news_company")
    elif web == 'cnstock':
        spyder = WebCrawlFromcnstock(IP="localhost", PORT=27017, ThreadsNum=4,
                                     dbName="Cnstock_Stock", collectionName="cnstock_news_company")
    elif web == 'stcn':
        spyder = WebCrawlFromstcn(IP="localhost", PORT=27017, ThreadsNum=4,
                                  dbName="Stcn_Stock", collectionName="stcn_news_company")
    else:
        return
    spyder.classifyRealtimeStockNews()
if __name__ == '__main__':
    # Step 1. Initiate
    text_mining_obj = tm.TextMining(IP="localhost", PORT=27017)
    # Step 2. Extract relevant stock codes of news(articles) from each database
    text_mining_obj.extractStockCodeFromArticle("NBD_Stock", "nbd_news_company")  # NBD news
    text_mining_obj.extractStockCodeFromArticle("Cnstock_Stock", "cnstock_news_company")  # cnstock news
    text_mining_obj.extractStockCodeFromArticle("Stcn_Stock", "stcn_news_company")  # stcn news
    text_mining_obj.extractStockCodeFromArticle("Jrj_Stock", "jrj_news_company")  # jrj news
    # Step 3. Extract all news related to each stock into Stock_News
    # (this step will take a long time)
    codeLst = text_mining_obj.extractData("Stock", "Basic_Info", ['code']).code

    news_sources = [("NBD_Stock", "nbd_news_company"), ("Sina_Stock", "sina_news_company"),
                    ("Cnstock_Stock", "cnstock_news_company"), ("Stcn_Stock", "stcn_news_company"),
                    ("Jrj_Stock", "jrj_news_company")]

    def _extract_batch(codes):
        # One worker thread per stock code; join before the next batch starts.
        thread_lst = []
        for stockcode in codes:
            thread = threading.Thread(target=text_mining_obj.getNewsOfSpecificStock,
                                      args=(news_sources, stockcode),
                                      kwargs={"export": ['database', 'Stock_News', stockcode],
                                              "judgeTerm": 3})
            thread_lst.append(thread)
        for thread in thread_lst:
            thread.start()
        for thread in thread_lst:
            thread.join()
        # BUG FIX: the original concatenated a str with a sequence slice,
        # which raises TypeError; stringify the batch before printing.
        print(' [*] have extracted ' + str(list(codes)))

    Range = 10
    Idx = 0
    while Idx < len(codeLst):
        _extract_batch(codeLst[Idx:Idx+Range])
        Idx += Range
    # NOTE(review): the original repeated the batch logic once more for
    # codeLst[Idx:], but after the loop Idx >= len(codeLst) so the slice is
    # empty; the helper treats an empty batch as a no-op, preserving that.
    _extract_batch(codeLst[Idx:])
    # Step 4. Crawl real-time news from 'web_list' and make classification
    web_list = ['sina', 'jrj', 'cnstock', 'stcn']
    with futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_to_url = {executor.submit(crawlers, param):
                         ind for ind, param in enumerate(web_list)}
================================================
FILE: legacy_v1/src/Gon/__init__.py
================================================
import os
import sys
def add_path(path):
    """Prepend *path* to sys.path unless it is already present."""
    if path in sys.path:
        return
    sys.path.insert(0, path)
# Make the package importable regardless of how the scripts are launched.
# add `./src` dir to system path
# NOTE(review): src_dir_1 is derived from the current working directory, not
# from this file's location — confirm callers always run from `src/Gon`.
src_dir_1 = os.path.abspath(os.path.join(os.getcwd(), "../"))
# add `./src/Gon` dir to system path
src_dir_2 = os.path.dirname(__file__)
add_path(src_dir_1)
add_path(src_dir_2)
================================================
FILE: legacy_v1/src/Gon/cnstockspyder.py
================================================
"""
中国证券网:https://www.cnstock.com
公司聚焦:https://company.cnstock.com/company/scp_gsxw
公告解读:https://ggjd.cnstock.com/gglist/search/qmtbbdj
公告快讯:https://ggjd.cnstock.com/gglist/search/ggkx
利好公告:https://ggjd.cnstock.com/company/scp_ggjd/tjd_sdlh
"""
import __init__
from spyder import Spyder
from Kite import utils
from Kite import config
from Kite.database import Database
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Leorio.tokenization import Tokenization
import re
import time
import json
import redis
import random
import logging
import threading
from bs4 import BeautifulSoup
from selenium import webdriver
# Module-level logging setup: INFO level with timestamp, file name and line
# number in every record.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')
class CnStockSpyder(Spyder):
    """Crawler for cnstock.com news channels.

    Crawled items are stored in MongoDB via ``Database``; real-time items are
    additionally pushed onto a Redis list for downstream consumers.
    Indentation below is reconstructed from the flattened dump — NOTE(review):
    verify nesting against the original repository file.
    """

    def __init__(self, database_name, collection_name):
        super(CnStockSpyder, self).__init__()
        self.db_obj = Database()
        self.col = self.db_obj.conn[database_name].get_collection(collection_name)
        # Counts consecutive failed requests; drives the linear back-off below.
        self.terminated_amount = 0
        self.db_name = database_name
        self.col_name = collection_name
        self.tokenization = Tokenization(import_module="jieba", user_dict=config.USER_DEFINED_DICT_PATH)
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_NEWS_REDIS_DB_ID)

    def get_url_info(self, url):
        """Fetch one article page and extract its date and plain-text body.

        Returns ``[date, article]`` on success, or ``False`` when the page
        could not be fetched.  ``article`` may be an empty string when no
        paragraph passes the ``is_article_prob`` threshold.
        """
        try:
            bs = utils.html_parser(url)
        except Exception:
            return False
        span_list = bs.find_all("span")
        part = bs.find_all("p")
        article = ""
        date = ""
        # The publication date is carried by the first <span class="timer">.
        for span in span_list:
            if "class" in span.attrs and span["class"] == ["timer"]:
                date = span.text
                break
        # Keep only paragraphs whose Chinese-character ratio exceeds the
        # threshold (is_article_prob presumably set on the Spyder base class
        # — confirm).
        for paragraph in part:
            chn_status = utils.count_chn(str(paragraph))
            possible = chn_status[1]
            if possible > self.is_article_prob:
                article += str(paragraph)
        # Strip every remaining <...> tag from the concatenated paragraphs.
        while article.find("<") != -1 and article.find(">") != -1:
            string = article[article.find("<"):article.find(">")+1]
            article = article.replace(string, "")
        # Drop ideographic (full-width) spaces, then collapse whitespace runs.
        while article.find("\u3000") != -1:
            article = article.replace("\u3000", "")
        article = " ".join(re.split(" +|\n+", article)).strip()
        return [date, article]

    def get_historical_news(self, url, category_chn=None, start_date=None):
        """
        :param url: listing page to crawl
        :param category_chn: Chinese category name, one of '公司聚焦',
            '公告解读', '公告快讯', '利好公告'
        :param start_date: date of the most recent item of this category
            already in the database; ``None`` means crawl everything
        """
        assert category_chn is not None
        driver = webdriver.Chrome(executable_path=config.CHROME_DRIVER)
        btn_more_text = ""
        crawled_urls_list = self.extract_data(["Url"])[0]
        logging.info("historical data length -> {} ... ".format(len(crawled_urls_list)))
        # crawled_urls_list = []
        driver.get(url)
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        # Mapping of stock name -> stock code, used for article tagging.
        name_code_dict = dict(name_code_df.values)
        if start_date is None:
            # Full crawl: keep clicking the "load more" button until the page
            # reports "没有更多" (no more), then harvest every listed item.
            while btn_more_text != "没有更多":
                more_btn = driver.find_element_by_id('j_more_btn')
                btn_more_text = more_btn.text
                logging.info("1-{}".format(more_btn.text))
                if btn_more_text == "加载更多":
                    more_btn.click()
                    time.sleep(random.random())  # sleep random time less 1s
                elif btn_more_text == "加载中...":
                    # Page is still loading: wait a bit longer and re-check.
                    time.sleep(random.random()+2)
                    more_btn = driver.find_element_by_id('j_more_btn')
                    btn_more_text = more_btn.text
                    logging.info("2-{}".format(more_btn.text))
                    if btn_more_text == "加载更多":
                        more_btn.click()
                else:
                    more_btn.click()
                    break
            bs = BeautifulSoup(driver.page_source, "html.parser")
            for li in bs.find_all("li", attrs={"class": ["newslist"]}):
                a = li.find_all("h2")[0].find("a")
                if a["href"] not in crawled_urls_list:
                    result = self.get_url_info(a["href"])
                    # Retry with linearly growing back-off until the server
                    # answers or the retry budget is exhausted.
                    while not result:
                        self.terminated_amount += 1
                        if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that keep failing.
                            with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info("rejected by remote server longer than {} minutes, "
                                         "and the failed url has been written in path {}"
                                         .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                 config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info("rejected by remote server, request {} again after "
                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed even after retries.
                        logging.info("[FAILED] {} {}".format(a["title"], a["href"]))
                    else:
                        # Fetched, but the body may still be empty: relax the
                        # paragraph threshold step by step and retry.
                        date, article = result
                        while article == "" and self.is_article_prob >= .1:
                            self.is_article_prob -= .1
                            result = self.get_url_info(a["href"])
                            while not result:
                                self.terminated_amount += 1
                                if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that keep failing.
                                    with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                        file.write("{}\n".format(a["href"]))
                                    logging.info("rejected by remote server longer than {} minutes, "
                                                 "and the failed url has been written in path {}"
                                                 .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                         config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                                    break
                                logging.info("rejected by remote server, request {} again after "
                                             "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"])
                            # NOTE(review): if the retry loop exits via break,
                            # result is still False and this unpack raises
                            # TypeError — confirm intended.
                            date, article = result
                        # Restore the default paragraph threshold.
                        self.is_article_prob = .5
                        if article != "":
                            related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                              name_code_dict)
                            data = {"Date": date,
                                    "Category": category_chn,
                                    "Url": a["href"],
                                    "Title": a["title"],
                                    "Article": article,
                                    "RelatedStockCodes": " ".join(related_stock_codes_list)}
                            # self.col.insert_one(data)
                            self.db_obj.insert_data(self.db_name, self.col_name, data)
                            logging.info("[SUCCESS] {} {} {}".format(date, a["title"], a["href"]))
        else:
            # start_date given: back-fill only the items newer than start_date.
            is_click_button = True
            start_get_url_info = False
            tmp_a = None
            while is_click_button:
                bs = BeautifulSoup(driver.page_source, "html.parser")
                for li in bs.find_all("li", attrs={"class": ["newslist"]}):
                    a = li.find_all("h2")[0].find("a")
                    # Skip items already inspected in the previous round;
                    # tmp_a marks where the previous round stopped.
                    if tmp_a is not None and a["href"] != tmp_a:
                        continue
                    elif tmp_a is not None and a["href"] == tmp_a:
                        start_get_url_info = True
                    if start_get_url_info:
                        # NOTE(review): get_url_info may return False, which
                        # would make this unpack raise — confirm.
                        date, _ = self.get_url_info(a["href"])
                        if date <= start_date:
                            is_click_button = False
                            break
                tmp_a = a["href"]
                if is_click_button:
                    more_btn = driver.find_element_by_id('j_more_btn')
                    more_btn.click()
            # Everything from the top of the list down to tmp_a (exclusive)
            # is new and gets crawled now.
            bs = BeautifulSoup(driver.page_source, "html.parser")
            for li in bs.find_all("li", attrs={"class": ["newslist"]}):
                a = li.find_all("h2")[0].find("a")
                if a["href"] != tmp_a:
                    result = self.get_url_info(a["href"])
                    while not result:
                        self.terminated_amount += 1
                        if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that keep failing.
                            with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info("rejected by remote server longer than {} minutes, "
                                         "and the failed url has been written in path {}"
                                         .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                 config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info("rejected by remote server, request {} again after "
                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed even after retries.
                        logging.info("[FAILED] {} {}".format(a["title"], a["href"]))
                    else:
                        # Fetched, but the body may still be empty: relax the
                        # paragraph threshold step by step and retry.
                        date, article = result
                        while article == "" and self.is_article_prob >= .1:
                            self.is_article_prob -= .1
                            result = self.get_url_info(a["href"])
                            while not result:
                                self.terminated_amount += 1
                                if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that keep failing.
                                    with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                        file.write("{}\n".format(a["href"]))
                                    logging.info("rejected by remote server longer than {} minutes, "
                                                 "and the failed url has been written in path {}"
                                                 .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                         config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                                    break
                                logging.info("rejected by remote server, request {} again after "
                                             "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"])
                            # NOTE(review): same potential False unpack as above.
                            date, article = result
                        self.is_article_prob = .5
                        if article != "":
                            related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                              name_code_dict)
                            data = {"Date": date,
                                    "Category": category_chn,
                                    "Url": a["href"],
                                    "Title": a["title"],
                                    "Article": article,
                                    "RelatedStockCodes": " ".join(related_stock_codes_list)}
                            # self.col.insert_one(data)
                            self.db_obj.insert_data(self.db_name, self.col_name, data)
                            logging.info("[SUCCESS] {} {} {}".format(date, a["title"], a["href"]))
                else:
                    break
        driver.quit()

    def get_realtime_news(self, url, category_chn=None, interval=60):
        """Poll *url* every *interval* seconds and store/publish unseen items.

        New items are written to MongoDB and pushed onto the Redis list
        ``config.CACHE_NEWS_LIST_NAME`` as JSON.  Runs forever.
        """
        logging.info("start real-time crawling of URL -> {}, request every {} secs ... ".format(url, interval))
        assert category_chn is not None
        # TODO: cnstock volume is small, so for now the whole history is
        # loaded for de-duplication; the strategy will be revised later.
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        crawled_urls = self.db_obj.get_data(self.db_name,
                                            self.col_name,
                                            keys=["Url"])["Url"].to_list()
        while True:
            # Poll the listing page at a fixed interval.
            bs = utils.html_parser(url)
            for li in bs.find_all("li", attrs={"class": ["newslist"]}):
                a = li.find_all("h2")[0].find("a")
                if a["href"] not in crawled_urls:  # latest_3_days_crawled_href
                    result = self.get_url_info(a["href"])
                    while not result:
                        self.terminated_amount += 1
                        if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                            # Persist URLs that keep failing.
                            with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                file.write("{}\n".format(a["href"]))
                            logging.info("rejected by remote server longer than {} minutes, "
                                         "and the failed url has been written in path {}"
                                         .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                 config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                            break
                        logging.info("rejected by remote server, request {} again after "
                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                        time.sleep(60 * self.terminated_amount)
                        result = self.get_url_info(a["href"])
                    if not result:
                        # Crawl failed even after retries.
                        logging.info("[FAILED] {} {}".format(a["title"], a["href"]))
                    else:
                        # Fetched, but the body may still be empty: relax the
                        # paragraph threshold step by step and retry.
                        date, article = result
                        while article == "" and self.is_article_prob >= .1:
                            self.is_article_prob -= .1
                            result = self.get_url_info(a["href"])
                            while not result:
                                self.terminated_amount += 1
                                if self.terminated_amount > config.CNSTOCK_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that keep failing.
                                    with open(config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                        file.write("{}\n".format(a["href"]))
                                    logging.info("rejected by remote server longer than {} minutes, "
                                                 "and the failed url has been written in path {}"
                                                 .format(config.CNSTOCK_MAX_REJECTED_AMOUNTS,
                                                         config.RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH))
                                    break
                                logging.info("rejected by remote server, request {} again after "
                                             "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"])
                            # NOTE(review): same potential False unpack as in
                            # get_historical_news — confirm.
                            date, article = result
                        self.is_article_prob = .5
                        if article != "":
                            related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                              name_code_dict)
                            self.db_obj.insert_data(self.db_name, self.col_name,
                                                    {"Date": date,
                                                     "Category": category_chn,
                                                     "Url": a["href"],
                                                     "Title": a["title"],
                                                     "Article": article,
                                                     "RelatedStockCodes": " ".join(related_stock_codes_list)})
                            # Publish the same payload to Redis for consumers.
                            self.redis_client.lpush(config.CACHE_NEWS_LIST_NAME, json.dumps(
                                {"Date": date,
                                 "Category": category_chn,
                                 "Url": a["href"],
                                 "Title": a["title"],
                                 "Article": article,
                                 "RelatedStockCodes": " ".join(related_stock_codes_list),
                                 "OriDB": config.DATABASE_NAME,
                                 "OriCOL": config.COLLECTION_NAME_CNSTOCK
                                 }
                            ))
                            logging.info("[SUCCESS] {} {} {}".format(date, a["title"], a["href"]))
                    # Mark as seen so the URL is not reprocessed on the next
                    # poll — NOTE(review): reconstructed placement; confirm it
                    # sits at this nesting level in the original file.
                    crawled_urls.append(a["href"])
            # logging.info("sleep {} secs then request {} again ... ".format(interval, url))
            time.sleep(interval)
# """
# Example-1:
# 爬取历史新闻数据
# """
# if __name__ == '__main__':
# import time
# import logging
# from Kite import config
# from Killua.denull import DeNull
# from Killua.deduplication import Deduplication
# from Gon.cnstockspyder import CnStockSpyder
#
# cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
# for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
# logging.info("start crawling {} ...".format(url_to_be_crawled))
# cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
# logging.info("finished ...")
# time.sleep(30)
#
# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# """
# Example-2:
# 爬取实时新闻数据
# """
# if __name__ == '__main__':
# import time, logging, threading
# from Kite import config
# from Kite.database import Database
# from Killua.denull import DeNull
# from Killua.deduplication import Deduplication
# from Gon.cnstockspyder import CnStockSpyder
#
# obj = Database()
# df = obj.get_data(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK, keys=["Date", "Category"])
#
# cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
# # 先补充历史数据,比如已爬取数据到2020-12-01,但是启动实时爬取程序在2020-12-23,则先
# # 自动补充爬取2020-12-02至2020-12-23的新闻数据
# for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
# # 查询type_chn的最近一条数据的时间
# latets_date_in_db = max(df[df.Category == type_chn]["Date"].to_list())
# cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn, start_date=latets_date_in_db)
#
# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
#
# # 开启多线程并行实时爬取
# thread_list = []
# for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
# thread = threading.Thread(target=cnstock_spyder.get_realtime_news, args=(url, type_chn, 60))
# thread_list.append(thread)
# for thread in thread_list:
# thread.start()
# for thread in thread_list:
# thread.join()
================================================
FILE: legacy_v1/src/Gon/history_starter_cnstock.py
================================================
import __init__
import time
import logging
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Killua.buildstocknewsdb import GenStockNewsDB
from Gon.cnstockspyder import CnStockSpyder
# 1. Crawl historical news for every configured cnstock channel.
cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    logging.info("start crawling {} ...".format(url_to_be_crawled))
    cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
    logging.info("finished ...")
    time.sleep(30)  # wait 30s between channels
# 2. De-duplicate the crawled history.
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# 3. Drop rows containing null values from the history.
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
# 4. Build a new per-stock database: collect all news mentioning each stock
#    and label each item as positive ("利好"), negative ("利空") or
#    neutral ("中性").
gen_stock_news_db = GenStockNewsDB()
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
================================================
FILE: legacy_v1/src/Gon/history_starter_jrj.py
================================================
import __init__
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Killua.buildstocknewsdb import GenStockNewsDB
from Gon.jrjspyder import JrjSpyder
# Step 1: crawl the JRJ news history starting from 2015-01-01.
spider = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
spider.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, start_date="2015-01-01")
# Step 2: de-duplicate the crawled history.
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
# Step 3: drop rows that contain null values.
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
# Step 4: build the per-stock news database, collecting every article that
# mentions each stock and tagging it 利好 / 利空 / 中性 (positive/negative/neutral).
stock_news_builder = GenStockNewsDB()
stock_news_builder.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
================================================
FILE: legacy_v1/src/Gon/history_starter_nbd.py
================================================
import __init__
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Killua.buildstocknewsdb import GenStockNewsDB
from Gon.nbdspyder import NbdSpyder
# Step 1: crawl the NBD news history, walking index pages from 684 down to 1.
spider = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
spider.get_historical_news(start_page=684)
# Step 2: de-duplicate the crawled history.
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# Step 3: drop rows that contain null values.
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# Step 4: build the per-stock news database, collecting every article that
# mentions each stock and tagging it 利好 / 利空 / 中性 (positive/negative/neutral).
stock_news_builder = GenStockNewsDB()
stock_news_builder.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
================================================
FILE: legacy_v1/src/Gon/history_starter_stock_price.py
================================================
import __init__
from Kite import config
from Gon.stockinfospyder import StockInfoSpyder
price_spider = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
# An explicit range can be given, e.g.:
#     price_spider.get_historical_news(start_date="20150101", end_date="20201204")
# With no range, and with some data already in the database, crawling resumes
# from the day after the newest stored record up to today — e.g. if sh600000
# prices exist through 2020-12-03, data from 2020-12-04 onward is fetched.
price_spider.get_historical_news()
================================================
FILE: legacy_v1/src/Gon/ifengspyder.py
================================================
"""
凤凰财经网:https://finance.ifeng.com
上市公司:https://finance.ifeng.com/shanklist/1-62-83-
大盘评述:https://finance.ifeng.com/shanklist/1-62-85-
证券要闻:https://finance.ifeng.com/shanklist/1-62-84-
"""
================================================
FILE: legacy_v1/src/Gon/jrjspyder.py
================================================
"""
金融界:http://www.jrj.com.cn
股票频道全部新闻:http://stock.jrj.com.cn/xwk/202012/20201203_1.shtml
"""
import __init__
from spyder import Spyder
from Kite import utils
from Kite import config
from Kite.database import Database
from Leorio.tokenization import Tokenization
import time
import json
import redis
import datetime
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class JrjSpyder(Spyder):
    """Spider for JRJ (金融界, http://www.jrj.com.cn) stock-channel news.

    Articles are stored in MongoDB with fields Date / Url / Title / Article /
    RelatedStockCodes.  URLs already saved today are cached in a redis list so
    that the realtime poller does not re-insert them after a restart.
    """

    def __init__(self, database_name, collection_name):
        super(JrjSpyder, self).__init__()
        self.db_obj = Database()
        self.col = self.db_obj.conn[database_name].get_collection(collection_name)
        # Counts consecutive failed requests for the URL currently being retried.
        self.terminated_amount = 0
        self.db_name = database_name
        self.col_name = collection_name
        self.tokenization = Tokenization(import_module="jieba", user_dict=config.USER_DEFINED_DICT_PATH)
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_NEWS_REDIS_DB_ID)

    def get_url_info(self, url, specific_date):
        """Fetch one article page and return ``[date, article_text]``.

        Returns ``False`` when the page cannot be fetched or parsed.  If the
        page carries no date marker span, ``specific_date`` is used instead.
        """
        try:
            bs = utils.html_parser(url)
        except Exception:
            return False
        date = ""
        for span in bs.find_all("span"):
            # The publication date is wrapped in a marker span on JRJ pages.
            if span.contents[0] == "jrj_final_date_start":
                date = span.text.replace("\r", "").replace("\n", "")
                break
        if date == "":
            date = specific_date
        article = ""
        for p in bs.find_all("p"):
            # Keep only plain body paragraphs: no navigation markers, form
            # inputs, highlighted ("red") links, icons or nested spans.
            if not p.find_all("jrj_final_daohang_start") and p.attrs == {} and \
                    not p.find_all("input") and not p.find_all("a", attrs={"class": "red"}) and not p.find_all("i") and not p.find_all("span"):
                # if p.contents[0] != "jrj_final_daohang_start1" and p.attrs == {} and \
                #         not p.find_all("input") and not p.find_all("a", attrs={"class": "red"}) and not p.find_all("i"):
                article += p.text.replace("\r", "").replace("\n", "").replace("\u3000", "")
        return [date, article]

    def get_historical_news(self, url, start_date=None, end_date=None):
        """Crawl historical JRJ news between ``start_date`` and ``end_date``.

        Dates are "YYYY-MM-DD" strings.  When ``start_date`` is None, crawling
        resumes from the day after the newest Date stored in the collection
        (or ``config.JRJ_REQUEST_DEFAULT_DATE`` for an empty collection); when
        ``end_date`` is None it defaults to today.
        """
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        crawled_urls_list = []
        if end_date is None:
            end_date = datetime.datetime.now().strftime("%Y-%m-%d")
        if start_date is None:
            # Resume right after the newest stored date, e.g.
            #   history_latest_date_str -> "2020-12-08"  =>  start_date -> "2020-12-09"
            history_latest_date_list = self.db_obj.get_data(self.db_name,
                                                            self.col_name,
                                                            keys=["Date"])["Date"].to_list()
            if len(history_latest_date_list) != 0:
                history_latest_date_str = max(history_latest_date_list).split(" ")[0]
                history_latest_date_dt = datetime.datetime.strptime(history_latest_date_str, "%Y-%m-%d").date()
                offset = datetime.timedelta(days=1)
                start_date = (history_latest_date_dt + offset).strftime('%Y-%m-%d')
            else:
                start_date = config.JRJ_REQUEST_DEFAULT_DATE
        dates_list = utils.get_date_list_from_range(start_date, end_date)
        dates_separated_into_ranges_list = utils.gen_dates_list(dates_list, config.JRJ_DATE_RANGE)
        for dates_range in dates_separated_into_ranges_list:
            for date in dates_range:
                # Index pages are named <url>/YYYYMM/YYYYMMDD_<page>.shtml.
                first_url = "{}/{}/{}_1.shtml".format(url, date.replace("-", "")[0:6], date.replace("-", ""))
                max_pages_num = utils.search_max_pages_num(first_url, date)
                for num in range(1, max_pages_num + 1):
                    _url = "{}/{}/{}_{}.shtml".format(url, date.replace("-", "")[0:6], date.replace("-", ""), str(num))
                    bs = utils.html_parser(_url)
                    a_list = bs.find_all("a")
                    for a in a_list:
                        if "href" in a.attrs and a.string and \
                                a["href"].find("/{}/{}/".format(date.replace("-", "")[:4],
                                                                date.replace("-", "")[4:6])) != -1:
                            if a["href"] not in crawled_urls_list:
                                # Skip titles containing "收盘" (market close), "报于" (quoted at)
                                # or "新三板挂牌上市": such items are mostly machine-generated.
                                if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                        a.string.find("新三板挂牌上市") == -1:
                                    result = self.get_url_info(a["href"], date)
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                            # Persist URLs that keep failing so they can be retried later.
                                            with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"], date)
                                    if not result:
                                        # Gave up on this URL entirely.
                                        logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                                    else:
                                        # Request succeeded but the body may be empty: lower the
                                        # is_article_prob threshold step by step and refetch.
                                        article_specific_date, article = result
                                        while article == "" and self.is_article_prob >= .1:
                                            self.is_article_prob -= .1
                                            result = self.get_url_info(a["href"], date)
                                            while not result:
                                                self.terminated_amount += 1
                                                if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                    # Persist URLs that keep failing so they can be retried later.
                                                    with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                        file.write("{}\n".format(a["href"]))
                                                    logging.info("rejected by remote server longer than {} minutes, "
                                                                 "and the failed url has been written in path {}"
                                                                 .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                         config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                                    break
                                                logging.info("rejected by remote server, request {} again after "
                                                             "{} seconds...".format(a["href"],
                                                                                    60 * self.terminated_amount))
                                                time.sleep(60 * self.terminated_amount)
                                                result = self.get_url_info(a["href"], date)
                                            # NOTE(review): if the retry loop above exits via `break`,
                                            # `result` is still False and this unpacking raises TypeError
                                            # — confirm whether that path can occur in practice.
                                            article_specific_date, article = result
                                        self.is_article_prob = .5
                                        if article != "":
                                            related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                              name_code_dict)
                                            data = {"Date": article_specific_date,
                                                    "Url": a["href"],
                                                    "Title": a.string,
                                                    "Article": article,
                                                    "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                            # self.col.insert_one(data)
                                            self.db_obj.insert_data(self.db_name, self.col_name, data)
                                            logging.info("[SUCCESS] {} {} {}".format(article_specific_date,
                                                                                     a.string,
                                                                                     a["href"]))
                                            self.terminated_amount = 0  # reset the failure counter after a successful save
                                        else:
                                            logging.info("[QUIT] {}".format(a.string))

    def get_realtime_news(self, interval=60):
        """Poll today's JRJ index pages every ``interval`` seconds.

        New articles are written to MongoDB and also pushed onto the redis
        queue ``config.CACHE_NEWS_LIST_NAME`` for downstream consumers.  URLs
        already handled today are remembered in the redis list
        ``config.CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME``, which is drained when
        the calendar date changes.  Runs forever.
        """
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        # crawled_urls_list = []
        is_change_date = False
        last_date = datetime.datetime.now().strftime("%Y-%m-%d")
        while True:
            today_date = datetime.datetime.now().strftime("%Y-%m-%d")
            if today_date != last_date:
                is_change_date = True
                last_date = today_date
            if is_change_date:
                # A new day started: empty the redis cache of today's saved URLs.
                # crawled_urls_list = []
                utils.batch_lpop(self.redis_client,
                                 config.CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME,
                                 self.redis_client.llen(config.CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME))
                is_change_date = False
            _url = "{}/{}/{}_1.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                             today_date.replace("-", "")[0:6],
                                             today_date.replace("-", ""))
            max_pages_num = utils.search_max_pages_num(_url, today_date)
            for num in range(1, max_pages_num + 1):
                _url = "{}/{}/{}_{}.shtml".format(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ,
                                                  today_date.replace("-", "")[0:6],
                                                  today_date.replace("-", ""),
                                                  str(num))
                bs = utils.html_parser(_url)
                a_list = bs.find_all("a")
                for a in a_list:
                    if "href" in a.attrs and a.string and \
                            a["href"].find("/{}/{}/".format(today_date.replace("-", "")[:4],
                                                            today_date.replace("-", "")[4:6])) != -1:
                        # if a["href"] not in crawled_urls_list:
                        # NOTE(review): redis lrange returns a list of bytes while a["href"]
                        # is str, so this membership test appears to never be True unless the
                        # client uses decode_responses — verify against the redis configuration.
                        if a["href"] not in self.redis_client.lrange(config.CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME, 0, -1):
                            # Skip titles containing "收盘" (market close), "报于" (quoted at)
                            # or "新三板挂牌上市": such items are mostly machine-generated.
                            if a.string.find("收盘") == -1 and a.string.find("报于") == -1 and \
                                    a.string.find("新三板挂牌上市") == -1:
                                result = self.get_url_info(a["href"], today_date)
                                while not result:
                                    self.terminated_amount += 1
                                    if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                        # Persist URLs that keep failing so they can be retried later.
                                        with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                            file.write("{}\n".format(a["href"]))
                                        logging.info("rejected by remote server longer than {} minutes, "
                                                     "and the failed url has been written in path {}"
                                                     .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                             config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                        break
                                    logging.info("rejected by remote server, request {} again after "
                                                 "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                    time.sleep(60 * self.terminated_amount)
                                    result = self.get_url_info(a["href"], today_date)
                                if not result:
                                    # Gave up on this URL entirely.
                                    logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                                else:
                                    # Request succeeded but the body may be empty: lower the
                                    # is_article_prob threshold step by step and refetch.
                                    article_specific_date, article = result
                                    while article == "" and self.is_article_prob >= .1:
                                        self.is_article_prob -= .1
                                        result = self.get_url_info(a["href"], today_date)
                                        while not result:
                                            self.terminated_amount += 1
                                            if self.terminated_amount > config.JRJ_MAX_REJECTED_AMOUNTS:
                                                # Persist URLs that keep failing so they can be retried later.
                                                with open(config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                    file.write("{}\n".format(a["href"]))
                                                logging.info("rejected by remote server longer than {} minutes, "
                                                             "and the failed url has been written in path {}"
                                                             .format(config.JRJ_MAX_REJECTED_AMOUNTS,
                                                                     config.RECORD_JRJ_FAILED_URL_TXT_FILE_PATH))
                                                break
                                            logging.info("rejected by remote server, request {} again after "
                                                         "{} seconds...".format(a["href"],
                                                                                60 * self.terminated_amount))
                                            time.sleep(60 * self.terminated_amount)
                                            result = self.get_url_info(a["href"], today_date)
                                        # NOTE(review): as in get_historical_news, a `break` above
                                        # leaves `result` False and this unpacking raises TypeError.
                                        article_specific_date, article = result
                                    self.is_article_prob = .5
                                    if article != "":
                                        related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                          name_code_dict)
                                        self.db_obj.insert_data(self.db_name, self.col_name,
                                                                {"Date": article_specific_date,
                                                                 "Url": a["href"],
                                                                 "Title": a.string,
                                                                 "Article": article,
                                                                 "RelatedStockCodes": " ".join(related_stock_codes_list)})
                                        # Also publish to the shared realtime queue, tagged with origin DB/collection.
                                        self.redis_client.lpush(config.CACHE_NEWS_LIST_NAME, json.dumps(
                                            {"Date": article_specific_date,
                                             "Url": a["href"],
                                             "Title": a.string,
                                             "Article": article,
                                             "RelatedStockCodes": " ".join(related_stock_codes_list),
                                             "OriDB": config.DATABASE_NAME,
                                             "OriCOL": config.COLLECTION_NAME_JRJ
                                             }
                                        ))
                                        logging.info("[SUCCESS] {} {} {}".format(article_specific_date,
                                                                                 a.string,
                                                                                 a["href"]))
                                        self.terminated_amount = 0  # reset the failure counter after a successful save
                                    else:
                                        logging.info("[QUIT] {}".format(a.string))
                            # Remember this URL for the rest of the day, whether saved or skipped.
                            # crawled_urls_list.append(a["href"])
                            self.redis_client.lpush(config.CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME, a["href"])
            # logging.info("sleep {} secs then request again ... ".format(interval))
            time.sleep(interval)
# """
# Example-1:
# 爬取历史新闻数据
# """
# if __name__ == "__main__":
# jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
# jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, start_date="2015-01-01")
#
# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
# """
# Example-2:
# 爬取实时新闻数据
# """
# if __name__ == '__main__':
# from Kite import config
# from Gon.jrjspyder import JrjSpyder
#
# jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
# jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ) # 补充爬虫数据到最新日期
# jrj_spyder.get_realtime_news()
================================================
FILE: legacy_v1/src/Gon/kill_realtime_spyder_tasks.py
================================================
import __init__
import os
import wmi
import redis
import logging
from Kite import config
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class KillPyTasks(object):
    """Kill every python process whose command line matches a script name
    recorded in the redis bookkeeping list, then clear that list.
    """

    def __init__(self):
        self.redis_client = redis.StrictRedis(config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
        recorded_count = self.redis_client.llen(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR)
        for _id in range(recorded_count):
            script_name = self.redis_client.lindex(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, _id).decode()
            for process in self.get_python_process(param=script_name):
                self.killtask(process.Handle)
                self.print_pid_info(process)
        # Drain the bookkeeping list now that the recorded tasks are gone.
        for _ in range(self.redis_client.llen(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR)):
            self.redis_client.lpop(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR)

    @staticmethod
    def killtask(pid):
        # Force-kill the process (and its children via -t) on Windows.
        os.system(f"taskkill /F /pid {pid} -t")

    @staticmethod
    def get_python_process(prop="python.exe", param=None):
        """Return WMI process objects named `prop`; when `param` is given,
        keep only those whose command line contains it."""
        matches = []
        for process in wmi.WMI().Win32_Process(name=prop):
            if param is None or str(process.CommandLine).find(param) >= 0:
                matches.append(process)
        return matches

    @staticmethod
    def print_pid_info(process):
        # Log handle, caption and full command line of a killed process.
        logging.info("{} | {} | {} -> killed ... ".format(process.Handle, process.Caption, process.CommandLine))


if __name__ == "__main__":
    KillPyTasks()
================================================
FILE: legacy_v1/src/Gon/money163spyder.py
================================================
"""
网易财经网:https://money.163.com
个股资讯:http://money.163.com/special/g/00251LR5/gptj.html
市场资讯:http://money.163.com/special/00251LR5/cpznList.html
行业板块:http://money.163.com/special/00251LJV/hyyj.html
"""
================================================
FILE: legacy_v1/src/Gon/nbdspyder.py
================================================
"""
每经网:http://www.nbd.com.cn
A股动态:http://stocks.nbd.com.cn/columns/275/page/1
"""
import __init__
from spyder import Spyder
from Kite import utils
from Kite import config
from Kite.database import Database
from Leorio.tokenization import Tokenization
import re
import time
import json
import redis
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class NbdSpyder(Spyder):
    """Spider for NBD (每经网, http://www.nbd.com.cn) A-share news.

    Articles are stored in MongoDB with fields Date / Url / Title / Article /
    RelatedStockCodes; URLs already handled by the realtime poller are cached
    in a redis list so a restart does not re-insert them.
    """

    def __init__(self, database_name, collection_name):
        super(NbdSpyder, self).__init__()
        self.db_obj = Database()
        self.col = self.db_obj.conn[database_name].get_collection(collection_name)
        # Counts consecutive failed requests for the URL currently being retried.
        self.terminated_amount = 0
        self.db_name = database_name
        self.col_name = collection_name
        self.tokenization = Tokenization(import_module="jieba", user_dict=config.USER_DEFINED_DICT_PATH)
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_NEWS_REDIS_DB_ID)

    def get_url_info(self, url):
        """Fetch one article page and return ``[date, article_text]``.

        Returns ``False`` when the page cannot be fetched or parsed.  The date
        is assembled from the span with class "time"; paragraphs are kept when
        their Chinese-character ratio (utils.count_chn) exceeds
        ``self.is_article_prob``, then residual HTML tags and full-width
        spaces are stripped.
        """
        try:
            bs = utils.html_parser(url)
        except Exception:
            return False
        span_list = bs.find_all("span")
        part = bs.find_all("p")
        article = ""
        date = ""
        for span in span_list:
            if "class" in span.attrs and span.text and span["class"] == ["time"]:
                string = span.text.split()
                for dt in string:
                    # Date fragment looks like "YYYY-MM-DD", time fragment like "HH:MM".
                    if dt.find("-") != -1:
                        date += dt + " "
                    elif dt.find(":") != -1:
                        date += dt
                break
        for paragraph in part:
            chn_status = utils.count_chn(str(paragraph))
            possible = chn_status[1]
            if possible > self.is_article_prob:
                article += str(paragraph)
        # Strip any remaining <...> tag fragments from the concatenated markup.
        while article.find("<") != -1 and article.find(">") != -1:
            string = article[article.find("<"):article.find(">")+1]
            article = article.replace(string, "")
        while article.find("\u3000") != -1:
            article = article.replace("\u3000", "")
        article = " ".join(re.split(" +|\n+", article)).strip()
        return [date, article]

    def get_historical_news(self, start_page=684):
        """Crawl NBD historical news.

        With an empty collection, index pages ``start_page`` down to 1 are
        crawled in full.  Otherwise, pages are walked from 1 upward and
        crawling stops at the first article no newer than the latest stored
        Date (incremental catch-up).
        """
        date_list = self.db_obj.get_data(self.db_name, self.col_name, keys=["Date"])["Date"].to_list()
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        if len(date_list) == 0:
            # No history yet: crawl everything from start_page down to page 1.
            crawled_urls_list = []
            page_urls = ["{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_id)
                         for page_id in range(start_page, 0, -1)]
            for page_url in page_urls:
                bs = utils.html_parser(page_url)
                a_list = bs.find_all("a")
                for a in a_list:
                    if "click-statistic" in a.attrs and a.string \
                            and a["click-statistic"].find("Article_") != -1 \
                            and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                        if a["href"] not in crawled_urls_list:
                            result = self.get_url_info(a["href"])
                            while not result:
                                self.terminated_amount += 1
                                if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                    # Persist URLs that keep failing so they can be retried later.
                                    with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                        file.write("{}\n".format(a["href"]))
                                    logging.info("rejected by remote server longer than {} minutes, "
                                                 "and the failed url has been written in path {}"
                                                 .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                         config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                    break
                                logging.info("rejected by remote server, request {} again after "
                                             "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                time.sleep(60 * self.terminated_amount)
                                result = self.get_url_info(a["href"])
                            if not result:
                                # Gave up on this URL entirely.
                                logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                            else:
                                # Request succeeded but the body may be empty: lower the
                                # is_article_prob threshold step by step and refetch.
                                date, article = result
                                while article == "" and self.is_article_prob >= .1:
                                    self.is_article_prob -= .1
                                    result = self.get_url_info(a["href"])
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                            # Persist URLs that keep failing so they can be retried later.
                                            with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"])
                                    # NOTE(review): if the retry loop above exits via `break`,
                                    # `result` is still False and this unpacking raises TypeError.
                                    date, article = result
                                self.is_article_prob = .5
                                if article != "":
                                    related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                      name_code_dict)
                                    data = {"Date": date,
                                            # "PageId": page_url.split("/")[-1],
                                            "Url": a["href"],
                                            "Title": a.string,
                                            "Article": article,
                                            "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                    # self.col.insert_one(data)
                                    self.db_obj.insert_data(self.db_name, self.col_name, data)
                                    logging.info("[SUCCESS] {} {} {}".format(date, a.string, a["href"]))
        else:
            # Incremental catch-up: walk pages from 1 upward until an article
            # no newer than the latest stored date is met.
            is_stop = False
            start_date = max(date_list)
            page_start_id = 1
            while not is_stop:
                page_url = "{}/{}".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD, page_start_id)
                bs = utils.html_parser(page_url)
                a_list = bs.find_all("a")
                for a in a_list:
                    if "click-statistic" in a.attrs and a.string \
                            and a["click-statistic"].find("Article_") != -1 \
                            and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                # Persist URLs that keep failing so they can be retried later.
                                with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info("rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                     config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                break
                            logging.info("rejected by remote server, request {} again after "
                                         "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        if not result:
                            # Gave up on this URL entirely.
                            logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                        else:
                            # Request succeeded but the body may be empty.
                            date, article = result
                            if date > start_date:
                                while article == "" and self.is_article_prob >= .1:
                                    self.is_article_prob -= .1
                                    result = self.get_url_info(a["href"])
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                            # Persist URLs that keep failing so they can be retried later.
                                            with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"])
                                    date, article = result
                                self.is_article_prob = .5
                                if article != "":
                                    related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                      name_code_dict)
                                    data = {"Date": date,
                                            "Url": a["href"],
                                            "Title": a.string,
                                            "Article": article,
                                            "RelatedStockCodes": " ".join(related_stock_codes_list)}
                                    self.db_obj.insert_data(self.db_name, self.col_name, data)
                                    logging.info("[SUCCESS] {} {} {}".format(date, a.string, a["href"]))
                            else:
                                # Reached already-stored history: stop paging.
                                is_stop = True
                                break
                if not is_stop:
                    page_start_id += 1

    def get_realtime_news(self, interval=60):
        """Poll the first NBD index page every ``interval`` seconds.

        Articles newer than the latest stored Date are written to MongoDB and
        pushed onto ``config.CACHE_NEWS_LIST_NAME``; handled URLs are cached
        in ``config.CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME`` (capped at 100
        entries).  Runs forever.  NOTE(review): ``latest_date`` is computed
        once before the loop and never refreshed — duplicates are only
        prevented by the redis URL cache afterwards; confirm this is intended.
        """
        page_url = "{}/1".format(config.WEBSITES_LIST_TO_BE_CRAWLED_NBD)
        logging.info("start real-time crawling of URL -> {}, request every {} secs ... ".format(page_url, interval))
        name_code_df = self.db_obj.get_data(config.STOCK_DATABASE_NAME,
                                            config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                            keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        # crawled_urls = []
        date_list = self.db_obj.get_data(self.db_name, self.col_name, keys=["Date"])["Date"].to_list()
        latest_date = max(date_list)
        while True:
            # Poll the index page on a fixed interval.
            # if len(crawled_urls) > 100:
            #     # 防止list过长,内存消耗大,维持list在100条
            #     crawled_urls.pop(0)
            if self.redis_client.llen(config.CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME) > 100:
                # Keep the URL cache bounded at 100 entries to limit memory use.
                self.redis_client.rpop(config.CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME)
            bs = utils.html_parser(page_url)
            a_list = bs.find_all("a")
            for a in a_list:
                if "click-statistic" in a.attrs and a.string \
                        and a["click-statistic"].find("Article_") != -1 \
                        and a["href"].find("http://www.nbd.com.cn/articles/") != -1:
                    # if a["href"] not in crawled_urls:
                    # NOTE(review): redis lrange returns a list of bytes while a["href"]
                    # is str, so this membership test appears to never be True unless the
                    # client uses decode_responses — verify against the redis configuration.
                    if a["href"] not in self.redis_client.lrange(config.CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME, 0, -1):
                        result = self.get_url_info(a["href"])
                        while not result:
                            self.terminated_amount += 1
                            if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                # Persist URLs that keep failing so they can be retried later.
                                with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                    file.write("{}\n".format(a["href"]))
                                logging.info("rejected by remote server longer than {} minutes, "
                                             "and the failed url has been written in path {}"
                                             .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                     config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                break
                            logging.info("rejected by remote server, request {} again after "
                                         "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                            time.sleep(60 * self.terminated_amount)
                            result = self.get_url_info(a["href"])
                        if not result:
                            # Gave up on this URL entirely.
                            logging.info("[FAILED] {} {}".format(a.string, a["href"]))
                        else:
                            # Request succeeded but the body may be empty.
                            date, article = result
                            if date > latest_date:
                                while article == "" and self.is_article_prob >= .1:
                                    self.is_article_prob -= .1
                                    result = self.get_url_info(a["href"])
                                    while not result:
                                        self.terminated_amount += 1
                                        if self.terminated_amount > config.NBD_MAX_REJECTED_AMOUNTS:
                                            # Persist URLs that keep failing so they can be retried later.
                                            with open(config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH, "a+") as file:
                                                file.write("{}\n".format(a["href"]))
                                            logging.info("rejected by remote server longer than {} minutes, "
                                                         "and the failed url has been written in path {}"
                                                         .format(config.NBD_MAX_REJECTED_AMOUNTS,
                                                                 config.RECORD_NBD_FAILED_URL_TXT_FILE_PATH))
                                            break
                                        logging.info("rejected by remote server, request {} again after "
                                                     "{} seconds...".format(a["href"], 60 * self.terminated_amount))
                                        time.sleep(60 * self.terminated_amount)
                                        result = self.get_url_info(a["href"])
                                    date, article = result
                                self.is_article_prob = .5
                                if article != "":
                                    related_stock_codes_list = self.tokenization.find_relevant_stock_codes_in_article(article,
                                                                                                                      name_code_dict)
                                    self.db_obj.insert_data(self.db_name, self.col_name,
                                                            {"Date": date,
                                                             # "PageId": page_url.split("/")[-1],
                                                             "Url": a["href"],
                                                             "Title": a.string,
                                                             "Article": article,
                                                             "RelatedStockCodes": " ".join(related_stock_codes_list)})
                                    # Also publish to the shared realtime queue, tagged with origin DB/collection.
                                    self.redis_client.lpush(config.CACHE_NEWS_LIST_NAME, json.dumps(
                                        {"Date": date,
                                         # "PageId": page_url.split("/")[-1],
                                         "Url": a["href"],
                                         "Title": a.string,
                                         "Article": article,
                                         "RelatedStockCodes": " ".join(related_stock_codes_list),
                                         "OriDB": config.DATABASE_NAME,
                                         "OriCOL": config.COLLECTION_NAME_NBD
                                         }
                                    ))
                                    # crawled_urls.append(a["href"])
                                    self.redis_client.lpush(config.CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME, a["href"])
                                    logging.info("[SUCCESS] {} {} {}".format(date, a.string, a["href"]))
            # logging.info("sleep {} secs then request again ... ".format(interval))
            time.sleep(interval)
# """
# Example-1:
# 爬取历史新闻数据
# """
# if __name__ == "__main__":
# nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
# nbd_spyder.get_historical_news(start_page=684)
#
# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# """
# Example-2:
# 爬取实时新闻数据
# """
# if __name__ == '__main__':
# from Kite import config
#
# from Killua.denull import DeNull
# from Killua.deduplication import Deduplication
#
# from Gon.nbdspyder import NbdSpyder
#
# # 如果没有历史数据从头爬取,如果已爬取历史数据,则从最新的时间开始爬取
# # 如历史数据中最近的新闻时间是"2020-12-09 20:37:10",则从该时间开始爬取
# nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
# nbd_spyder.get_historical_news()
#
# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
#
# nbd_spyder.get_realtime_news()
================================================
FILE: legacy_v1/src/Gon/realtime_starter_cnstock.py
================================================
import __init__
import time
import redis
import logging
import threading
from Kite import config
from Kite.database import Database
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.cnstockspyder import CnStockSpyder
# Register this script in the redis bookkeeping list so that
# kill_realtime_spyder_tasks.py can locate and stop it later.
redis_client = redis.StrictRedis(config.REDIS_IP,
                                 port=config.REDIS_PORT,
                                 db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
redis_client.lpush(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, "realtime_starter_cnstock.py")

obj = Database()
df = obj.get_data(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK, keys=["Date", "Category"])
cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
# Backfill first: e.g. if data was last crawled on 2020-12-01 and this program
# starts on 2020-12-23, the gap 2020-12-02 .. 2020-12-23 is crawled now.
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    # Latest stored date for this category.  An empty category would make
    # max() raise ValueError, so in that case let the spider use its own
    # default start date instead of crashing on startup.
    category_dates = df[df.Category == type_chn]["Date"].to_list()
    if category_dates:
        cnstock_spyder.get_historical_news(url_to_be_crawled,
                                           category_chn=type_chn,
                                           start_date=max(category_dates))
    else:
        cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)

Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()

# Start one polling thread per category for real-time crawling.
thread_list = []
for url, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    thread = threading.Thread(target=cnstock_spyder.get_realtime_news, args=(url, type_chn, 60))
    thread_list.append(thread)
for thread in thread_list:
    thread.start()
for thread in thread_list:
    thread.join()
================================================
FILE: legacy_v1/src/Gon/realtime_starter_jrj.py
================================================
import __init__
import redis
from Kite import config
from Gon.jrjspyder import JrjSpyder
# Register this script in the redis bookkeeping list so that
# kill_realtime_spyder_tasks.py can locate and stop it later.
bookkeeping_client = redis.StrictRedis(config.REDIS_IP,
                                       port=config.REDIS_PORT,
                                       db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
bookkeeping_client.lpush(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, "realtime_starter_jrj.py")

spider = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
# Backfill the historical data up to the latest date, then poll forever.
spider.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ)
spider.get_realtime_news()
================================================
FILE: legacy_v1/src/Gon/realtime_starter_nbd.py
================================================
import __init__
import redis
from Kite import config
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Gon.nbdspyder import NbdSpyder
# Register this script in the redis bookkeeping list so that
# kill_realtime_spyder_tasks.py can locate and stop it later.
bookkeeping_client = redis.StrictRedis(config.REDIS_IP,
                                       port=config.REDIS_PORT,
                                       db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
bookkeeping_client.lpush(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, "realtime_starter_nbd.py")

# Crawl from scratch when there is no history; otherwise resume from the
# newest stored timestamp (e.g. "2020-12-09 20:37:10") and catch up first.
spider = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
spider.get_historical_news()

# Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
# DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()

spider.get_realtime_news()
================================================
FILE: legacy_v1/src/Gon/realtime_starter_redis_queue.py
================================================
import __init__
import redis
from Kite import config
from Killua.buildstocknewsdb import GenStockNewsDB
# Register this script in the redis bookkeeping list so that
# kill_realtime_spyder_tasks.py can locate and stop it later.
bookkeeping_client = redis.StrictRedis(config.REDIS_IP,
                                       port=config.REDIS_PORT,
                                       db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
bookkeeping_client.lpush(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, "realtime_starter_redis_queue.py")

# Consume the realtime news queue and route items into the per-stock database.
stock_news_builder = GenStockNewsDB()
stock_news_builder.listen_redis_queue()
================================================
FILE: legacy_v1/src/Gon/realtime_starter_stock_price.py
================================================
# Entry point: keep daily stock price data current (updates after market close).
import __init__
import redis
from Kite import config
from Gon.stockinfospyder import StockInfoSpyder

# Register this script's name in redis so running programs can be tracked.
redis_client = redis.StrictRedis(config.REDIS_IP,
                                 port=config.REDIS_PORT,
                                 db=config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID)
redis_client.lpush(config.CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR, "realtime_starter_stock_price.py")

stock_info_spyder = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
stock_info_spyder.get_realtime_news()
================================================
FILE: legacy_v1/src/Gon/sinaspyder.py
================================================
"""
新浪财经网:https://finance.sina.com.cn
公司要闻:https://finance.sina.com.cn/roll/index.d.html?cid=56592&page=1
个股点评:https://finance.sina.com.cn/roll/index.d.html?cid=56588&page=1
大盘评述:https://finance.sina.com.cn/roll/index.d.html?cid=56589&page=1
公司研究:http://stock.finance.sina.com.cn/stock/go.php/vReport_List/kind/company/index.phtml?p=1
市场研究:https://finance.sina.com.cn/roll/index.d.html?cid=56605&page=1
主力动向:https://finance.sina.com.cn/roll/index.d.html?cid=56615&page=1
行业研究:http://stock.finance.sina.com.cn/stock/go.php/vReport_List/kind/industry/index.phtml?p=1
投资策略:http://stock.finance.sina.com.cn/stock/go.php/vReport_List/kind/strategy/index.phtml?p=1
"""
import __init__
from spyder import Spyder
================================================
FILE: legacy_v1/src/Gon/spyder.py
================================================
class Spyder(object):
    """Base class for the site-specific news crawlers.

    Subclasses are expected to assign ``self.col`` (a pymongo collection)
    before the query helpers below are used.
    """

    def __init__(self):
        # Minimum Chinese-character ratio for a page fragment to be treated
        # as article text by subclasses.
        self.is_article_prob = .5

    def extract_data(self, tag_list):
        """Return the distinct values stored under each field in tag_list.

        :param tag_list: list of document field names, e.g. ["Date", "Url"]
        :return: list of lists, one list of distinct values per field
        """
        # Previously built and exec()'d source strings per tag; a plain
        # comprehension does the same job without dynamic code execution.
        return [self.col.distinct(tag) for tag in tag_list]

    def query_news(self, _key, param):
        """Fuzzy query: find documents whose `_key` field contains `param`."""
        return self.col.find({_key: {'$regex': ".*{}.*".format(param)}})

    def get_url_info(self, url):
        # Implemented by subclasses.
        pass

    def get_historical_news(self, url):
        # Implemented by subclasses.
        pass

    def get_realtime_news(self, url):
        # Implemented by subclasses.
        pass
================================================
FILE: legacy_v1/src/Gon/stockinfospyder.py
================================================
"""
https://www.akshare.xyz/zh_CN/latest/
"""
import __init__
import os
import time
import redis
import logging
import datetime
from spyder import Spyder
from pandas._libs.tslibs.timestamps import Timestamp
from Kite.database import Database
from Kite import config
import akshare as ak
import tushare as ts
ts.set_token(config.TUSHARE_TOKEN)
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class StockInfoSpyder(Spyder):
    """Crawl A-share basic info and daily price data via akshare/tushare.

    Prices are written one Mongo collection per stock symbol (e.g. "sh600000");
    redis caches the latest stored date per symbol plus resume bookkeeping
    ("start_stock_code", "today_date", "is_today_updated").
    """

    def __init__(self, database_name, collection_name):
        super(StockInfoSpyder, self).__init__()
        self.db_obj = Database()
        self.col_basic_info = self.db_obj.get_collection(database_name, collection_name)
        self.database_name = database_name
        self.collection_name = collection_name
        self.start_program_date = datetime.datetime.now().strftime("%Y%m%d")
        # Fixed: use the shared config values instead of hard-coded
        # "localhost"/6379 so this class honours the same redis settings
        # as the rest of the codebase.
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.REDIS_CLIENT_FOR_CACHING_STOCK_INFO_DB_ID)
        self.redis_client.set("today_date", datetime.datetime.now().strftime("%Y-%m-%d"))

    def get_stock_code_info(self):
        """Populate the basic-info collection with {symbol, code, name} docs.

        TODO: needs a refresh roughly every six months.
        """
        stock_info_df = ak.stock_info_a_code_name()  # all A-share codes and names
        stock_symbol_code = ak.stock_zh_a_spot().get(["symbol", "code"])  # symbol <-> code mapping
        for _id in range(stock_info_df.shape[0]):
            _symbol = stock_symbol_code[stock_symbol_code.code == stock_info_df.iloc[_id].code].symbol.values
            if len(_symbol) != 0:
                _dict = {"symbol": _symbol[0]}
                _dict.update(stock_info_df.iloc[_id].to_dict())
                self.col_basic_info.insert_one(_dict)

    def get_historical_news(self, start_date=None, end_date=None, freq="day"):
        """Download price history for every known symbol up to end_date.

        :param start_date: "yyyymmdd"; when None, resume per symbol from the
            day after the latest date cached in redis (or the configured
            default start date when nothing is cached)
        :param end_date: "yyyymmdd"; defaults to today
        :param freq: only "day" is implemented; other frequencies are stubs
        """
        if end_date is None:
            end_date = datetime.datetime.now().strftime("%Y%m%d")
        stock_symbol_list = self.col_basic_info.distinct("symbol")
        if len(stock_symbol_list) == 0:
            # Basic info collection is empty: bootstrap it first.
            self.get_stock_code_info()
            stock_symbol_list = self.col_basic_info.distinct("symbol")
        if freq == "day":
            # Resume point: numeric part of the last symbol fully processed.
            start_stock_code = 0 if self.redis_client.get("start_stock_code") is None \
                else int(self.redis_client.get("start_stock_code").decode())
            for symbol in stock_symbol_list:
                if int(symbol[2:]) > start_stock_code:
                    if start_date is None:
                        _latest_date = self.redis_client.get(symbol)
                        if _latest_date is None:
                            symbol_start_date = config.STOCK_PRICE_REQUEST_DEFAULT_DATE
                        else:
                            tmp_date_dt = datetime.datetime.strptime(_latest_date.decode(), "%Y-%m-%d").date()
                            offset = datetime.timedelta(days=1)
                            symbol_start_date = (tmp_date_dt + offset).strftime('%Y%m%d')
                    else:
                        # Fixed: an explicit start_date previously left
                        # symbol_start_date unassigned (NameError).
                        symbol_start_date = start_date
                    if symbol_start_date < end_date:
                        # NOTE(review): variable says "hfq" but adjust="qfq"
                        # (forward adjustment) is requested — confirm intent.
                        stock_zh_a_daily_hfq_df = ak.stock_zh_a_daily(symbol=symbol,
                                                                      start_date=symbol_start_date,
                                                                      end_date=end_date,
                                                                      adjust="qfq")
                        # Move the date index into a regular column.
                        stock_zh_a_daily_hfq_df.insert(0, 'date', stock_zh_a_daily_hfq_df.index.tolist())
                        stock_zh_a_daily_hfq_df.index = range(len(stock_zh_a_daily_hfq_df))
                        _col = self.db_obj.get_collection(self.database_name, symbol)
                        _tmp_dict = {}
                        for _id in range(stock_zh_a_daily_hfq_df.shape[0]):
                            _tmp_dict = stock_zh_a_daily_hfq_df.iloc[_id].to_dict()
                            _tmp_dict.pop("outstanding_share")
                            _tmp_dict.pop("turnover")
                            _col.insert_one(_tmp_dict)
                        if _tmp_dict:
                            # Cache the newest stored date for resuming later.
                            self.redis_client.set(symbol, str(_tmp_dict["date"]).split(" ")[0])
                        logging.info("{} finished saving from {} to {} ... ".format(symbol, symbol_start_date, end_date))
                    self.redis_client.set("start_stock_code", int(symbol[2:]))
            # All symbols done: reset the resume marker.
            self.redis_client.set("start_stock_code", 0)
        elif freq == "week":
            pass
        elif freq == "month":
            pass
        elif freq == "5mins":
            pass
        elif freq == "15mins":
            pass
        elif freq == "30mins":
            pass
        elif freq == "60mins":
            pass

    def get_realtime_news(self, freq="day"):
        """Keep daily price data current.

        Asks the operator whether today's snapshot is already stored, backfills
        history, then loops forever writing the daily bar after the 15:30
        market close.
        """
        while True:
            if_updated = input("Has the stock price dataset been updated today? (Y/N) \n")
            if if_updated == "Y":
                self.redis_client.set("is_today_updated", "1")
                break
            elif if_updated == "N":
                self.redis_client.set("is_today_updated", "")
                break
        self.get_historical_news()  # backfill all symbols to the latest date
        while True:
            if freq == "day":
                time_now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                if time_now.split(" ")[0] != self.redis_client.get("today_date").decode():
                    # Past midnight: clear the flag so today gets an update.
                    self.redis_client.set("today_date", time_now.split(" ")[0])
                    self.redis_client.set("is_today_updated", "")
                if not bool(self.redis_client.get("is_today_updated").decode()):
                    update_time = "{} {}".format(time_now.split(" ")[0], "15:30:00")
                    if time_now >= update_time:
                        stock_zh_a_spot_df = ak.stock_zh_a_spot()  # today's full snapshot
                        for _id, sym in enumerate(stock_zh_a_spot_df["symbol"]):
                            _col = self.db_obj.get_collection(self.database_name, sym)
                            _tmp_dict = {}
                            _tmp_dict.update({"date": Timestamp("{} 00:00:00".format(time_now.split(" ")[0]))})
                            _tmp_dict.update({"open": stock_zh_a_spot_df.iloc[_id].open})
                            _tmp_dict.update({"high": stock_zh_a_spot_df.iloc[_id].high})
                            _tmp_dict.update({"low": stock_zh_a_spot_df.iloc[_id].low})
                            _tmp_dict.update({"close": stock_zh_a_spot_df.iloc[_id].trade})
                            _tmp_dict.update({"volume": stock_zh_a_spot_df.iloc[_id].volume})
                            _col.insert_one(_tmp_dict)
                            self.redis_client.set(sym, time_now.split(" ")[0])
                            logging.info("finished updating {} price data of {} ... ".format(sym, time_now.split(" ")[0]))
                        self.redis_client.set("is_today_updated", "1")
                        # TODO: after prices update, refresh the per-stock news labels.
# if __name__ == "__main__":
# from Kite import config
# from Gon.stockinfospyder import StockInfoSpyder
#
# stock_info_spyder = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
#
# # 指定时间段,获取历史数据,如:stock_info_spyder.get_historical_news(start_date="20150101", end_date="20201204")
# # 如果没有指定时间段,且数据库已存在部分数据,则从最新的数据时间开始获取直到现在,比如数据库里已有sh600000价格数据到
# # 2020-12-03号,如不设定具体时间,则从自动获取sh600000自2020-12-04至当前的价格数据
# # stock_info_spyder.get_historical_news()
#
# # 开启自动化更新所有股票价格数据(目前只支持在15:30分后更新日数据)
# stock_info_spyder.get_realtime_news()
================================================
FILE: legacy_v1/src/Hisoka/classifier.py
================================================
import __init__
import logging
import warnings
from Kite import config
import joblib
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import sklearn.exceptions
logging.basicConfig(level=logging.INFO,
format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",
datefmt="%a, %d %b %Y %H:%M:%S")
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=Warning, module='sklearn')
warnings.filterwarnings("ignore", category=UserWarning, module='gensim')
warnings.filterwarnings("ignore", category=RuntimeWarning, module='gensim')
class Classifier(object):
    """Grid-searched SVM / random-forest classifier over document vectors."""

    def __init__(self):
        # Scoring names to grid-search over, e.g. ["f1_weighted"].
        self.scores = config.CLASSIFIER_SCORE_LIST

    def train(self, train_x, train_y, test_x, test_y, model_type="svm", model_save_path=None):
        """Grid-search the chosen model on the training set and report test metrics.

        :param train_x, train_y: training samples and labels
        :param test_x, test_y: held-out samples and labels
        :param model_type: "svm" or "rdforest"
        :param model_save_path: when given, the fitted GridSearchCV is dumped there
        :return: the fitted GridSearchCV estimator (refit on the best params)
        :raises ValueError: for an unknown model_type
        """
        assert len(self.scores) != 0
        clf = None
        for score in self.scores:
            # 5-fold CV on the training set only; refit=True re-fits the best
            # parameter set on the full training data afterwards.
            # (The old refit="AUC" is invalid for single-metric scoring and
            # only worked by being truthy.)
            if model_type == "svm":
                # Fixed: the config constant is SVM_TUNED_PARAMTERS; the old
                # "SMV_" spelling raised AttributeError.
                tuned_parameters = config.SVM_TUNED_PARAMTERS
                estimator = svm.SVC()
            elif model_type == "rdforest":
                tuned_parameters = config.RDFOREST_TUNED_PARAMTERS
                estimator = RandomForestClassifier(random_state=10)
            else:
                raise ValueError("unsupported model_type: {}".format(model_type))
            clf = GridSearchCV(estimator,
                               tuned_parameters,
                               cv=5,
                               scoring=score,
                               refit=True)
            clf.fit(train_x, train_y)
            if model_save_path is not None:
                joblib.dump(clf, model_save_path)
            logging.info("the best params: {}".format(clf.best_params_))
            train_pred = clf.predict(train_x)
            test_pred = clf.predict(test_x)  # generalisation check on held-out data
            logging.info("\n{}".format(classification_report(test_y, test_pred)))
            precise_train = sum(1 for p, t in zip(train_pred, train_y) if p == t)
            precise_test = sum(1 for p, t in zip(test_pred, test_y) if p == t)
            logging.info('train_accuracy: {} test_accuracy: {}'
                         .format(str(round(precise_train / len(train_y), 4)),
                                 str(round(precise_test / len(test_pred), 4))))
            # Kept for callers that read the last test accuracy.
            self._precise = precise_test / len(test_pred)
        assert clf is not None
        return clf

    @staticmethod
    def model_load(classifier_save_path):
        """Load a previously dumped classifier from disk."""
        return joblib.load(classifier_save_path)
================================================
FILE: legacy_v1/src/Killua/__init__.py
================================================
import os
import sys


def add_path(path):
    """Put *path* at the front of sys.path unless it is already importable."""
    if path in sys.path:
        return
    sys.path.insert(0, path)


# Make the `./src` directory (parent of the current working dir) importable.
src_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
add_path(src_dir)
================================================
FILE: legacy_v1/src/Killua/buildstocknewsdb.py
================================================
import __init__
import json
import redis
import logging
import datetime
import akshare as ak
from Kite import config
from Kite.database import Database
from Leorio.tokenization import Tokenization
from Leorio.topicmodelling import TopicModelling
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class GenStockNewsDB(object):
    """Build per-stock news collections labelled with forward price moves.

    Every stock symbol (e.g. "sh600000") gets its own collection inside the
    `stocknews` database; each news document is labelled 利好 / 利空 / 中性
    based on the close-price change 3/5/10/15/30/60 days after publication.
    """

    def __init__(self):
        self.database = Database()
        # Trading-calendar dates (1990-12-19 .. 2020-12-31) from sina.
        self.trade_date = ak.tool_trade_date_hist_sina()["trade_date"].tolist()
        # horizon (days) -> name of the label field written per news document
        self.label_range = {3: "3DaysLabel",
                            5: "5DaysLabel",
                            10: "10DaysLabel",
                            15: "15DaysLabel",
                            30: "30DaysLabel",
                            60: "60DaysLabel"}
        self.redis_client = redis.StrictRedis(host=config.REDIS_IP,
                                              port=config.REDIS_PORT,
                                              db=config.CACHE_NEWS_REDIS_DB_ID)
        self.redis_client.set("today_date", datetime.datetime.now().strftime("%Y-%m-%d"))
        # Rebuild the "which stocks have enough news for ML" list from scratch.
        self.redis_client.delete("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML))
        self._stock_news_nums_stat()

    def get_all_news_about_specific_stock(self, database_name, collection_name):
        """Fan one crawled collection out into per-stock news collections.

        :param database_name: source database holding crawled news
        :param collection_name: source collection (e.g. "cnstock")
        """
        # Check whether the source collection already has a RelatedStockCodes
        # field; build it by tokenizing articles when missing.
        _keys_list = list(next(self.database.get_collection(database_name, collection_name).find()).keys())
        if "RelatedStockCodes" not in _keys_list:
            tokenization = Tokenization(import_module="jieba", user_dict="./Leorio/financedict.txt")
            tokenization.update_news_database_rows(database_name, collection_name)
        # Create one collection named after each stock symbol that is missing.
        stock_symbol_list = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                   config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                                   keys=["symbol"])["symbol"].to_list()
        col_names = self.database.connect_database(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(session=None)
        for symbol in stock_symbol_list:
            if symbol not in col_names:
                # if int(symbol[2:]) > 837:
                _collection = self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
                _tmp_num_stat = 0
                for row in self.database.get_collection(database_name, collection_name).find():  # cursor
                    if symbol[2:] in row["RelatedStockCodes"].split(" "):
                        # Label the news item for every horizon in label_range.
                        _tmp_dict = {}
                        for label_days, key_name in self.label_range.items():
                            _tmp_res = self._label_news(
                                datetime.datetime.strptime(row["Date"].split(" ")[0], "%Y-%m-%d"), symbol, label_days)
                            _tmp_dict.update({key_name: _tmp_res})
                        _data = {"Date": row["Date"],
                                 "Url": row["Url"],
                                 "Title": row["Title"],
                                 "Article": row["Article"],
                                 "OriDB": database_name,
                                 "OriCOL": collection_name}
                        _data.update(_tmp_dict)
                        _collection.insert_one(_data)
                        _tmp_num_stat += 1
                logging.info("there are {} news mentioned {} in {} collection need to be fetched ... "
                             .format(_tmp_num_stat, symbol, collection_name))
                # else:
                #     logging.info("{} has fetched all related news from {}...".format(symbol, collection_name))

    def listen_redis_queue(self):
        """Consume the realtime-news redis queue.

        Each queued item is stored into every per-stock collection named in its
        RelatedStockCodes field; e.g. "603386 603003 600111 603568" saves the
        item into those four stock collections.
        """
        crawled_url_today = set()
        while True:
            date_now = datetime.datetime.now().strftime("%Y-%m-%d")
            if date_now != self.redis_client.get("today_date").decode():
                # A new day started: reset the per-day URL de-duplication cache.
                crawled_url_today = set()
                self.redis_client.set("today_date", date_now)
            if self.redis_client.llen(config.CACHE_NEWS_LIST_NAME) != 0:
                data = json.loads(self.redis_client.lindex(config.CACHE_NEWS_LIST_NAME, -1))
                if data["Url"] not in crawled_url_today:  # skip duplicates seen today
                    crawled_url_today.update({data["Url"]})
                    if data["RelatedStockCodes"] != "":
                        for stock_code in data["RelatedStockCodes"].split(" "):
                            # 6xxxxx codes trade in Shanghai, the rest in Shenzhen.
                            symbol = "sh{}".format(stock_code) if stock_code[0] == "6" else "sz{}".format(stock_code)
                            _collection = self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, symbol)
                            _tmp_dict = {}
                            for label_days, key_name in self.label_range.items():
                                _tmp_res = self._label_news(
                                    datetime.datetime.strptime(data["Date"].split(" ")[0], "%Y-%m-%d"), symbol, label_days)
                                _tmp_dict.update({key_name: _tmp_res})
                            _data = {"Date": data["Date"],
                                     "Url": data["Url"],
                                     "Title": data["Title"],
                                     "Article": data["Article"],
                                     "OriDB": data["OriDB"],
                                     "OriCOL": data["OriCOL"]}
                            _data.update(_tmp_dict)
                            _collection.insert_one(_data)
                            logging.info("the real-time fetched news {}, which was saved in [DB:{} - COL:{}] ...".format(data["Title"],
                                                                                                                         config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE,
                                                                                                                         symbol))
                            # Planned online classification flow, kept for reference:
                            # if symbol.encode() in self.redis_client.lrange("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML), 0, -1):
                            #     label_name = "3DaysLabel"
                            #     ori_dict_path = "{}_docs_dict.dict".format(symbol)
                            #     bowvec_save_path = "{}_bowvec.mm".format(symbol)
                            #     topicmodelling = TopicModelling()
                            #     chn_label = topicmodelling.classify_stock_news(data["Article"],
                            #                                                    config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE,
                            #                                                    symbol,
                            #                                                    label_name=label_name,
                            #                                                    topic_model_type="lsi",
                            #                                                    classifier_model="rdforest",  # rdforest / svm
                            #                                                    ori_dict_path=ori_dict_path,
                            #                                                    bowvec_save_path=bowvec_save_path)
                            #     logging.info(
                            #         "document '{}...' was classified with label '{}' for symbol {} ... ".format(
                            #             data["Article"][:20], chn_label, symbol))
                self.redis_client.rpop(config.CACHE_NEWS_LIST_NAME)
                logging.info("now pop {} from redis queue of [DB:{} - KEY:{}] ... ".format(data["Title"],
                                                                                           config.CACHE_NEWS_REDIS_DB_ID,
                                                                                           config.CACHE_NEWS_LIST_NAME))

    def _label_news(self, date, symbol, n_days):
        """Label one news item by the close-price move n_days after `date`.

        :param date: datetime.datetime, publication date (midnight, no time part)
        :param symbol: str, e.g. "sh600000"
        :param n_days: int, horizon in calendar days
        :return: "利好" / "利空" / "中性", or "" when price data is unavailable
        """
        # Close price on (or before) the publication date.
        this_date_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                symbol,
                                                query={"date": date})
        # The publication day may be a non-trading day; walk backwards for a
        # close price (e.g. Sat 2020-12-12 falls back to Fri 2020-12-11).
        tmp_date = date
        if this_date_data is None:
            i = 1
            while this_date_data is None and i <= 10:
                # NOTE(review): the step grows each pass (1,2,3,... days), so
                # the lookback is cumulative (1,3,6,... days back); possibly
                # intended to be one day at a time — confirm before changing.
                tmp_date -= datetime.timedelta(days=i)
                # Only query the DB on trading days; a still-missing value
                # means the DB has no data for that day.
                if tmp_date.strftime("%Y-%m-%d") in self.trade_date:
                    this_date_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                            symbol,
                                                            query={"date": tmp_date})
                i += 1
        try:
            close_price_this_date = this_date_data["close"][0]
        except Exception:
            close_price_this_date = None
        # Close price on (or after) date + n_days; walk forwards on
        # non-trading days (e.g. Sun 2020-12-13 rolls to Mon 2020-12-14).
        new_date = date + datetime.timedelta(days=n_days)
        n_days_later_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                   symbol,
                                                   query={"date": new_date})
        if n_days_later_data is None:
            i = 1
            while n_days_later_data is None and i <= 10:
                new_date = date + datetime.timedelta(days=n_days + i)
                if new_date.strftime("%Y-%m-%d") in self.trade_date:
                    n_days_later_data = self.database.get_data(config.STOCK_DATABASE_NAME,
                                                               symbol,
                                                               query={"date": new_date})
                i += 1
        try:
            close_price_n_days_later = n_days_later_data["close"][0]
        except Exception:
            close_price_n_days_later = None
        # Decision rule (reconstructed — this span was corrupted in the
        # original source): a relative move beyond ±param is 利好/利空,
        # otherwise 中性; with price data missing on either side, return "".
        if close_price_this_date is not None and close_price_n_days_later is not None:
            # NOTE(review): thresholds reconstructed from the surviving
            # comment (±3% within 10 days, larger beyond); the original
            # constants were lost — confirm before relying on labels.
            param = 0.03 if n_days <= 10 else 0.05
            change = (close_price_n_days_later - close_price_this_date) / close_price_this_date
            if change > param:
                return "利好"
            elif change < -param:
                return "利空"
            else:
                return "中性"
        else:
            return ""

    def _stock_news_nums_stat(self):
        """Push symbols with more than MINIMUM_STOCK_NEWS_NUM_FOR_ML news into redis."""
        cols_list = self.database.connect_database(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE).list_collection_names(session=None)
        for sym in cols_list:
            if self.database.get_collection(config.ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE, sym).estimated_document_count() > config.MINIMUM_STOCK_NEWS_NUM_FOR_ML:
                self.redis_client.lpush("stock_news_num_over_{}".format(config.MINIMUM_STOCK_NEWS_NUM_FOR_ML), sym)
if __name__ == "__main__":
    # Manual entry point: constructing GenStockNewsDB alone refreshes the
    # per-stock news statistics; uncomment the calls below to (re)build the
    # per-stock collections or to start consuming the realtime redis queue.
    from Kite import config
    from Killua.buildstocknewsdb import GenStockNewsDB

    gen_stock_news_db = GenStockNewsDB()
    # gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
    # gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
    # gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
    # gen_stock_news_db.listen_redis_queue()
================================================
FILE: legacy_v1/src/Killua/deduplication.py
================================================
import __init__
from Kite.database import Database
from Kite import utils
import logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class Deduplication(object):
    """Delete documents sharing the same Url within one collection, day by day."""

    def __init__(self, database_name, collection_name):
        self.database = Database()
        self.database_name = database_name
        self.collection_name = collection_name
        # Running count of removed duplicates across the whole run.
        self.delete_num = 0

    def run(self):
        """Scan every calendar day in the collection's date span and drop
        documents whose Url duplicates an earlier one on that day."""
        dates = self.database.get_data(self.database_name,
                                       self.collection_name,
                                       keys=["Date"])["Date"].tolist()
        collection = self.database.get_collection(self.database_name, self.collection_name)
        dates.sort()  # ascending
        # start_date, end_date = date_list[1].split(" ")[0], date_list[-1].split(" ")[0]
        first_day = min(dates).split(" ")[0]
        last_day = max(dates).split(" ")[0]
        for day in utils.get_date_list_from_range(first_day, last_day):
            # Fetch that day's rows and de-duplicate them by Url.
            try:
                day_df = self.database.get_data(self.database_name,
                                                self.collection_name,
                                                query={"Date": {"$regex": day}})
            except Exception:
                continue
            if day_df is None:
                continue
            kept = day_df.drop_duplicates(["Url"])
            for doc_id in set(day_df["_id"]) - set(kept["_id"]):
                collection.delete_one({'_id': doc_id})
                self.delete_num += 1
        logging.info("DB:{} - COL:{} had {} data length originally, now has deleted {} depulications ... "
                     .format(self.database_name, self.collection_name, str(len(dates)), self.delete_num))
if __name__ == "__main__":
    # Remove duplicated news (same Url on the same day) from every crawled collection.
    from Killua.deduplication import Deduplication
    from Kite import config

    Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
    Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
    Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
================================================
FILE: legacy_v1/src/Killua/denull.py
================================================
"""
删除数据库中含有null值的行
"""
import __init__
import logging
from Kite.database import Database
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S')
class DeNull(object):
    """Delete documents containing empty-string field values from a collection."""

    def __init__(self, database_name, collection_name):
        self.database = Database()
        self.database_name = database_name
        self.collection_name = collection_name
        # Number of documents removed by the last run().
        self.delete_num = 0

    def run(self):
        """Drop every document with at least one empty field.

        The RelatedStockCodes field is exempt: an empty value there merely
        means the article mentions no stock codes.
        """
        collection = self.database.get_collection(self.database_name, self.collection_name)
        for doc in self.database.get_collection(self.database_name, self.collection_name).find():
            has_empty_field = any(
                _key != "RelatedStockCodes" and doc[_key] == ""
                for _key in list(doc.keys())
            )
            if has_empty_field:
                collection.delete_one({'_id': doc["_id"]})
                self.delete_num += 1
        logging.info("there are {} news contained NULL value in {} collection ... "
                     .format(self.delete_num, self.collection_name))
if __name__ == "__main__":
    # Purge rows containing empty values from every crawled collection.
    from Killua.denull import DeNull
    from Kite import config

    DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
    DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
    DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()
================================================
FILE: legacy_v1/src/Kite/__init__.py
================================================
import os
import sys


def add_path(path):
    """Insert *path* at the front of sys.path when not already present."""
    if path in sys.path:
        return
    sys.path.insert(0, path)


this_dir = os.path.dirname(__file__)
# Make the `./src/Kite` directory itself importable.
add_path(this_dir)
================================================
FILE: legacy_v1/src/Kite/config.py
================================================
# Central configuration for the legacy v1 crawler / text-analysis pipeline.

# --- connection settings (MongoDB stores news; redis caches runtime state) ---
MONGODB_IP = "localhost"
MONGODB_PORT = 27017
REDIS_IP = "localhost"
REDIS_PORT = 6379
THREAD_NUMS_FOR_SPYDER = 4

# --- cnstock.com (上海证券报) crawler ---
DATABASE_NAME = "finnewshunter"
COLLECTION_NAME_CNSTOCK = "cnstock"
CHROME_DRIVER = "./chromedriver.exe"
# Previous URL set, kept for reference:
# WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK = {"https://company.cnstock.com/company/scp_gsxw": "公司聚焦",
#                                        "https://ggjd.cnstock.com/gglist/search/qmtbbdj": "公告解读",
#                                        "https://ggjd.cnstock.com/gglist/search/ggkx": "公告快讯",
#                                        "https://ggjd.cnstock.com/company/scp_ggjd/tjd_sdlh": "利好公告"}
WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK = {"https://company.cnstock.com/company/scp_gsxw": "公司聚焦",
                                       "http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj": "公告解读",
                                       "http://ggjd.cnstock.com/company/scp_ggjd/tjd_ggkx": "公告快讯",
                                       "https://ggjd.cnstock.com/company/scp_ggjd/tjd_sdlh": "利好公告"}
# NOTE(review): absolute Windows paths below are machine-specific — move them
# to environment variables or a local settings file.
RECORD_CNSTOCK_FAILED_URL_TXT_FILE_PATH = "D:/workfiles/gpu-cloud-backup/Listed-company-news-crawl-and-text-analysis/src/Gon/cnstock_failed_urls.txt"
CNSTOCK_MAX_REJECTED_AMOUNTS = 10

# --- jrj.com.cn (金融界) crawler ---
COLLECTION_NAME_JRJ = "jrj"
JRJ_DATE_RANGE = 100
WEBSITES_LIST_TO_BE_CRAWLED_JRJ = "http://stock.jrj.com.cn/xwk"
RECORD_JRJ_FAILED_URL_TXT_FILE_PATH = "D:/workfiles/gpu-cloud-backup/Listed-company-news-crawl-and-text-analysis/src/Gon/jrj_failed_urls.txt"
JRJ_MAX_REJECTED_AMOUNTS = 10
JRJ_REQUEST_DEFAULT_DATE = "2015-01-01"
CACHE_SAVED_NEWS_JRJ_TODAY_VAR_NAME = "cache_news_queue_jrj"

# --- nbd.com.cn (每日经济新闻) crawler ---
COLLECTION_NAME_NBD = "nbd"
WEBSITES_LIST_TO_BE_CRAWLED_NBD = "http://stocks.nbd.com.cn/columns/275/page"
RECORD_NBD_FAILED_URL_TXT_FILE_PATH = "D:/workfiles/gpu-cloud-backup/Listed-company-news-crawl-and-text-analysis/src/Gon/nbd_failed_urls.txt"
NBD_TOTAL_PAGES_NUM = 684
NBD_MAX_REJECTED_AMOUNTS = 10
CACHE_SAVED_NEWS_NBD_TODAY_VAR_NAME = "cache_news_queue_nbd"

# --- stock price data ---
# NOTE(review): API token committed to source control — rotate the token and
# load it from an environment variable instead.
TUSHARE_TOKEN = "97fbc4c73727b5d171ca6670cbc4af8b0a3de5fbab74b52f30b598cc"
STOCK_DATABASE_NAME = "stock"
COLLECTION_NAME_STOCK_BASIC_INFO = "basic_info"
STOCK_PRICE_REQUEST_DEFAULT_DATE = "20150101"
REDIS_CLIENT_FOR_CACHING_STOCK_INFO_DB_ID = 1
ALL_NEWS_OF_SPECIFIC_STOCK_DATABASE = "stocknews"

# --- topic modelling / classification ---
TOPIC_NUMBER = 200
SVM_TUNED_PARAMTERS = {"kernel": ["rbf"], "gamma": [10, 20, 50, 100, 150, 200], "C": [10, 15, 20, 30, 50, 100]}
RDFOREST_TUNED_PARAMTERS = {"n_estimators": [1, 2, 3, 4, 5, 10],
                            "criterion": ["gini", "entropy"],
                            "max_features": ["auto", "sqrt"]}
CLASSIFIER_SCORE_LIST = ["f1_weighted"]
USER_DEFINED_DICT_PATH = "D:/workfiles/gpu-cloud-backup/Listed-company-news-crawl-and-text-analysis/src/Leorio/financedict.txt"
CHN_STOP_WORDS_PATH = "D:/workfiles/gpu-cloud-backup/Listed-company-news-crawl-and-text-analysis/src/Leorio/chnstopwords.txt"

# --- redis cache keys / db ids shared by the realtime pipeline ---
CACHE_NEWS_REDIS_DB_ID = 0
CACHE_NEWS_LIST_NAME = "cache_news_waiting_for_classification"
CACHE_RECORED_OPENED_PYTHON_PROGRAM_DB_ID = 0
CACHE_RECORED_OPENED_PYTHON_PROGRAM_VAR = "opened_python_scripts"
MINIMUM_STOCK_NEWS_NUM_FOR_ML = 1000
================================================
FILE: legacy_v1/src/Kite/database.py
================================================
from pymongo import MongoClient
import pandas as pd
class Database(object):
    """Thin convenience wrapper around a pymongo MongoClient connection."""

    def __init__(self, ip="localhost", port=27017):
        self.ip = ip
        self.port = port
        # MongoClient connects lazily; no I/O happens here.
        self.conn = MongoClient(self.ip, self.port)

    def connect_database(self, database_name):
        """Return the database handle with the given name."""
        return self.conn[database_name]

    def get_collection(self, database_name, collection_name):
        """Return a collection handle inside the named database."""
        return self.connect_database(database_name).get_collection(collection_name)

    def insert_data(self, database_name, collection_name, data_dict):
        """Insert a single document into the named collection."""
        database = self.conn[database_name]
        collection = database.get_collection(collection_name)
        collection.insert_one(data_dict)

    def update_row(self, database_name, collection_name, query, new_values):
        """Apply a $set update to the first document matching `query`."""
        assert isinstance(query, dict)
        assert isinstance(new_values, dict)
        database = self.conn[database_name]
        collection = database.get_collection(collection_name)
        collection.update_one(query, {"$set": new_values})

    def get_data(self, database_name, collection_name, max_data_request=None, query=None, keys=None):
        """Fetch documents as a pandas DataFrame, or None on any failure.

        e.g.:
            ExampleObj = Database()
            ExampleObj.get_data("finnewshunter", "nbd", query={"Date": {"$regex": "2014"}}, keys=["Url", "Title"])

        :param max_data_request: cap on the number of rows fetched (0/None = unlimited)
        :param query: pymongo filter dict; empty/None fetches everything
        :param keys: columns to extract; when None, the first document's keys are used
        :return: DataFrame, or None when the collection is empty, a key is
            missing, or any other error occurs (historical best-effort contract)
        """
        database = self.conn[database_name]
        collection = database.get_collection(collection_name)
        if query:
            assert isinstance(query, dict)
        else:
            query = {}
        if keys:
            assert isinstance(keys, list)
        else:
            keys = []
        if max_data_request:
            assert isinstance(max_data_request, int)
        else:
            max_data_request = float("inf")

        def _find():
            # Single place for the query-vs-full-scan choice (was repeated
            # three times in the original implementation).
            return collection.find(query) if len(query) != 0 else collection.find()

        try:
            if len(keys) == 0:
                # Derive column names from the first matching document,
                # e.g. ['_id', 'Date', 'PageId', 'Url', 'Title', 'Article', 'RelevantStockCodes'].
                keys = list(next(_find()).keys())
            _dict = {_key: [] for _key in keys}
            for _id, row in enumerate(_find()):
                if _id + 1 > max_data_request:
                    break
                for _key in keys:
                    _dict[_key].append(row[_key])
            return pd.DataFrame(_dict)
        except Exception:
            # Best-effort API: empty collection (StopIteration), missing key,
            # or connection failure all yield None, matching historical behaviour.
            return None

    def drop_db(self, database):
        """Drop an entire database."""
        self.conn.drop_database(database)
'''
from database import Database
ExampleObj = Database()
db = ExampleObj.connect_database("cnstock")
col = ExampleObj.create_col(db, "cnstock_col")
ExampleObj.insert_data(col, {'name': 'sena', "id": 136})
ExampleObj.drop_db(db)
'''
================================================
FILE: legacy_v1/src/Kite/log.py
================================================
================================================
FILE: legacy_v1/src/Kite/utils.py
================================================
import re
import datetime
import requests
import numpy as np
from bs4 import BeautifulSoup
from scipy.sparse import csr_matrix
def generate_pages_list(total_pages, range, init_page_id):
    """Split page ids [init_page_id, total_pages] into (start, end) chunks of
    size `range`.

    (The parameter shadows the builtin `range`; the name is kept because it is
    part of the public signature. The builtin is not used inside.)

    Fixed: the old trailing condition `k + range - 1 < total_pages` could never
    hold after the loop exited, so the final partial chunk of pages was
    silently dropped; it is now emitted as (k, total_pages).
    """
    page_list = list()
    k = init_page_id
    while k + range - 1 <= total_pages:
        page_list.append((k, k + range - 1))
        k += range
    if k <= total_pages:
        # Remainder chunk smaller than `range`.
        page_list.append((k, total_pages))
    return page_list
def count_chn(string):
    '''Count Chinese characters and their share of the whole string.

    # Arguments:
        string: Each part of crawled website analyzed by BeautifulSoup.
    # Returns:
        (chn_num, possible): number of matched characters and their ratio to
        the total length; empty input now yields (0, 0.0) instead of raising
        ZeroDivisionError.
    '''
    # NOTE(review): the class [\u1100-\uFFFDh] is much wider than CJK and also
    # matches the letter 'h'; kept as-is because downstream thresholds
    # (is_article_prob) were tuned against it — confirm before narrowing.
    pattern = re.compile(u'[\u1100-\uFFFDh]+?')
    result = pattern.findall(string)
    chn_num = len(result)
    total = len(str(string))
    possible = chn_num / total if total else 0.0
    return chn_num, possible
def get_date_list_from_range(begin_date, end_date):
    '''Return every calendar date from begin_date to end_date inclusive,
    formatted as "YYYY-MM-DD" strings.
    '''
    start = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
    stop = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    span_days = (stop - start).days
    return [(start + datetime.timedelta(days=offset)).strftime("%Y-%m-%d")
            for offset in range(span_days + 1)]
def gen_dates_list(date_list, date_range):
    """Split date_list into consecutive chunks of size date_range; the last
    chunk holds the remainder. An empty input yields [[]] (quirk preserved
    from the original implementation)."""
    chunks = [date_list[i:i + date_range] for i in range(0, len(date_list), date_range)]
    return chunks if chunks else [[]]
def get_date_before(n_days):
    """Return the calendar date n_days before today as "YYYY-MM-DD".

    e.g. on 2020-12-25, get_date_before(1) -> "2020-12-24".

    :param n_days: how many days to go back (1 means yesterday)
    """
    target = datetime.datetime.now() - datetime.timedelta(days=n_days)
    return target.strftime('%Y-%m-%d')
def search_max_pages_num(first_url, date):
    """Return how many result pages jrj.com.cn has for a given date.

    Searching news by date (e.g. 2020-01-01) yields
    http://stock.jrj.com.cn/xwk/202001/20200101_1.shtml as the first result
    page; the pagination anchors on that page reveal the total page count
    (e.g. 4 pages of news for that day).

    :param first_url: first result page, e.g. 'http://stock.jrj.com.cn/xwk/202001/20200101_1.shtml'
    :param date: date string such as '2020-01-01'
    """
    respond = requests.get(first_url)
    # Detect the real encoding from the raw bytes (site is not UTF-8).
    respond.encoding = BeautifulSoup(respond.content, "lxml").original_encoding
    bs = BeautifulSoup(respond.text, "lxml")
    a_list = bs.find_all("a")
    max_pages_num = 1
    for a in a_list:
        # Pagination anchors embed the date as "yyyymmdd_<page>" in the href
        # and show a bare page number as their text.
        if "href" in a.attrs and "target" in a.attrs:
            if a["href"].find(date.replace("-", "") + "_") != -1 \
                    and a.text.isdigit():
                max_pages_num += 1
    return max_pages_num
def html_parser(url):
    '''Fetch `url` and return a BeautifulSoup tree parsed with lxml.

    The response encoding is detected from the raw bytes first so that
    non-UTF-8 pages (e.g. GBK-encoded news sites) decode correctly.
    '''
    resp = requests.get(url)
    resp.encoding = BeautifulSoup(resp.content, "lxml").original_encoding
    bs = BeautifulSoup(resp.text, "lxml")
    return bs
def get_chn_stop_words(path):
    '''Load the stop-words txt file: one stripped word per line.

    Fixed: the file handle is now closed deterministically (the old version
    leaked it). Encoding is deliberately left at the platform default because
    the shipped stop-word file is not UTF-8.
    '''
    with open(path, 'r') as f:
        return [line.strip() for line in f]
def convert_to_csr_matrix(model_vector):
    """
    Convert LDA(LSI) model vector to CSR sparse matrix, that could be accepted by Scipy and Numpy.
    # Arguments:
        model_vector: iterable of rows, each row a list of (column_index, value) pairs
        (gensim-style sparse document vectors).
    # Returns:
        A dense numpy array built from the CSR representation.
    """
    data, rows, cols = [], [], []
    for row_idx, line in enumerate(model_vector):
        for col_idx, value in line:
            rows.append(row_idx)
            cols.append(col_idx)
            data.append(value)
    return csr_matrix((data, (rows, cols))).toarray()
def generate_training_set(x, y, split=0.8):
    """Randomly partition samples into train/test lists.

    Each row of `x` (and matching label in `y`) goes to the training set with
    probability `split`, otherwise to the test set.

    :param x: 2-D array-like of samples (rows)
    :param y: sequence of labels, one per row of x
    :param split: training-set probability threshold (default 0.8)
    :return: (train_x, train_y, test_x, test_y) as plain lists
    """
    draws = np.random.random(size=x.shape[0])
    train_x, train_y, test_x, test_y = [], [], [], []
    for idx, draw in enumerate(draws):
        if draw < split:
            train_x.append(x[idx, :])
            train_y.append(y[idx])
        else:
            test_x.append(x[idx, :])
            test_y.append(y[idx])
    return train_x, train_y, test_x, test_y
def is_contain_chn(word):
    """
    Check whether the given string contains Chinese characters.
    :param word: string to examine
    :return: True if at least one character in [\u4e00-\u9fa5], else False
    """
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    return zh_pattern.search(word) is not None
def batch_lpop(client, key, n):
    """Atomically pop the first ``n`` items from a redis list.

    LRANGE and LTRIM are queued on one pipeline so they execute as a
    batch. The previous version discarded the pipeline results, so the
    popped values were irretrievably lost; now they are returned.

    :param client: redis client exposing ``pipeline()``
    :param key: list key
    :param n: number of items to pop from the head
    :return: list of the popped items (may be shorter than n)
    """
    p = client.pipeline()
    p.lrange(key, 0, n-1)
    p.ltrim(key, n, -1)
    results = p.execute()
    # results[0] is the LRANGE reply: the items that were just trimmed off.
    return results[0]
================================================
FILE: legacy_v1/src/Kite/webserver.py
================================================
================================================
FILE: legacy_v1/src/Leorio/__init__.py
================================================
import os
import sys
def add_path(path):
    """Prepend *path* to sys.path unless it is already present."""
    if path in sys.path:
        return
    sys.path.insert(0, path)
# add `./src` dir to system path
# NOTE(review): resolved relative to the process CWD, not this file's
# location — assumes scripts are launched from inside `src`; confirm.
src_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
add_path(src_dir)
================================================
FILE: legacy_v1/src/Leorio/chnstopwords.txt
================================================
ÿ
ǰ
ת
λ
֤ȯ
ο
Υ߱ؾ
£
:
&
*
һһ
~~~~
.
.һ
./
--
ۣ
ۢݣݣ
ۢ٣ģ
P
//
ۢڣ
ۢڣ
}
Ҳ
ۢ٢ޣ
ۢڣ£
ۢ٣
ۢܣ
ۢ٢ۣ
ۣۢ
ۣ
ۢڣ
ۢ٢
ۢݣ
ۢڣ
ۢܣ
ۢڢۣ
ۣۢ
ۢܣ
ۢ٢ݣ
ۢ٢ߣ
ۢ٣
ʣ
ۢ٢
ۢ٢ܣ
ۢ٣
ۢڣ
ۢڢ
ۢڢ٣
ۢ٣ã
ۣۢ
ۣۢ
ۢڢݣ
ۢڢڣ
һ.
ۢ٣
.
ۣ
ۢ٣£
/
ۢ٣
ۣۢ
ۢ٢٣
ۢܣ
ۢܣ
ۣۢ
ۢݣ
ۢ٣
ۢڢ
ۢڢߣ
ۢ٣
ۢڣ
ݣ
://
ۢڢ
ۢݣ
...
...................
ڣأƣɣԣ
ۣۢƣ
ۢ٣
ݡġ䣽
Ȧա
ڣ
ۢۢ٣
ң̣
ۢ٣ţ
ۣݣ
.
ۢڣ
ۢ
ۢڢߣ
ۢڢڣ
ۣۢ
ۢ٣
ۢ٣£
ۢ٣
ۢ٣
ۢ٣
ۢ٢ڣ
ۢڣ
ۢ
ۢ٣
ۢڣ
ۢڢޣ
ۣۢ
ۢڢ
Ԫ
ۢڢ
ۢ٣
::
ۢڣ
ۣۢ
ۢܣ
ۢݣ
ۢޣ
ۢߣ
ۢ
ۢ
?
,
'
?
?
<
>
[
]
(
)
-
+
/
"
;
#
@
գ
sub
exp
sup
sub
Lex
=
ۢݣ
ۢݣ
ۢڣ
ۢڣǣ
ۢ٣
̣
ۣ
......
ʵϰ
ѽ
Ӵ
ȷ
˴
˵
Ȼ
Ω
ֻ
֮
˼
Ӷ
Ļ
ȵ
˵
֮
ǵ
ͽ
µ
λ
ʴ
Ȼ
Ȼ
δ
ο
ʱ
仰˵
֮
ʹ
ʱ
Ȼ
̶
֮
ʹ
֮
˵
˵
˵
ʼ
ɼ
ͬ
һ
˵
˵
ð
ô
ÿ
ÿ
Ī
ij
ij
ijЩ
ı
Ķ
ĸ
Щ
DZ
Ƕ
Ǹ
ǻ
ô
ôЩ
ô
ʱ
Щ
Ը
Ŷ
Ż
ž
ƾ
ƾ
һ
ǡǡ෴
ǰ
ǰ
Ȼ
Ȼ
Ȼ
˼
κ
ƾ
ɶ
ʹ
ô
ʡ
ʱ
ʲô
ʲô
ʹ
ǵ
˭
˭֪
˳
˳
Ƶ
Ȼ
˵
Ȼ
Ȼ
ʹ
ͨ
ͬ
ͬʱ
һ
Ϊ
Ϊ
Ϊ
Ϊʲô
Ϊ
ι
غ
ں
Զ
ѽ
Ҫ
Ҫ
ҪȻ
Ҫ
Ҫô
Ҫ
Ҳ
Ҳ
Ҳ
һ
һ
һ
һ
һ
һ
һ
һ
Ա
Լ
ֻ
Ϊ
Ӵ
ɴ˿ɼ
е
й
Щ
Ǻ
ͬʱ
Խ
˵
ô
ô
ô
զ
˵
ô
ô
ôЩ
ô
ʱ
Щ
֨
֮
֮
֮
֮һ
ֻ
ֻ
ֻҪ
ֻ
λ
Դ
Ը
Ը
Լ
Լ
ܵ
ܵ˵
ܵ˵
֮ܶ
֮
Ȼ
ʹ
Ϊ
ѽ
Ӵ
Ұ
Ű
ʱ
˵
Ȼ
˳
װ
˵
Ͼ
ض
ؽ
û
û
Ȼ
ò
ɿ
ɿ
ܲ
ȻĻ
ʤ
ʱ
ͬ
Ҫ
ֺ
ɵ
ֶ
ô
֪
ֹ
ֹһ
Ե
һ
Ե
˵
˵ú
ȥ
˵
ҹ
ñ
û
˻
ʤ
϶
Ȼ
伫
ȥ
˶
ȥ
ȴ
Ϣ
˵
˺
ε
Ҵ
Ӳ
Ӵ
ӴԺ
ӹŵ
ӹ
ӽԺ
ӿ
ͷ
δ
С
絽
ﵩ
촰˵
Լ
Ը
ָ֮
ڶ
Ȼ
ͥ
ͷ
˵
˶
ĿǰΪֹ
ͷ
ͷ
ȷ
ȵ
Ȼ
Ȼ
ʱ
ǰ
˵
û˵
֮Ȼ
֮
dz
ǵ
ڷ
ͷ
Ȼ
¸
õ
Ͽ
粻
ղ
պ
ߵ
ҹ
ʽ
һ
Ϊ
Ȼ
Ƶ
ʶ
ֲ
߳
ޱ
α
γ
η
ο
ֶΪ
ֹ
ܶ
Ȼ
Ȼ
˵
Ȼ
Ȼ
ͬ
Ϊ
Ҵ
˵
...
֮
֮
֮
ֱ
Ҫ
ϱ
Ϊ
Կ
Ȼ
ʱ
ȥ
Ȼ
Ľ
ľ
Ȼ
ʹ
͵
Ȼ
ٷ
ݳ
ݴ
ʵ
˵
֪
Ϥ
˵
ȥ
ɺ
Ҫ
ü
ϴ
ʵʵ
۴
Ӧ
ʱ
ٵ
һ
·
Ŵ
Ŵ
ʶ
Ȼ
Լ
Ϊ
˵
û
û
ÿ
ÿÿ
ÿʱÿ
Ȼ
Ȼ
Ī
Ī
Ī
Ī
ĬĬ
ĬȻ
ĩ
ѵ
ѵ
ѹ
˵
긴һ
ż
ż
Ʃ
ƫƫ
ƹ
ƽ
ͨ
ʵ
ͷ
ֹ
ǡ
ǡ
ǡǡ
ǡ
ǡ
ǡ
ǧ
ǧ
ǧǧ
в
Ī
̼
֮
ȡ
ȥ
Ȩʱ
ȫ
ȫ
ȫ
ȫȻ
ȫ
Ȼ
Ծ
Ȼ
ոһ
ռ
ս
糣
˵ȵ
ǰ
ͷ
ɪɪ
ɳɳ
ȥ
һ.
һһ
һ
һ
һЩ
һ
һͨ
һ
һ
һʱ
һ
һƬ
һ
һֱ
һ
һ
һת
һ
һ
ȥ
һ
Ȼ
˵
ר
Ҳ˵
˵
ϸ
С
м
ḻ
Ϊ
Ϊʲ
Ϊֹ
Ϊ
Ҫ
֮ǰ
֮
֮
Ҳ˵
Ҳ
˽
ȡ
ƶ
Щ
ʲ
Ϊ
ǰ
Ժ
Թ
ͼ
ΰ
ƺ
ʹ
ʹ
ٽ
Ȼ
Ԫ
Ȳ
Ⱥ
ȫ
ȫ
ȫ
ͬ
֮
ٴ
˵
ֱ
ǰ
ǰ
ǰ
ǿ
ʮ
ȴ
ȴ
ԭ
ּ
ʱ
˫
Ӧ
ӳ
ȡ
ܵ
Ϥ
ֻ
ֻ
ֻ
ֻ
ٿ
ͬһ
ͬ
ʹ
Χ
Ǻ
Ψ
ॵ
ٺ
ô
ʧȥ
õ
ͬ
ʼ
֪
ǵ
ȫ
ȫ
ʵ
ʵ
Ӧ
Դ
Է
Ա
С
Ҫ
Ѿ
Ͱ
㷺
Ӧ
Ӧ
Ӧ
չ
ǿ
ǿ
ǰ
ʱ
γ
ʱ
ó
õ
Ȼ
Ҫ
ܽ
Ω
˼
Ը
Ϊ
ҵ
Ի
ս
ν
/
ȷ
Dz
Ƿ
Ȼ
ͨ
ձ
м
Ч
ʱ
е
е
ĩ##ĩ
˵
ijij
ӭ
ֵ
˵
˴
ʱ
˴
ÿ
ÿ
ÿ
ȼ
Ƚ
ûκ
ע
Ȼ
ر
ص
ִ
ɴ
Ŀǰ
ֱ
ֱ
෴
ͬ
Ӧ
൱
գ
Ӻ
֪
ȷ
ƶ
ͻ
ͻȻ
ڶ
ϰ
̺
ά
ϵ
ܷ
ܹ
Ժ
Դ
Χ
ĪȻ
Ϊ
ж
ʾ
Ҫ
涨
Ʃ
Ϊ
ʶ
˵
˵
˵˵
˭
˭
ת
ת
ת
ﵽ
Ѹ
ȥ
Ҫ
һ
Ӧ
ʵ
ͨ
ѭ
ǰ
ȡ
ش
Ҫ
ֹ
ʱ
ѵ˵
Ҫ
Ƕ
================================================
FILE: legacy_v1/src/Leorio/financedict.txt
================================================
备付金
余额宝
佣金宝
前海
C轮融资
区块链
数字货币
去中心化
正虹科技
千山药机
常山北明
华菱精工
蓝晓科技
兴化股份
红墙股份
世荣兆业
奥飞数据
万兴科技
德邦股份
海辰药业
宣亚国际
长亮科技
蓝色光标
翔港科技
永吉股份
天永智能
成飞集成
北特科技
科顺股份
三五互联
哈空调
新宁物流
湖南投资
华联控股
上海雅仕
海澜之家
富祥股份
药石科技
神雾环保
新城控股
上峰水泥
旗滨集团
久吾高科
天虹股份
横店影视
天泽信息
华发股份
四川双马
国发股份
中国国航
万年青
复旦复华
信达地产
光启技术
中设集团
山西焦化
象屿股份
南京银行
安迪苏
神雾节能
罗普斯金
展鹏科技
罗 牛 山
中石科技
真视通
金发拉比
葛洲坝
大唐电信
劲胜智能
*ST金宇
智飞生物
科力远
东方通信
英可瑞
*ST东海A
阳光股份
中房股份
南华仪器
顺网科技
天邦股份
先导智能
南方航空
华斯股份
森马服饰
尚品宅配
彩虹股份
珠江实业
中交地产
光华科技
云南城投
诚志股份
信息发展
泰格医药
飞乐音响
永悦科技
中国化学
宏昌电子
东北电气
南山控股
我武生物
天威视讯
康隆达
协鑫集成
中旗股份
海峡股份
古越龙山
爱建集团
阳 光 城
百合花
格力电器
楚江新材
瀛通通讯
*ST云网
天健集团
掌阅科技
中坚科技
中欣氟材
得利斯
海天味业
滨江集团
久其软件
当代明诚
吉比特
中源协和
华友钴业
格力地产
冠农股份
重庆啤酒
华英农业
珠海港
杭氧股份
海螺水泥
世茂股份
京山轻机
华联综超
威孚高科
井神股份
华鑫股份
华录百纳
生 意 宝
开山股份
华新水泥
飞利信
南大光电
众信旅游
重庆建工
奥马电器
雷曼股份
招商蛇口
一汽轿车
镇海股份
北新建材
世龙实业
中南文化
海汽集团
*ST匹凸
六国化工
掌趣科技
北大荒
中国建筑
健友股份
大晟文化
中远海特
首旅酒店
中国人寿
金牌厨柜
金地集团
风语筑
海大集团
精测电子
吉宏股份
中海油服
金自天正
湘潭电化
东方雨虹
新元科技
先达股份
烽火通信
唐人神
首开股份
创业软件
华鲁恒升
老板电器
欧普照明
新 希 望
金健米业
高鸿股份
恒大高新
九强生物
盛天网络
五洲交通
中国高科
哈工智能
科达洁能
新南洋
大商股份
东方财富
江河集团
大华股份
中青宝
天玑科技
高升控股
同仁堂
安德利
万方发展
田中精机
合盛硅业
通源石油
湖南海利
广州港
华西能源
蓝盾股份
聚灿光电
辉隆股份
未名医药
柯利达
傲农生物
塔牌集团
金 融 街
ST云维
山西证券
蓝思科技
中国长城
易见股份
新日股份
三诺生物
S佳通
吉艾科技
电工合金
山鹰纸业
金科股份
南 玻A
创新股份
华胜天成
ST景谷
三全食品
新钢股份
银座股份
新华保险
神马股份
沱牌舍得
中国武夷
云南锗业
国旅联合
元成股份
北陆药业
赫美集团
卧龙地产
上港集团
康得新
福建水泥
滨海能源
保龄宝
金冠电气
蓝光发展
梅雁吉祥
大连重工
当代东方
冀东装备
大秦铁路
福星股份
欧派家居
众应互联
绿景控股
华东重机
通达股份
波导股份
京汉股份
电子城
华伍股份
大连圣亚
皮阿诺
美利云
冀东水泥
三峡新材
奇精机械
海量数据
恒基达鑫
金杯电工
金陵体育
音飞储存
上海银行
振东制药
沙河股份
康跃科技
利尔化学
梦百合
凯伦股份
*ST昌九
会稽山
苏垦农发
汇洁股份
华菱星马
杰克股份
万达信息
华策影视
银亿股份
三毛派神
登海种业
盐 田 港
上工申贝
沃森生物
中国石化
中材国际
玲珑轮胎
天华超净
鸿博股份
吉峰农机
众源新材
志邦股份
光洋股份
柳 工
中南建设
博彦科技
光力科技
美亚柏科
兰州民百
宝鼎科技
东湖高新
美亚光电
华帝股份
智度股份
美丽生态
中远海控
东港股份
江阴银行
宝新能源
建发股份
众兴菌业
仟源医药
祁连山
*ST昌鱼
常山药业
贝达药业
建新股份
三六五网
宝色股份
龙马环卫
粤泰股份
钧达股份
天晟新材
晨鸣纸业
金 螳 螂
双鹭药业
中国太保
达威股份
光韵达
界龙实业
华泰股份
天创时尚
尖峰集团
迪马股份
探路者
强力新材
纳思达
立霸股份
创维数字
华谊集团
浙江交科
盐湖股份
广州发展
风神股份
新湖中宝
湖南发展
华夏幸福
片仔癀
中信银行
蓝英装备
万通地产
华讯方舟
奥佳华
捷成股份
山煤国际
海南橡胶
柘中股份
九阳股份
鱼跃医疗
全筑股份
新开源
香江控股
交大昂立
东方网力
元隆雅图
派思股份
沃施股份
唐德影视
天康生物
恒瑞医药
三安光电
东方时尚
冰川网络
华瑞股份
天山股份
海峡环保
长方集团
申通地铁
万和电气
电广传媒
航天长峰
中国海诚
梦舟股份
涪陵电力
铁流股份
青岛海尔
力源信息
金字火腿
梦洁股份
健康元
张 裕A
万盛股份
共达电声
贤丰控股
桂东电力
工大高新
雅戈尔
设研院
联美控股
南京高科
华天科技
奥飞娱乐
航天电子
荣盛发展
柳钢股份
暴风集团
爱迪尔
博雅生物
航天电器
道明光学
机器人
泛微网络
龙元建设
鼎捷软件
岱勒新材
华业资本
鸿特精密
中元股份
科伦药业
海南高速
中科曙光
科达股份
长信科技
海航创新
星光农机
美诺华
龙江交通
江泉实业
大亚圣象
中集集团
天源迪科
富安娜
佛山照明
财信发展
三维丝
美的集团
双汇发展
东方钽业
兰太实业
敦煌种业
国际实业
激智科技
凯龙股份
深科技
恒锋工具
兆日科技
青龙管业
时代万恒
洽洽食品
顺发恒业
美凯龙
银信科技
京投发展
兴发集团
梅花生物
川大智胜
云意电气
金枫酒业
利君股份
科泰电源
数据港
天地源
三维通信
上实发展
伟明环保
中国平安
信雅达
天广中茂
绿地控股
金逸影视
粤高速A
天神娱乐
香雪制药
九牧王
浙大网新
北京银行
贵州茅台
同力水泥
天目药业
隆平高科
三棵树
冠城大通
天能重工
华兰生物
陕西黑猫
厦门国贸
易联众
台基股份
永安行
老百姓
腾龙股份
用友网络
北京城建
再升科技
皖江物流
旺能环境
昆仑万维
江苏银行
国联水产
沙隆达A
爱乐达
广州浪奇
*ST准油
水井坊
聚隆科技
华谊兄弟
安妮股份
五 粮 液
博汇纸业
金洲慈航
苏 泊 尔
中国交建
亚宝药业
吉林化纤
金路集团
同洲电子
二三四五
凤形股份
东方通
齐峰新材
深圳华强
明星电缆
建设银行
安彩高科
北信源
海正药业
亚泰集团
鼎信通讯
木林森
万里石
家家悦
金陵饭店
华中数控
达 意 隆
万马股份
南风股份
卫宁健康
洋河股份
金晶科技
中国重汽
辉煌科技
东兴证券
多伦科技
太化股份
瑞斯康达
招商轮船
雏鹰农牧
恒生电子
巴安水务
宁夏建材
东莞控股
杭州银行
深圳机场
冠昊生物
瑞茂通
贵人鸟
招商证券
华侨城A
方正科技
华孚时尚
龙津药业
拓普集团
天原集团
东晶电子
江铃汽车
新澳股份
天坛生物
安正时尚
隆基股份
名雕股份
长盈精密
澳柯玛
网达软件
粤 水 电
华夏银行
现代制药
金科文化
润达医疗
赛摩电气
花园生物
福建高速
三友化工
无锡银行
长春经开
易尚展示
太极股份
京华激光
中毅达
滨化股份
一拖股份
银河生物
长航凤凰
科士达
全 聚 德
神州泰岳
华电重工
中农立华
上海家化
永艺股份
森特股份
中国铁建
顺鑫农业
紫鑫药业
中信海直
山东路桥
深物业A
上柴股份
克来机电
长城汽车
汉威科技
亚盛集团
福田汽车
申万宏源
广州酒家
埃斯顿
煌上煌
同花顺
鲁商置业
七 匹 狼
桐昆股份
绵石投资
易德龙
上海物贸
伊利股份
合锻智能
华贸物流
上海三毛
东阿阿胶
睿康股份
奋达科技
云南能投
游族网络
杰瑞股份
中国中铁
青岛啤酒
黑猫股份
梅安森
同方股份
绿盟科技
创意信息
浪潮软件
浙能电力
来伊份
华星创业
兰石重装
重庆路桥
西水股份
维维股份
新华百货
中直股份
莎普爱思
中国石油
康盛股份
中海达
哈高科
景兴纸业
众合科技
首钢股份
红旗连锁
川环科技
美尔雅
中远海能
赛为智能
三星医疗
银邦股份
爱施德
光大银行
浙江富润
西藏发展
荣科科技
万业企业
芭田股份
三一重工
银禧科技
广宇集团
神州高铁
常熟银行
证通电子
天瑞仪器
国祯环保
中国神华
洁美科技
中国汽研
兴业银行
法 尔 胜
金花股份
东吴证券
中洲控股
新 大 陆
海普瑞
*ST柳化
天富能源
昌红科技
海南瑞泽
*ST宝实
杰恩设计
铁龙物流
三湘印象
张家界
金禾实业
中远海发
阳光照明
新泉股份
歌力思
榕基软件
厦门港务
上海机电
泸州老窖
澄星股份
靖远煤电
白云机场
宁波港
正丹股份
物产中大
襄阳轴承
天夏智慧
浙江美大
恒立液压
顾家家居
华润双鹤
中航光电
千金药业
圣农发展
佳讯飞鸿
宇通客车
继峰股份
保利地产
天润曲轴
广誉远
深纺织A
南方汇通
奥特佳
利安隆
北京文化
长江润发
新五丰
华舟应急
鲁阳节能
拓尔思
国药一致
徐家汇
科新机电
印纪传媒
千禾味业
汇川技术
雪榕生物
华远地产
上海临港
元力股份
欢瑞世纪
汉鼎宇佑
金新农
透景生命
振华重工
理工光科
新乡化纤
世纪星源
云煤能源
海兴电力
天茂集团
莱美药业
同有科技
福耀玻璃
中钨高新
索菲亚
宋城演艺
交运股份
中体产业
星星科技
鹏博士
乐凯新材
广发证券
歌华有线
三维股份
一汽夏利
上海机场
新农开发
希努尔
乐普医疗
浙数文化
东方新星
闽发铝业
深南电路
豪迈科技
陆家嘴
海鸥卫浴
东富龙
中国银行
东北证券
中国国旅
交通银行
通富微电
四维图新
厦门空港
永和智控
易华录
广弘控股
山东海化
亿晶光电
周大生
重庆百货
棒杰股份
益丰药房
新华龙
鸿利智汇
拓日新能
齐心集团
思创医惠
小康股份
艾比森
山推股份
王府井
晶方科技
雪 莱 特
振静股份
华纺股份
*ST坊展
宏大爆破
二六三
龙净环保
承德露露
迎驾贡酒
丰林集团
粤宏远A
大众交通
锡业股份
骆驼股份
科大智能
燕京啤酒
大港股份
四创电子
獐子岛
龙头股份
海利生物
炬华科技
迪安诊断
光线传媒
锦江股份
齐翔腾达
鞍重股份
汇通能源
凯恩股份
汉邦高科
新 海 宜
四川金顶
华域汽车
利欧股份
苏常柴A
太极实业
海欣股份
大连港
杭齿前进
航民股份
广东甘化
人民网
日盈电子
世联行
天润数娱
贵绳股份
云南白药
中新赛克
远方信息
融钰集团
锦江投资
易成新能
中水渔业
沈阳化工
江海股份
楚天科技
华联股份
东材科技
兴源环境
澳洋科技
民生银行
江苏阳光
洪城水业
华宏科技
神州长城
ST常林
农发种业
美芝股份
旋极信息
首航节能
通鼎互联
凯美特气
渤海轮渡
山河药辅
王子新材
新界泵业
汉缆股份
星辉娱乐
重庆水务
三维工程
美好置业
健帆生物
兆驰股份
通化东宝
乐山电力
天鹅股份
渝 开 发
欣龙控股
长江投资
丽珠集团
青海华鼎
湖北广电
东南网架
黑牡丹
上汽集团
东方明珠
实丰文化
康恩贝
宜宾纸业
海默科技
海油工程
中科金财
东华科技
国投电力
太平鸟
合众思壮
天津港
*ST新城
星宇股份
工商银行
弘宇股份
光明乳业
西藏城投
申科股份
延华智能
露天煤业
岭南控股
*ST青松
华金资本
永太科技
中国电建
国药股份
星源材质
西安旅游
佳隆股份
金力泰
金盾股份
四方股份
上海建工
云投生态
怡达股份
宝信软件
广电电气
日照港
海南椰岛
大龙地产
富春股份
*ST 中绒
新亚制程
建投能源
浙江震元
华懋科技
广电网络
锦州港
金证股份
太安堂
今世缘
商赢环球
多喜爱
冠豪高新
凯利泰
永高股份
东方精工
黔轮胎A
文投控股
高伟达
中原传媒
北京科锐
黄山旅游
菲达环保
博信股份
长城影视
华闻传媒
通策医疗
小天鹅A
徐工机械
陕西煤业
天地科技
合金投资
济民制药
亚星客车
御银股份
海欣食品
韩建河山
联创电子
宁波精达
合诚股份
力生制药
京运通
润邦股份
亚通股份
新华医疗
东诚药业
世纪瑞尔
普邦股份
万润股份
招商银行
中国国贸
华宇软件
锦龙股份
沧州大化
强生控股
兖州煤业
浙商证券
阳光电源
摩恩电气
旷达科技
*ST丹科
中远海科
轻纺城
申能股份
南京医药
中国中车
长久物流
南卫股份
中华企业
德威新材
飞荣达
茂业通信
览海投资
鹿港文化
酒鬼酒
长电科技
龙泉股份
沃特股份
金河生物
大元泵业
天房发展
利亚德
金鹰股份
*ST爱富
史丹利
福建金森
安徽水利
亚太实业
扬子新材
初灵信息
航天机电
中衡设计
福能股份
华东医药
万孚生物
威帝股份
仙琚制药
亚邦股份
东方航空
南京化纤
桂林旅游
苏交科
珠江控股
同达创业
白云电器
浪潮信息
飞科电器
国民技术
金莱特
丰元股份
华鹏飞
西藏旅游
环能科技
神思电子
白云山
山东章鼓
川投能源
上海莱士
北部湾港
中航地产
国投中鲁
莱宝高科
欣旺达
中航机电
古井贡酒
大豪科技
润和软件
乐凯胶片
微光股份
安硕信息
海立股份
三圣股份
科林电气
*ST宏盛
博敏电子
新文化
方直科技
金固股份
安记食品
山东出版
帝龙文化
创新医疗
三聚环保
博思软件
新华文轩
百川能源
瑞康医药
正平股份
长荣股份
海通证券
应流股份
神开股份
津膜科技
国机通用
西部黄金
中泰化学
贵阳银行
凤凰光学
金利华电
三特索道
华东电脑
萃华珠宝
浙江仙通
南洋股份
德尔股份
上海沪工
乐心医疗
中信证券
四方冷链
卫 士 通
九鼎投资
必康股份
麦趣尔
宜华健康
巨人网络
平治信息
科达利
兆易创新
城地股份
步长制药
嘉澳环保
朗迪集团
五洲新春
科森科技
杭电股份
东方电缆
引力传媒
司太立
集友股份
维力医疗
圣达生物
德新交运
赛福天
山东华鹏
大唐发电
凤凰传媒
嘉泽新能
中国中冶
中国铝业
*ST锐电
陕鼓动力
君正集团
中国西电
晋亿实业
宁波热电
渤海活塞
江苏有线
*ST嘉陵
洛阳玻璃
石化油服
厦华电子
星湖科技
*ST京城
人民同泰
新华传媒
益民集团
中路股份
*ST厦工
华北制药
山西汾酒
天业股份
天津磁卡
宁波海运
保税科技
鲁银投资
汉商集团
天海投资
一汽富维
实达集团
S*ST前锋
绿庭投资
中船防务
奥瑞德
哈药股份
豫园股份
富控互动
申达股份
鹏起科技
惠泉啤酒
中珠医疗
国睿科技
老白干酒
时代出版
莫高股份
狮头股份
栖霞建设
宏达矿业
海航基础
腾达建设
驰宏锌锗
天药股份
信威集团
瑞贝卡
*ST海润
盘江股份
广东明珠
天科股份
三房巷
通葡股份
正源股份
亚星化学
营口港
XD万华化
广汇汽车
华仪电气
江苏舜天
重庆港九
亿利洁能
嘉化能源
航天信息
外运发展
赣粤高速
国电南自
大湖股份
广汇能源
ST成城
中昌数据
民丰特纸
赤天化
瀚叶股份
海航控股
江苏吴中
华资实业
国中水务
安通控股
太原重工
永泰能源
宝硕股份
中国船舶
*ST新亿
太极集团
西宁特钢
*ST天成
大名城
东方金钰
中葡股份
海泰发展
东风科技
宋都股份
康欣新材
宁波联合
四川路桥
东风汽车
朗新科技
隆盛科技
中孚信息
民德电子
南京聚隆
新雷能
贝斯特
会畅通讯
朗科智能
辰安科技
山鼎设计
迈克生物
康拓红外
双杰电气
鲍斯股份
航新科技
中光防雷
迦南科技
三环集团
腾信股份
飞天诚信
光环新网
光一科技
麦捷科技
邦讯技术
聚飞光电
吴通控股
华昌达
海联讯
新莱应材
飞力达
纳川股份
福安药业
佳士科技
通裕重工
智慧松德
迪威迅
新研股份
科融环境
量子高科
星普医科
大富科技
锦富技术
锐奇股份
易世达
坚瑞沃能
盛运环保
康芝药业
华谊嘉信
世纪鼎利
福瑞股份
华力创通
回天新材
上海凯宝
梅泰诺
金龙机电
宝德股份
立思辰
盈趣科技
香山股份
麦格米特
凯中精密
普路通
南兴装备
万达电影
中矿资源
葵花药业
燕塘乳业
奥瑞金
美盛文化
顾地科技
猛狮科技
德联集团
万润科技
民盛金科
三垒股份
瑞和股份
艾格拉斯
亚夏汽车
ST龙力
八菱科技
圣阳股份
中京电子
雷柏科技
群兴玩具
顺灏股份
三七互娱
千红制药
东方铁塔
鸿路钢构
云图控股
林州重机
海源机械
光正集团
天桥起重
日发精机
恺英网络
达华智能
涪陵榨菜
科林环保
金正大
益生股份
天马精化
壹桥股份
龙星化工
江苏神通
尤夫股份
胜利精密
凯撒文化
中原特钢
达实智能
爱仕达
建研集团
信邦制药
南洋科技
东山精密
千方科技
亚太药业
台海核电
神剑股份
森源电气
富临运业
顺丰控股
漫步者
高乐股份
潮宏基
海宁皮城
人人乐
*ST三泰
博云新材
大 东 南
德奥通航
升达林业
步 步 高
合兴包装
恒康医疗
特 尔 佳
利达光电
巴士在线
深圳惠程
中航三鑫
常铝股份
新光圆成
恒星科技
天马股份
三变科技
广博股份
浔兴股份
山河智能
万邦德
沙钢股份
凯瑞德
云南旅游
轴研科技
久联发展
丽江旅游
华信国际
东信和平
霞客环保
德豪润达
华邦健康
华润三九
中弘股份
中通客车
凯迪生态
中粮生化
山大华特
*ST天化
云内动力
现代投资
东凌国际
云南铜业
吉电股份
陕西金叶
冰轮环境
云铝股份
凯撒旅游
长江证券
*ST平能
通化金马
浩物股份
新华制药
南风化工
苏宁环球
恒逸石化
厦门信达
*ST华泽
建新矿业
东方电子
海航投资
平潭发展
太阳能
海南海药
供销大集
航天发展
中天金融
粤电力A
万泽股份
万 家 乐
美菱电器
荣安地产
国际医学
华塑控股
鄂武商A
渤海金控
胜利股份
华数传媒
广聚能源
皇庭国际
泛海控股
中国天楹
神州数码
中粮地产
深深房A
深赤湾A
深深宝A
深中华A
全新好
深振业A
华测导航
ST生化
和仁科技
牧原股份
传艺科技
庄园牧场
浩云科技
华钰矿业
元祖股份
万邦达
曲江文旅
贵航股份
汉森制药
长江电力
吉祥航空
华仁药业
金通灵
红蜻蜓
万东医疗
新日恒力
光大证券
伊力特
张江高科
中南传媒
捷顺科技
瀚蓝环境
维宏股份
精锻科技
深华发A
曲美家居
中威电子
景嘉微
安信信托
赢时胜
天翔环境
永利股份
中金环境
达志科技
东方日升
金明精机
金龙汽车
兰州黄河
湘电股份
国机汽车
奇信股份
龙大肉食
中山公用
杭锅股份
视觉中国
恒信东方
南天信息
福成股份
特变电工
江苏国信
深天地A
北京城乡
广日股份
宏图高科
中兴商业
宜华生活
潍柴重机
文山电力
尚荣医疗
羚锐制药
围海股份
好利来
优博讯
远达环保
精伦电子
慈文传媒
安井食品
隧道股份
恒丰纸业
黑牛食品
雄韬股份
东阳光科
兄弟科技
华铁股份
农 产 品
雷鸣科化
翠微股份
山东威达
ST南化
百利科技
*ST沪科
博深工具
清水源
新天然气
信捷电气
哈森股份
钱江生化
杭钢股份
奥克股份
马应龙
丰乐种业
登云股份
三角轮胎
新开普
永鼎股份
奥拓电子
嘉欣丝绸
华自科技
新朋股份
文科园林
四川九洲
美联新材
三元股份
柏堡龙
茂业商业
正邦科技
新力金融
深圳能源
悦达投资
四方达
川化股份
南京公用
朗姿股份
招商公路
广汽集团
小商品城
金石东方
上海环境
中核钛白
雪峰科技
光电股份
集智股份
国元证券
本钢板材
名家汇
鲁 泰A
西安饮食
南京新百
华扬联众
数字政通
新大洲A
北辰实业
仁和药业
南威软件
德尔未来
奥维通信
博实股份
凌云股份
东江环保
中环股份
青青稞酒
华统股份
皖能电力
天龙股份
荃银高科
新世界
越秀金控
龙韵股份
利源精制
英飞拓
奇正藏药
金亚科技
丽鹏股份
超图软件
金安国纪
晨光文具
新疆浩源
卓郎智能
东风股份
洪涛股份
南都电源
上海九百
江南高纤
吴江银行
航发科技
浦东建设
科大国创
汇中股份
林海股份
永贵电器
*ST智慧
比亚迪
泰达股份
华茂股份
蓝科高新
深高速
宁波富邦
和而泰
银轮股份
昆药集团
力星股份
双环传动
兰花科创
城投控股
哈尔斯
路畅科技
上海电力
人福医药
汉得信息
数码科技
潍柴动力
联环药业
三 力 士
启明星辰
四川成渝
杭州解百
科锐国际
共进股份
三峡水利
北大医药
东土科技
神奇制药
丰原药业
读者传媒
中粮糖业
雪人股份
富奥股份
凤竹纺织
桂林三金
天沃科技
鹏翎股份
福达股份
龙宇燃油
广东鸿图
兴业证券
神州信息
浙江广厦
春兴精工
恒力股份
姚记扑克
同济堂
双箭股份
漳州发展
紫光股份
裕兴股份
天龙光电
九 芝 堂
三鑫医疗
秀强股份
兴业股份
天银机电
石基信息
大东方
安控科技
恒泰实达
华昌化工
吉林高速
津滨发展
远东传动
常青股份
宜通世纪
宝鹰股份
中国联通
德美化工
民生控股
第一创业
北方国际
惠而浦
道恩股份
加加食品
西昌电力
中新科技
皖新传媒
金一文化
汉王科技
*ST沈机
鲁信创投
广汇物流
快克股份
国投资本
诺 普 信
幸福蓝海
中航电子
浦东金桥
科远股份
舒泰神
乔治白
京威股份
兴民智通
惠发股份
闰土股份
泰胜风能
皇氏集团
国金证券
瑞尔特
科力尔
吉林敖东
天喻信息
新华联
ST慧球
宜安科技
西部证券
中色股份
苏州高新
平高电气
智云股份
宝钢股份
际华集团
晋西车轴
山东高速
津劝业
新纶科技
丰华股份
大禹节水
欧亚集团
东音股份
金徽酒
华能国际
*ST上普
博闻科技
精准信息
天壕环境
江化微
雪浪环境
利德曼
东华软件
昆百大A
中电广通
*ST运盛
摩登大道
亿利达
长白山
上海医药
中航重机
中电鑫龙
思源电气
杭萧钢构
佳发安泰
金隅集团
远兴能源
安居宝
精艺股份
江苏国泰
山东金泰
天业通联
康达尔
三超新材
中原环保
安车检测
中持股份
西部矿业
通润装备
铜陵有色
开润股份
诚迈科技
大西洋
克明面业
首商股份
武汉控股
巨轮智能
珠江啤酒
华安证券
美康生物
乐金健康
精华制药
九洲电气
菲林格尔
华达科技
中装建设
游久游戏
健民集团
北部湾旅
申华控股
宝光股份
大康农业
春兰股份
风范股份
以岭药业
百隆东方
软控股份
金智科技
海螺型材
百联股份
中原高速
商业城
国海证券
中国软件
闽东电力
富春环保
恒银金融
吉林森工
莱茵体育
哈投股份
楚天高速
金运激光
西南证券
川仪股份
欧浦智网
皖天然气
爱康科技
西藏矿业
方大化工
文化长城
万 科A
郴电国际
南宁百货
开元股份
联明股份
宝莱特
雄塑科技
创力集团
联发股份
国统股份
华东科技
成都路桥
紫金矿业
祥源文化
泰合健康
中飞股份
仙坛股份
宁波高发
中原证券
西藏药业
广晟有色
宝胜股份
朗源股份
华峰超纤
奥康国际
国轩高科
汤臣倍健
盛通股份
新华网
力帆股份
天圣制药
环旭电子
通宝能源
恒立实业
山东药玻
云赛智联
华映科技
贵糖股份
旭光股份
新 华 都
兔 宝 宝
宜昌交运
广信材料
广泽股份
开创国际
长青集团
南宁糖业
大洋电机
上海电气
林洋能源
任子行
四环生物
黔源电力
中国动力
三雄极光
纽威股份
双星新材
绿城水务
民和股份
东睦股份
诚意药业
大恒科技
绿茵生态
安利股份
和邦生物
日上集团
中化国际
隆基机械
青岛双星
东安动力
中视传媒
开开实业
卧龙电气
中恒集团
天宸股份
中信重工
益佰制药
东方海洋
如意集团
银鸽投资
富森美
中国医药
圆通速递
开滦股份
慈星股份
中煤能源
宁沪高速
泰豪科技
浙江世宝
中际旭创
迪森股份
长城动漫
烽火电子
万向德农
双良节能
佛塑科技
双成药业
海格通信
双象股份
南岭民爆
合肥百货
寒锐钴业
江南化工
杭叉集团
特 力A
万顺股份
上海电影
金种子酒
中电环保
苏州固锝
中炬高新
爱普股份
合康新能
科斯伍德
友阿股份
华海药业
中泰股份
先河环保
博世科
亚厦股份
嘉应制药
海康威视
*ST河化
中文在线
惠达卫浴
青海春天
南方传媒
国新能源
新集能源
长园集团
第一医药
新美星
欣天科技
福鞍股份
太平洋
中航高科
长源电力
鲁西化工
宏创控股
光迅科技
东易日盛
贵州百灵
宁波富达
绿康生化
国泰君安
龙源技术
新野纺织
长缆科技
江南水务
安源煤业
长安汽车
华电国际
华建集团
美达股份
申通快递
豫能控股
聚龙股份
恩华药业
晓程科技
中工国际
亚太科技
方正证券
中牧股份
珠江钢琴
神宇股份
红阳能源
天音控股
航发控制
浙江鼎力
北纬科技
奥联电子
中铁工业
徕木股份
吉鑫科技
明星电力
国农科技
花王股份
华微电子
九州通
天目湖
拓斯达
鸿达兴业
广生堂
今飞凯达
广深铁路
北玻股份
恒宝股份
赛升药业
恒为科技
江淮汽车
达安基因
海越股份
唐山港
向日葵
汇源通信
莱茵生物
道道全
四川长虹
智光电气
融捷股份
健盛集团
灵康药业
长生生物
万丰奥威
五矿资本
外高桥
启迪古汉
凤凰股份
鑫茂科技
赛轮金宇
节能风电
华虹计通
浙江医药
毅昌股份
百花村
康缘药业
梦网集团
岳阳林纸
济川药业
海信科龙
朗玛信息
银泰资源
苏利股份
西藏天路
永新股份
报 喜 鸟
嘉寓股份
京泉华
新时达
汇冠股份
国瓷材料
九洲药业
浙江东方
上海梅林
江苏雷利
科隆股份
西部创业
大同煤业
海虹控股
*ST郑煤
国电电力
盾安环境
我乐家居
时代新材
瑞凌股份
明家联合
东方电气
中成股份
沪电股份
深圳燃气
中国重工
湖北能源
东方集团
圣邦股份
西部牧业
航天通信
安琪酵母
东北制药
好当家
日月股份
华明装备
海亮股份
星云股份
金山股份
赛托生物
安诺其
积成电子
西王食品
长高集团
桃李面包
海印股份
佳沃股份
京蓝科技
百大集团
九安医疗
通程控股
四川美丰
九有股份
怡 亚 通
京天利
普利制药
深天马A
吉视传媒
辽宁成大
泰尔股份
中国电影
阳泉煤业
联络互动
万林股份
金鸿控股
日出东方
东旭光电
中国银河
理邦仪器
北斗星通
峨眉山A
红 宝 丽
漳泽电力
复星医药
五矿发展
太空板业
文一科技
兴业科技
内蒙华电
博济医药
生物股份
清新环境
新北洋
福斯特
道氏技术
特发信息
长江传媒
浙江众成
国美通讯
崇达技术
中富通
维尔利
弘业股份
春秋航空
汇鸿集团
友好集团
江西铜业
苏试试验
太阳纸业
德宏股份
艾华集团
裕同科技
海德股份
乾照光电
卫信康
康斯特
众业达
国风塑业
鹭燕医药
众泰汽车
麦达数字
弘讯科技
大连电瓷
亿帆医药
新洋丰
五洋科技
智慧能源
华西股份
康尼机电
中 关 村
特锐德
中国核建
豫光金铅
艾迪精密
新兴铸管
上海石化
理工环科
雅本化学
中超控股
河钢股份
四通股份
石大胜华
黑芝麻
中能电气
浩丰科技
远大智能
内蒙一机
苏大维格
南京熊猫
兴蓉环境
中化岩土
中钢国际
黄河旋风
康美药业
邦宝益智
凯乐科技
文峰股份
广百股份
武汉中商
数字认证
西部建设
*ST华菱
佳都科技
*ST中基
电科院
铜峰电子
飞马国际
华泰证券
航发动力
黄山胶囊
三元达
高能环境
中原内配
恒天海龙
宝钢包装
天润乳业
通产丽星
岷江水电
拉芳家化
赞宇科技
瑞特股份
三联虹普
宏润建设
金海环境
珈伟股份
航天工程
精达股份
蓝黛传动
中来股份
岭南园林
科华恒盛
南通锻压
银河电子
宝通科技
华立股份
庞大集团
中国核电
腾邦国际
建艺集团
康强电子
青岛金王
荣泰健康
凯盛科技
北京利尔
盈峰环境
奥 特 迅
福日电子
宗申动力
京东方A
濮耐股份
中潜股份
*ST三维
中亚股份
*ST一重
*ST松江
京能电力
江山股份
综艺股份
巨化股份
华媒控股
洪都航空
红宇新材
海思科
北方华创
宝泰隆
中科创达
思维列控
安靠智电
思特奇
司尔特
山东矿机
高德红外
华脉科技
凌霄泵业
新潮能源
柳州医药
中顺洁柔
华能水电
宏达新材
祥龙电业
启迪设计
南山铝业
惠伦晶体
银河磁体
华锦股份
中储股份
良信电器
中科三环
碧水源
红豆股份
火炬电子
玉龙股份
德赛电池
得邦照明
巨星科技
骅威文化
溢多利
久远银海
迪瑞医疗
国恩股份
润欣科技
同和药业
超华科技
茂化实华
钱江水利
亿通科技
奥普光电
联创互联
海洋王
海马汽车
通宇通讯
青松股份
曙光股份
中联重科
紫光国芯
陕天然气
惠威科技
国星光电
久之洋
金城医药
炼石有色
三川智慧
万讯自控
可立克
雪迪龙
三丰智能
合肥城建
启明信息
模塑科技
东方国信
海南矿业
桂冠电力
博晖创新
龙溪股份
宁波建工
全通教育
亚振家居
国信证券
钢研高纳
达刚路机
*ST重钢
山东钢铁
恒泰艾普
维科精华
经纬纺机
网宿科技
吉药控股
抚顺特钢
海利尔
出版传媒
亚太股份
荣之联
珍宝岛
宁波银行
星徽精密
全志科技
中闽能源
温州宏丰
大冷股份
蓝焰控股
华体科技
云天化
东宝生物
广济药业
拓维信息
科华控股
中再资环
泰禾集团
三德科技
宏发股份
运达科技
川润股份
博瑞传播
皖通科技
湘邮科技
汇顶科技
思美传媒
岱美股份
沃华医药
日播时尚
恒通股份
精工钢构
太龙药业
泰和新材
昊华能源
华电能源
瑞泰科技
华天酒店
新黄浦
许继电气
渝三峡A
广安爱众
安泰集团
永辉超市
天保基建
艾德生物
能科股份
东华测试
宝钛股份
贵广网络
盛路通信
永安药业
悦心健康
久立特材
中润资源
新联电子
好想你
长海股份
金陵药业
万集科技
秦川机床
佛慈制药
荣丰控股
广联达
诺邦股份
华灿光电
东方创业
坚朗五金
伟星股份
新天科技
金浦钛业
英特集团
东方电热
英洛华
华光股份
安科生物
东软载波
海王生物
跃岭股份
威华股份
高盟新材
汉钟精机
焦点科技
标准股份
仁智股份
海翔药业
南华生物
扬帆新材
瑞丰高材
乐惠国际
航天动力
起步股份
高新兴
秦安股份
特一药业
路通视信
诺力股份
延长化建
古鳌科技
中百集团
赛隆药业
中国宝安
南方轴承
西部材料
三花智控
惠博普
ST新梅
明牌珠宝
苏州科达
东方锆业
建设机械
天域生态
富临精工
龙建股份
海伦哲
安徽合力
中新药业
皖维高新
韵达股份
耀皮玻璃
海陆重工
牧高笛
英搏尔
数源科技
金圆股份
莲花健康
合力泰
安泰科技
中钢天源
平安银行
大众公用
三利谱
华平股份
宏达高科
万通智控
恒顺众昇
华铁科技
传化智联
东软集团
国光股份
同济科技
天山生物
晶盛机电
金信诺
百润股份
今天国际
金龙羽
天宝食品
刚泰控股
*ST普林
河北宣工
中航飞机
海伦钢琴
惠天热电
日海通讯
环球印务
顶点软件
中国卫星
中宠股份
世纪华通
方正电机
威 尔 泰
联建光电
比音勒芬
禾丰牧业
陕国投A
多氟多
海波重科
伟隆股份
创元科技
赛象科技
香溢融通
雅克科技
宏辉果蔬
新疆天业
华丽家族
长城电工
坤彩科技
和佳股份
山东地矿
中航电测
海能达
太钢不锈
东方网络
海鸥股份
全柴动力
洲际油气
德展健康
广田集团
科华生物
厦门钨业
城市传媒
利民股份
嘉麟杰
同大股份
联化科技
国检集团
中文传媒
诺德股份
科陆电子
天汽模
章源钨业
振华科技
ST明科
金刚玻璃
红日药业
沧州明珠
中鼎股份
金轮股份
东方银星
亚泰国际
弘亚数控
财通证券
松发股份
嘉诚国际
兰生股份
塞力斯
大北农
隆华节能
通用股份
GQY视讯
中电电机
*ST大控
苏州恒久
康泰生物
中科新材
凯文教育
天马科技
紫江企业
中恒电气
胜宏科技
华意压缩
平煤股份
宁波中百
联创光电
鲁亿通
恒通科技
同兴达
包钢股份
潞安环能
杰赛科技
大冶特钢
广西广电
法拉电子
茶花股份
道森股份
得润电子
粤 传 媒
清源股份
天铁股份
瑞普生物
三星新材
东方证券
银之杰
金牛化工
飞亚达A
蒙草生态
分众传媒
孚日股份
迅游科技
金麒麟
江山欧派
浙富控股
大金重工
顺络电子
隆鑫通用
中航资本
广电运通
华工科技
华鼎股份
温氏股份
科大讯飞
上海能源
长鹰信质
双塔食品
水星家纺
勤上股份
鸣志电器
方盛制药
大通燃气
宁波华翔
汇金通
青山纸业
湖南天雁
星期六
美邦服饰
艾艾精工
明泰铝业
星网锐捷
新宝股份
中马传动
宏盛股份
天顺风能
博士眼镜
禾望电气
至正股份
钱江摩托
富瀚微
天首发展
鼎龙股份
秦港股份
动力源
天通股份
甘肃电投
国盛金控
江特电机
远大控股
澳洋顺昌
首创股份
两面针
宁波东力
科信技术
*ST大有
远光软件
创兴资源
格林美
金钼股份
佩蒂股份
东珠景观
新 和 成
易事特
*ST紫学
置信电气
武进不锈
江西长运
神力股份
金贵银业
博通股份
北矿科技
安奈儿
科迪乳业
红 太 阳
*ST万里
硕贝德
康力电梯
航天晨光
冀中能源
荣华实业
中央商场
嘉事堂
英威腾
星帅尔
凯普生物
斯莱克
农业银行
常熟汽饰
龙蟒佰利
东方能源
万里马
万安科技
老凤祥
美锦能源
永创智能
一心堂
新疆众和
新安股份
桂发祥
智慧农业
松芝股份
奥翔药业
海兰信
高争民爆
郑煤机
远 望 谷
长春燃气
酒钢宏兴
世名科技
中航沈飞
乾景园林
正业科技
爱尔眼科
香梨股份
ST信通
英唐智控
大庆华科
中国科传
利群股份
上海凤凰
振华股份
博威合金
盛洋科技
美尚生态
华正新材
世运电路
圣龙股份
海特高新
冠福股份
键桥通讯
硅宝科技
罗顿发展
汇纳科技
海联金汇
株冶集团
苏宁云商
大连友谊
金岭矿业
华测检测
连云港
和科达
京新药业
国泰集团
合纵科技
通光线缆
方大炭素
安科瑞
怡球资源
国创高新
海 利 得
菲利华
银宝山新
北新路桥
电魂网络
威创股份
诚益通
世嘉科技
搜于特
威海广泰
市北高新
美晨生态
鼎汉技术
江南嘉捷
安 纳 达
通威股份
亚星锚链
迪生力
深 赛 格
*ST墨龙
园城黄金
雷迪克
浙江永强
兆丰股份
九华旅游
威龙股份
濮阳惠成
ST仰帆
渤海股份
普丽盛
蓝丰生化
卫星石化
天和防务
南 京 港
景峰医药
石化机械
天舟文化
金桥信息
盈方微
耐威科技
亿联网络
博创科技
南钢股份
超声电子
ST山水
中油资本
棕榈股份
正元智慧
日科化学
号百控股
华荣股份
劲拓股份
海信电器
天士力
电连技术
巨力索具
鞍钢股份
同益股份
泰晶科技
格尔软件
恒源煤电
北方导航
赛意信息
华银电力
横河模具
博腾股份
永清环保
英飞特
长青股份
德艺文创
三晖电气
劲嘉股份
联得装备
金诚信
保变电气
中信国安
昊志机电
凯众股份
纳尔股份
天宇股份
卓翼科技
京能置业
好莱客
新华锦
正泰电器
吉华集团
兴森科技
视源股份
神州易桥
同为股份
*ST圣莱
云海金属
泰山石油
沃尔核材
马钢股份
海天精工
沪宁股份
誉衡药业
正海磁材
恒润股份
美年健康
全信股份
康弘药业
高澜股份
正裕工业
辰欣药业
神农基因
大理药业
卫光生物
阳煤化工
赢合科技
金太阳
睿能科技
英派斯
氯碱化工
百川股份
韶能股份
启迪桑德
雷科防务
上海洗霸
世纪天鸿
先锋新材
光大嘉宝
中科电气
超讯通信
国电南瑞
快乐购
深大通
华升股份
优德精密
四通新材
富满电子
亚玛顿
依顿电子
碳元科技
三祥新材
百傲化学
九鼎新材
中利集团
杉杉股份
哈三联
基蛋生物
美克家居
新宏泰
西仪股份
华控赛格
航天科技
金财互联
杭州高新
斯太尔
友邦吊顶
荣晟环保
新奥股份
中孚实业
大参林
当升科技
中青旅
宝莫股份
太阳电缆
东华能源
如通股份
苏博特
浙江龙盛
信立泰
上海天洋
浦发银行
广宇发展
亚光科技
飞鹿股份
晨化股份
深南电A
聚光科技
法兰泰克
中公高科
新能泰山
三木集团
力盛赛车
*ST中安
海顺新材
联泰环保
大连热电
中国中期
鹏鹞环保
皖通高速
天奇股份
君禾股份
宁波韵升
益盛药业
新易盛
精功科技
贝因美
东方园林
西山煤电
光莆股份
焦作万方
佳创视讯
三夫户外
汇嘉时代
美盈森
鹏辉能源
绝味食品
博天环境
铁汉生态
百洋股份
通达动力
TCL 集团
兆新股份
中金黄金
美思德
伟星新材
拓邦股份
三江购物
东方市场
高新发展
寿仙谷
龙洲股份
金达威
永兴特钢
天华院
中兵红箭
农尚环境
宏达股份
海得控制
中材节能
维格娜丝
和晶科技
浙江东日
天龙集团
广信股份
大丰实业
岳阳兴长
恒锋信息
中核科技
泰禾光电
福晶科技
双林股份
先进数通
五矿稀土
均胜电子
富邦股份
东旭蓝天
厚普股份
开能环保
长春一东
中天科技
金域医学
威星智能
金能科技
华峰氨纶
合力科技
麦迪电气
欧比特
亚威股份
中金岭南
中国出版
丹邦科技
爱司凯
开立医疗
深桑达A
华阳集团
至纯科技
深圳新星
乐歌股份
朗博科技
阳普医疗
天孚通信
金风科技
金洲管道
康惠制药
熊猫金控
新光药业
盛屯矿业
太辰光
江中药业
秋林集团
富瑞特装
恒华科技
方大特钢
兴业矿业
八一钢铁
容大感光
宝馨科技
露笑科技
天海防务
晶瑞股份
川金诺
上海亚虹
亿纬锂能
罗莱生活
贵研铂业
百达精工
深冷股份
锌业股份
创业环保
振芯科技
尔康制药
鄂尔多斯
电光科技
新筑股份
雅百特
北方稀土
山东黄金
瑞丰光电
穗恒运A
新疆火炬
湘油泵
龙蟠科技
移为通信
康德莱
美力科技
辉丰股份
捷荣技术
金发科技
嘉凯城
安凯客车
藏格控股
万里扬
雄帝科技
诚邦股份
新通联
东尼电子
北巴传媒
醋化股份
万向钱潮
广东榕泰
奥士康
口子窖
景旺电子
创源文化
*ST弘高
西部资源
金卡智能
熙菱信息
佐力药业
飞凯材料
省广股份
天赐材料
普利特
四方精创
欧普康视
完美世界
创业黑马
赤峰黄金
蓝帆医疗
北方股份
普洛药业
天际股份
恒邦股份
石英股份
新宙邦
浪莎股份
上海贝岭
翰宇药业
韶钢松山
盐津铺子
设计总院
森霸股份
开尔新材
红星发展
乐通股份
重庆燃气
中广核技
新宏泽
戴维医疗
鹏欣资源
东方中科
晨光生物
麦迪科技
日机密封
德赛西威
上海钢联
有研新材
华通医药
凌钢股份
依米康
地尔汉宇
北讯集团
三钢闽光
帝王洁具
快意电梯
正海生物
中国巨石
大千生态
康达新材
恒顺醋业
经纬电材
中大力德
皇马科技
洪汇新材
横店东磁
超频三
新天药业
先锋电子
江粉磁材
大族激光
新坐标
南极电商
森远股份
安阳钢铁
台华新材
蓝海华腾
中材科技
朗科科技
金鸿顺
歌尔股份
通合科技
智能自控
纵横通信
华铭智能
中油工程
达安股份
银星能源
翔鹭钨业
大立科技
永东股份
凯发电气
永安林业
春风动力
空港股份
星网宇达
中捷资源
武汉凡谷
伊之密
长江通信
南国置业
常宝股份
江龙船艇
鲁北化工
盛讯达
丝路视觉
美格智能
新劲刚
阿石创
银江股份
金银河
国脉科技
蒙娜丽莎
豪能股份
必创科技
辅仁药业
国科微
泰嘉股份
中船科技
北化股份
大烨智能
赣能股份
中通国脉
中设股份
梅轮电梯
天顺股份
勘设股份
富煌钢构
西陇科学
华大基因
英 力 特
宝利国际
恒林股份
新凤鸣
海川智能
联诚精密
天齐锂业
金雷风电
*ST新赛
光威复材
中环装备
大博医疗
金溢科技
正川股份
华源控股
雅化集团
康旗股份
罗平锌电
华锋股份
德创环保
红相电力
双环科技
晨丰科技
浙商中拓
宇顺电子
神火股份
中兴通讯
珀莱雅
中颖电子
捷捷微电
生益科技
昭衍新药
中天能源
广哈通信
兴齐眼药
汇金股份
广和通
长春高新
春秋电子
联合光电
亨通光电
延江股份
光明地产
金瑞矿业
智动力
长盛轴承
昇兴股份
洲明科技
友讯达
中广天择
*ST东数
荣盛石化
东宏股份
华森制药
索通发展
英维克
西泵股份
宏达电子
闻泰科技
东方嘉盛
湖南黄金
安洁科技
莱绅通灵
杭州园林
贝瑞基因
银龙股份
华凯创意
一品红
国光电器
中环环保
欧菲科技
高科石化
意华股份
威唐工业
新国都
茂硕电源
光库科技
澄天伟业
精研科技
剑桥科技
璞泰来
韦尔股份
跨境通
天成自控
水晶光电
喜临门
博迈科
天安新材
信隆健康
江丰电子
高斯贝尔
美都能源
立讯精密
普莱柯
东杰智能
盛达矿业
新经典
江苏索普
金辰股份
扬农化工
新晨科技
和顺电气
旭升股份
和胜股份
润禾材料
北京君正
莱克电气
建研院
金石资源
东百集团
金杯汽车
同德化工
英联股份
伊戈尔
光弘科技
拉夏贝尔
盛弘股份
苏奥传感
迪贝电气
赛腾股份
佳力图
爱柯迪
赣锋锂业
广东骏亚
丽岛新材
东方材料
泰瑞机器
大业股份
上海新阳
国芳集团
盘龙药业
润都股份
长川科技
科创信息
冀凯股份
吉大通信
湖北宜化
铭普光磁
安图生物
银都股份
九典制药
亚士创能
万隆光电
振江股份
晨曦航空
西藏珠峰
祥和实业
华信新材
凯莱英
立昂技术
陇神戎发
鲁抗医药
亚翔集成
科创新源
维业股份
潜能恒信
贝肯能源
阳谷华泰
畅联股份
众生药业
百利电气
宇环数控
阿科力
白银有色
士兰微
易明医药
*ST众和
方大集团
中科信息
张家港行
双一科技
好太太
索菱股份
集泰股份
川恒股份
洛阳钼业
汇金科技
原尚股份
晶华新材
佛燃股份
百华悦邦
英科医疗
洛凯股份
*ST佳电
三孚股份
中曼石油
*ST德力
建科院
康普顿
*ST中富
香飘飘
ST保千里
安达维尔
盛和资源
德生科技
永福股份
海特生物
金奥博
新余国科
信维通信
深康佳A
国立科技
科恒股份
风华高科
万马科技
华通热力
扬杰科技
弘信电子
西菱动力
名臣健康
科蓝软件
山东赫达
保隆科技
贵州燃气
皇台酒业
南纺股份
顺威股份
乐视网
豫金刚石
太龙照明
海达股份
步森股份
成都银行
*ST昆机
*ST吉恩
御家汇
明阳电路
华西证券
*ST建峰
*ST钒钛
*ST烯碳
嘉友国际
中源家居
淳中科技
南都物业
养元饮品
ST网力
天风证券
沪硅产业
新乳业
山鹰国际
湘佳股份
明德生物
新强联
东阳光
中建环能
东方盛虹
河钢资源
达刚控股
青松建化
*ST熊猫
宁德时代
*ST宏图
上海凯鑫
科拓生物
贝仕达克
时空科技
华峰铝业
泰禾智能
聚合顺
首航高科
江苏租赁
鼎胜新材
蔚蓝生物
*ST联络
双林生物
欧菲光
天味食品
吉翔股份
长虹华意
长源东谷
天润工业
*ST梦舟
*ST中南
中贝通信
瀚川智能
弘高创意
中国电研
海晨股份
普元信息
京粮控股
米奥会展
苏州龙杰
安道麦A
成都燃气
*ST金正
硕世生物
上海瀚讯
公牛集团
凯赛生物
森麒麟
雷曼光电
*ST大晟
帝尔激光
*ST济堂
红相股份
凯迪退
城地香江
南兴股份
妙可蓝多
宏和科技
圣济堂
中盐化工
*ST藏格
华软科技
南 玻A
长城证券
帅丰电器
上机数控
品渥食品
协和电子
顺利办
奥特维
*ST界龙
三美股份
广联航空
爱美客
华特气体
联创股份
青农商行
钢研纳克
倍加洁
丰山集团
中国通号
中粮资本
睿创微纳
宇晶股份
奥海科技
杭可科技
东岳硅材
锦江酒店
罗博特科
银泰黄金
易天股份
百亚股份
*ST劝业
传音控股
苏农银行
ST华嵘
罗欣药业
ST冠福
佳禾智能
*ST众泰
中科软
青岛银行
甬金股份
众望布艺
瑞联新材
浙江力诺
海信视像
爱旭股份
福光股份
京沪高铁
申昊科技
美畅股份
甘源食品
天箭科技
国新健康
国茂股份
竞业达
今创集团
科瑞技术
甘咨询
ST浩源
久量股份
创世纪
*ST奋达
ST新海
*ST天娱
锦泓集团
阿拉丁
良信股份
*ST赫美
伟思医疗
睿智医药
若羽臣
蓝盾光电
中铁装配
ST安泰
每日互动
科达制造
华铁应急
金宏气体
麦克奥迪
帝科股份
汉嘉设计
*ST东电
永新光学
天融信
奥来德
阿尔特
我爱我家
*ST江泉
*ST湘电
飞亚达
五方光电
鸿合科技
*ST同洲
*ST安通
保力新
国华网安
海星股份
智莱科技
ST宇顺
ST沪科
中微公司
*ST宜生
龙腾光电
*ST华塑
天智航
和远气体
ST通葡
ST厦华
中天火箭
ST地矿
*ST鼎龙
中船应急
祥鑫科技
ST中捷
ST中安
迈得医疗
金科环境
奥普家居
冠盛股份
昊海生科
微芯生物
城建发展
德恩精工
天准科技
当虹科技
中国一重
石头科技
天山铝业
侨银环保
凯撒旅业
凯迪股份
福莱特
*ST西发
*ST力帆
思瑞浦
山大地纬
欣锐科技
海目星
孚能科技
力合科技
长阳科技
科思股份
光正眼科
中国广核
光峰科技
ST摩登
安克创新
爱朋医疗
ST安凯
运达股份
*ST华仪
广大特材
大洋生物
*ST胜利
绿的谐波
迈瑞医疗
安恒信息
晨光新材
长城科技
朝阳科技
太空智造
金春股份
*ST金洲
渤海租赁
交大思诺
吉贝尔
华丰股份
百邦科技
南京证券
ST中基
昂立教育
亿华通
三泰控股
仙乐健康
雷赛智能
电声股份
科威尔
*ST麦趣
*ST海华
ST巴士
广电计量
福然德
*ST中天
中泰证券
华夏航空
大智慧
红塔证券
*ST中昌
威胜信息
晶丰明源
奥福环保
国联股份
国网英大
沪光股份
日久光电
ST昌鱼
ST瑞德
福蓉科技
映翰通
汉宇集团
康辰药业
首都在线
三盛教育
惠程科技
先惠技术
龙磁科技
科德教育
捷佳伟创
雪龙集团
天合光能
卓胜微
*ST林重
ST柳化
郑州银行
立昂微
*ST聚力
宝丽迪
贵州轮胎
华神科技
ST华鼎
姚记科技
固德威
*ST盐湖
亚联发展
*ST天润
*ST东科
山东玻纤
*ST中新
博汇股份
ST游久
嘉元科技
恒银科技
谱尼测试
派克新材
*ST经开
ST宏盛
铁岭新城
*ST环球
万德斯
筑博设计
申联生物
中天精装
ST德豪
天元股份
*ST时万
万泰生物
国瑞科技
岭南股份
淮河能源
晶澳科技
新产业
锦浪科技
*ST华映
*ST友谊
特宝生物
中信出版
华东数控
长飞光纤
药明康德
晶晨股份
优彩资源
旭光电子
豪悦护理
天宜上佳
路德环境
中达安
利通电子
迈为股份
ST圣莱
锦和商业
中国外运
捷强装备
冰山冷热
锐科激光
地铁设计
新媒股份
数知科技
上能电气
克劳斯
迪普科技
金博股份
祥生医疗
卡倍亿
卓越新能
五洲特纸
迪威尔
ST金刚
国林科技
仲景食品
柏楚电子
*ST新光
长沙银行
威派格
天正电气
ST凯瑞
*ST飞乐
嘉必优
宏柏新材
豪美新材
当代文体
张 裕A
大胜达
百奥泰
指南针
奥美医疗
澜起科技
ST云投
七一二
C海融
亚普股份
越博动力
华民股份
宸展光电
ST抚钢
中迪投资
飞龙股份
*ST升达
美瑞新材
仕佳光子
*ST大洲
中铝国际
通达电气
*ST海陆
佰奥智能
*ST金鸿
春光科技
南新制药
ST电能
淮北矿业
*ST金钰
爱博医疗
南大环境
海越能源
日月明
浙海德曼
东鹏控股
松霖科技
宇新股份
中电兴发
金力永磁
开普检测
ST罗普
*ST欧浦
国网信通
三友医疗
三角防务
C亿田
芯朋微
西麦食品
稳健医疗
中岩大地
*ST海创
宇信科技
容百科技
杰普特
锦鸡股份
小熊电器
八亿时空
华辰装备
振德医疗
中芯国际
国联证券
寒武纪
*ST刚泰
*ST拉夏
佰仁医疗
*ST美讯
宝丰能源
艾可蓝
锦盛新材
*ST皇台
博瑞医药
恒实科技
ST中葡
招商港口
*ST秦机
德力股份
鲁商发展
铁科轨道
天地数码
泰永长征
万华化学
恒力石化
*ST东洋
科翔股份
德方纳米
高测股份
芯原股份
敏芯股份
铜牛信息
帝欧家居
中科星图
ST禾盛
紫光国微
*ST中华A
明新旭腾
大宏立
松炀资源
鹏鼎控股
*ST成城
金山办公
仁东控股
乐鑫科技
领益智造
招商南油
拉卡拉
盛德鑫泰
三达膜
长鸿高科
交建股份
回盛生物
苏盐井神
*ST大港
福能东方
*ST安信
燕麦科技
柯力传感
*ST德奥
新智认知
ST猛狮
吉峰科技
华致酒行
巴比食品
深信服
ST椰岛
金石亚药
日海智能
ST天成
宏力达
中新集团
*ST雪莱
金富科技
*ST华讯
捷昌驱动
ST狮头
ST天龙
厦门象屿
八方股份
爱丽家居
均瑶健康
大为股份
泰和科技
麒盛科技
四会富仕
招商积余
瑞松科技
苑东生物
大地熊
航天宏图
*ST融捷
玉禾田
立华股份
龙利得
居然之家
天下秀
芯源微
致远互联
大东海A
贝斯美
君实生物
圣湘生物
渝农商行
安宁股份
珠海中富
华创阳安
ST金花
大东南
*ST永泰
ST威龙
日辰股份
四方科技
ST国重装
斯达半导
旗天科技
建龙微纳
洁特生物
心脉医疗
奇安信
ST坊展
英杰电气
复洁环保
*ST节能
豆神教育
锐新科技
泉阳泉
友发集团
健之佳
ST金泰
七彩化学
汇创达
北汽蓝谷
*ST银河
*ST天夏
*ST永林
和佳医疗
川能动力
派生科技
兴图新科
昂利康
新诺威
*ST富控
航天彩虹
攀钢钒钛
青岛中程
*ST交昂
*ST康得
开能健康
*ST联合
鲁 泰A
重药控股
直真科技
惠发食品
矩子科技
泛亚微透
图南股份
海能实业
*ST中珠
翔丰华
*ST群兴
瑞晟智能
*ST科陆
力鼎光电
中国中免
国光连锁
珈伟新能
海容冷链
ST人乐
中信特钢
法狮龙
澳弘电子
天臣医疗
奥赛康
慧辰资讯
北摩高科
华阳国际
ST仁智
ST索菱
景津环保
科安达
东方环宇
新洁能
恒铭达
中科海讯
瑞达期货
晶科科技
ST乐凯
海航科技
建霖家居
中胤时尚
亚世光电
国安达
国盛智科
爱克股份
中山金马
*ST博信
芒果超媒
长城军工
上纬新材
唐源电气
西部超导
苏宁易购
地素时尚
ST天圣
金雷股份
丹化科技
前沿生物
华润微
万顺新材
辽宁能源
中信建投
巨星农牧
ST中孚
万通发展
*ST科林
中国卫通
TCL科技
隆利科技
ST舍得
万胜智能
启迪环境
圣元环保
*ST雅博
赛摩智能
金冠股份
ST创兴
有友食品
安徽建工
耐普矿机
双飞股份
浩洋股份
北元集团
卧龙电驱
彤程新材
力合微
中密控股
*ST瀚叶
宏川智慧
奕瑞科技
迦南智能
华图山鼎
海象新材
文灿股份
*ST夏利
声迅股份
东来技术
ST庞大
江苏新能
安集科技
工业富联
联瑞新材
ST天雁
国新文化
ST长投
秦川物联
*ST胜尔
蒙泰高新
华兴源创
*ST贵人
松井股份
渤海汽车
中谷物流
柳 工
蓝特光学
新城市
金达莱
卓易信息
伯特利
浙矿股份
苏州银行
泽达易盛
五洋停车
航锦科技
*ST北能
尚纬股份
*ST商城
*ST银亿
长江健康
金现代
雪天盐业
贵州三力
科沃斯
松原股份
康平科技
湘财股份
天禾股份
锐明技术
瑞鹄模具
*ST北讯
顺钠股份
绿色动力
昇辉科技
德马科技
熊猫乳品
ST八菱
金龙鱼
中公教育
越剑智能
嘉美包装
中金公司
ST东网
赛轮轮胎
伟时电子
*ST晨鑫
紫天科技
中创环保
汇得科技
保利联合
财富趋势
博杰股份
盈康生命
三峰环境
壶化股份
普门科技
有方科技
北鼎股份
*ST蓝丰
因赛集团
佳云科技
豪森股份
国盾量子
兴瑞科技
泽璟制药
科前生物
天地在线
恒久科技
密尔克卫
中国人保
左江科技
良品铺子
三只松鼠
彩讯股份
新疆交建
华盛昌
*ST当代
天迈科技
昊华科技
京源环保
同庆楼
永兴材料
威尔药业
龙软科技
瑞玛工业
蠡湖股份
ST天首
*ST荣华
郑中设计
恩捷股份
华光环能
ST毅昌
德林海
芯海科技
大悦城
宝兰德
*ST信通
鸿远电子
亿嘉和
ST百花
震安科技
博汇科技
天奈科技
豪尔赛
江航装备
久日新材
亚钾国际
*ST中商
欧陆通
狄耐克
粤桂股份
长虹美菱
苏美达
南亚新材
芯能科技
顺博合金
光云科技
*ST九有
锋尚文化
中嘉博创
康希诺
康龙化成
*ST高升
顶固集创
ST运盛
济南高新
葫芦娃
新天绿能
天普股份
经纬辉开
沃格光电
特 力A
宝明科技
ST毅达
森霸传感
青岛港
维信诺
西安银行
科博达
永冠新材
博睿数据
鸿泉物联
*ST天马
青鸟消防
新化股份
赛特新材
三人行
正帆科技
佳发教育
神州细胞
深南股份
蓝黛科技
ST宜化
紫金银行
*ST长城
*ST康盛
奥园美谷
润建股份
欣贺股份
金田铜业
江苏北人
准油股份
甘李药业
埃夫特
优刻得
凌志软件
利扬芯片
荣联科技
威奥股份
佳电股份
康泰医学
德利股份
*ST飞马
华文食品
*ST利源
中粮科技
*ST恒康
长华股份
金时科技
大叶股份
中国海防
交控科技
华林证券
派瑞股份
美迪西
艾力斯
格林达
博深股份
热景生物
创源股份
神驰机电
新亚强
ST云网
佳华科技
*ST辉丰
ST生物
海南发展
惠云钛业
山东墨龙
维康药业
*ST江特
东珠生态
海信家电
博通集成
方邦股份
*ST海源
ST远程
美吉姆
丽江股份
国城矿业
海油发展
天阳科技
震有科技
新兴装备
朗进科技
万 科A
赛科希德
酷特智能
ST罗顿
华熙生物
建科机械
ST尤夫
万里股份
*ST斯太
惠城环保
重庆钢铁
雅运股份
华翔股份
安必平
兰剑智能
京北方
华菱钢铁
*ST敦种
华业香料
大有能源
京基智农
*ST目药
康华生物
海昌新材
中航西飞
南华期货
金海高科
福昕软件
维科技术
九洲集团
*ST实达
艾迪药业
华峰测控
上海沿浦
*ST亚振
世华科技
山科智能
值得买
华达新材
洪通燃气
道通科技
拱东医疗
泉峰汽车
测绘股份
丸美股份
中信博
天利科技
西域旅游
锋龙股份
ST科迪
科思科技
神农科技
铂科新材
捷安高科
共创草坪
胜蓝股份
紫晶存储
中船汉光
瑞丰新材
瑞芯微
日丰股份
丽人丽妆
壹网壹创
赛微电子
深粮控股
移远通信
聚辰股份
杰美特
延安必康
华培动力
起帆电缆
和顺石油
ST南风
*ST金贵
ST昌九
沃尔德
盟升电子
创业慧康
中光学
厦门银行
仙鹤股份
东方生物
融捷健康
步科股份
安博通
奥锐特
芯瑞达
邮储银行
中控技术
爱婴室
赛伍技术
三六零
华光新材
协鑫能科
*ST乐通
泰坦科技
志邦家居
山石网科
建业股份
ST步森
览海医疗
炼石航空
神马电力
盛达资源
联赢激光
ST亚邦
*ST辅仁
中创物流
ST亚星
柳药股份
福达合金
ST双环
垒知集团
省广集团
新金路
盛视科技
泸天化
虹软科技
*ST围海
一汽解放
ST岩石
铂力特
郑州煤电
紫光学大
金丹科技
科远智慧
ST康美
*ST盈方
斯迪克
ST海马
中银证券
中简科技
华宝股份
*ST勤上
聆达股份
会通股份
甘化科工
键凯科技
盛新锂能
天奥电子
元利科技
华闻集团
海尔智家
万林物流
华设集团
*ST兆新
莱伯泰科
神工股份
*ST六化
东亚药业
ST加加
泰林生物
协创数据
宇瞳光学
海鸥住工
南微医学
C朗特
赛诺医疗
澳洋健康
*ST长动
宁水集团
成都先导
云涌科技
城发环境
天津普林
海尔生物
复旦张江
皖仪科技
山西路桥
富祥药业
开普云
恒誉环保
隆华科技
聚杰微纤
浙商银行
新赛股份
佛燃能源
清溢光电
明阳智能
新大正
新光光电
富通鑫茂
天邑股份
三生国健
新农股份
海峡创新
新致软件
兆威机电
海融科技
确成股份
C兆龙
联泓新科
朗特智能
C凯龙
博迁新材
C润阳
同兴环保
西上海
C研奥
塞力医疗
特发服务
*ST中孚
*ST鑫科
派能科技
舒华体育
明微电子
启迪药业
蔚蓝锂芯
明冠新材
国机精工
健麾信息
鼎通科技
三旺通信
晋控电力
悦康药业
晋控煤业
东贝集团
伟创电气
C天秦
开元教育
中伟股份
一鸣食品
思进智能
华旺科技
欧科亿
振邦智能
杭华股份
彩虹集团
南山智尚
山西焦煤
亿田智能
科兴制药
恒玄科技
中晶科技
立方制药
南凌科技
吉大正元
航亚科技
森林包装
福立旺
汉马科技
通源环境
兆龙互连
星徽股份
凯龙高科
西大门
侨银股份
华峰化学
研奥股份
C法本
奥普特
润阳科技
C火星人
远东股份
天秦装备
鹏都农牧
天原股份
================================================
FILE: legacy_v1/src/Leorio/tokenization.py
================================================
import __init__
from Kite.database import Database
from Kite import config
from Kite import utils
import jieba
import pkuseg
import logging
# Module-wide logging: timestamped lines with file name, line number and level.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')
class Tokenization(object):
    """Chinese word segmentation helper backed by jieba or pkuseg.

    Optionally loads a user dictionary (finance terms / stock names) and a
    Chinese stop-word list, and offers helpers to map articles to the stock
    codes they mention and to backfill that mapping into news collections.
    """

    def __init__(self, import_module="jieba", user_dict=None, chn_stop_words_dir=None):
        """
        :param import_module: segmentation backend, "jieba" or "pkuseg"
        :param user_dict: path of the user dictionary file; when given it is
            first refreshed with stock names pulled from the database
        :param chn_stop_words_dir: path of the Chinese stop-word file
        """
        self.database = Database()
        self.import_module = import_module
        self.user_dict = user_dict
        # jieba.load_userdict is expensive; load it at most once per
        # instance instead of on every cut_words() call.
        self._jieba_dict_loaded = False
        if self.user_dict:
            self.update_user_dict(self.user_dict)
        if chn_stop_words_dir:
            self.stop_words_list = utils.get_chn_stop_words(chn_stop_words_dir)
        else:
            self.stop_words_list = list()

    def update_user_dict(self, old_user_dict_dir, new_user_dict_dir=None):
        """Merge stock names from the database into the user dictionary.

        Stock names (and other new finance words) found in the
        stock-basic-info collection but missing from the dictionary file are
        appended. When ``new_user_dict_dir`` is None the old file is
        overwritten in place.
        """
        with open(old_user_dict_dir, "r", encoding="utf-8") as file:
            word_list = [row.split("\n")[0] for row in file]
        seen = set(word_list)  # O(1) membership instead of scanning the list
        name_code_df = self.database.get_data(config.STOCK_DATABASE_NAME,
                                              config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                              keys=["name", "code"])
        for word in set(name_code_df["name"].tolist()):
            if word not in seen:
                word_list.append(word)
                seen.add(word)
        target_dir = old_user_dict_dir if not new_user_dict_dir else new_user_dict_dir
        with open(target_dir, "w", encoding="utf-8") as file:
            for word in word_list:
                file.write(word + "\n")

    def cut_words(self, text):
        """Segment ``text`` and filter the tokens.

        Keeps tokens that are not stop words, not whitespace, contain at
        least one Chinese character, and are longer than one character.
        Returns a list of tokens; an empty (falsy) list when nothing
        survives or the configured backend is unknown (the previous version
        returned ``False`` here — both values are falsy for callers).
        """
        tokens = None
        if self.import_module == "jieba":
            if self.user_dict and not self._jieba_dict_loaded:
                jieba.load_userdict(self.user_dict)
                self._jieba_dict_loaded = True
            tokens = list(jieba.cut(text))
        elif self.import_module == "pkuseg":
            # pkuseg takes the custom dictionary at construction time.
            seg = pkuseg.pkuseg(user_dict=self.user_dict)
            tokens = seg.cut(text)
        outstr = list()
        if tokens:
            for word in tokens:
                if word not in self.stop_words_list \
                        and word != "\t" \
                        and word != " " \
                        and utils.is_contain_chn(word) \
                        and len(word) > 1:
                    outstr.append(word)
        return outstr

    def find_relevant_stock_codes_in_article(self, article, stock_name_code_dict):
        """Return the deduplicated stock codes whose names occur in article.

        :param article: raw article text
        :param stock_name_code_dict: mapping of stock name -> stock code
        """
        stock_codes = list()
        cut_words_list = self.cut_words(article)
        if cut_words_list:
            for word in cut_words_list:
                code = stock_name_code_dict.get(word)
                if code is not None:
                    stock_codes.append(code)
        return list(set(stock_codes))

    def update_news_database_rows(self,
                                  database_name,
                                  collection_name,
                                  incremental_column_name="RelatedStockCodes"):
        """Backfill ``incremental_column_name`` on rows that lack it.

        Newly crawled rows do not carry the column while older rows already
        do, so each row is checked before its related stock codes are
        computed and written back as a space-joined string.
        """
        name_code_df = self.database.get_data(config.STOCK_DATABASE_NAME,
                                              config.COLLECTION_NAME_STOCK_BASIC_INFO,
                                              keys=["name", "code"])
        name_code_dict = dict(name_code_df.values)
        data = self.database.get_collection(database_name, collection_name).find()
        for row in data:
            if incremental_column_name not in row.keys():
                related_stock_codes_list = self.find_relevant_stock_codes_in_article(
                    row["Article"], name_code_dict)
                self.database.update_row(database_name,
                                         collection_name,
                                         {"_id": row["_id"]},
                                         {incremental_column_name: " ".join(related_stock_codes_list)}
                                         )
                logging.info("[{} -> {} -> {}] updated {} key value ... "
                             .format(database_name, collection_name, row["Date"], incremental_column_name))
            else:
                logging.info("[{} -> {} -> {}] has already existed {} key value ... "
                             .format(database_name, collection_name, row["Date"], incremental_column_name))
if __name__ == "__main__":
    # Manual smoke test: build a tokenizer with the finance user dictionary
    # and stop-word list (paths are relative to this directory).
    tokenization = Tokenization(import_module="jieba",
                                user_dict="financedict.txt",
                                chn_stop_words_dir="chnstopwords.txt")
    # documents_list = \
    # [
    # "中央、地方支持政策频出,煤炭行业站上了风口 券商研报浩如烟海,投资线索眼花缭乱,\
    # 第一财经推出《一财研选》产品,挖掘研报精华,每期梳理5条投资线索,便于您短时间内获\
    # 取有价值的信息。专业团队每周日至每周四晚8点准时“上新”,助您投资顺利!",
    # "郭文仓到重点工程项目督导检查 2月2日,公司党委书记、董事长、总经理郭文仓,公司董事,\
    # 股份公司副总经理、总工程师、郭毅民,股份公司副总经理张国富、柴高贵及相关单位负责人到\
    # 焦化厂煤场全封闭和干熄焦等重点工程项目建设工地督导检查施工进度和安全工作情况。"
    # ]
    # for text in documents_list:
    #     cut_words_list = tokenization.cut_words(text)
    #     print(cut_words_list)
    # tokenization.update_news_database_rows(config.DATABASE_NAME, "jrj")
================================================
FILE: legacy_v1/src/Leorio/topicmodelling.py
================================================
import __init__
import os
import time
from Kite import config
from Kite import utils
from Kite.database import Database
from Leorio.tokenization import Tokenization
from Hisoka.classifier import Classifier
from sklearn import preprocessing
from gensim import corpora
from gensim import models
from gensim.matutils import corpus2dense
import logging
# Module-wide logging: timestamped lines with file name, line number and level.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",
                    datefmt="%a, %d %b %Y %H:%M:%S")
class TopicModelling(object):
def __init__(self):
    # Tokenizer configured with the project's finance user dictionary and
    # Chinese stop-word list (paths come from Kite.config).
    self.tokenization = Tokenization(import_module="jieba",
                                     user_dict=config.USER_DEFINED_DICT_PATH,
                                     chn_stop_words_dir=config.CHN_STOP_WORDS_PATH)
    # Shared database access layer (Kite.database.Database).
    self.database = Database()
    # Downstream classifier from Hisoka — presumably consumes the
    # topic-model vectors produced here; confirm against callers.
    self.classifier = Classifier()
def create_dictionary(self,
                      raw_documents_list,
                      save_path=None,
                      is_saved=False):
    """
    Associate every token with a unique id by building a vocabulary.

    Tokens that occur in only one document (hapaxes) are removed from both
    the dictionary and the tokenized documents; documents left empty by
    that removal are dropped.
    :param: raw_documents_list, raw corpus list, one text per element,
        e.g. ["洗尽铅华...", "风雨赶路人...", ...]
    :param: save_path, where to serialize the corpora.Dictionary object
    :param: is_saved, persist the dictionary to save_path when True
    :return: (corpora.Dictionary, cleaned documents_token_list)
    """
    # Guard with `or []`: cut_words may yield a falsy result for documents
    # where nothing survives filtering.
    documents_token_list = [self.tokenization.cut_words(doc) or []
                            for doc in raw_documents_list]
    _dict = corpora.Dictionary(documents_token_list)
    # Ids and surface forms of tokens occurring in exactly one document.
    once_ids = [tokenid for tokenid, docfreq in _dict.dfs.items() if docfreq == 1]
    # A set gives O(1) membership checks (previously an O(n) list scan
    # inside the per-document filter, i.e. accidentally quadratic).
    once_items = set(_dict[tokenid] for tokenid in once_ids)
    # Strip hapax tokens from every document ...
    for _id, token_list in enumerate(documents_token_list):
        documents_token_list[_id] = [token for token in token_list
                                     if token not in once_items]
    # ... and drop documents whose token list became empty as a result.
    documents_token_list = [token_list for token_list in documents_token_list
                            if len(token_list) != 0]
    # Remove hapax tokens from the dictionary, then close the id gaps the
    # removal left behind.
    _dict.filter_tokens(once_ids)
    _dict.compactify()
    if is_saved and save_path:
        _dict.save(save_path)
        logging.info("new generated dictionary saved in path -> {} ...".format(save_path))
    return _dict, documents_token_list
def renew_dictionary(self,
old_dict_path,
new_raw_documents_list,
new_dict_path=None,
is_saved=False):
documents_token_list = []
for doc in new_raw_documents_list:
documents_token_list.append(self.tokenization.cut_words(doc))
_dict = corpora.Dictionary.load(old_dict_path)
_dict.add_documents(documents_token_list)
if new_dict_path:
old_dict_path = new_dict_path
if is_saved:
_dict.save(old_dict_path)
logging.info("updated dictionary by another raw documents serialized in {} ... ".format(old_dict_path))
return _dict, documents_token_list
def create_bag_of_word_representation(self,
raw_documents_list,
old_dict_path=None,
new_dict_path=None,
bow_vector_save_path=None,
is_saved_dict=False):
if old_dict_path:
# 如果存在旧的语料词典,就在原先词典的基础上更新,增加未见过的词
corpora_dictionary, documents_token_list = self.renew_dictionary(old_dict_path,
raw_documents_list,
new_dict_path=new_dict_path)
else:
# 否则重新创建词典
start_time = time.time()
corpora_dictionary, documents_token_list = self.create_dictionary(raw_documents_list,
save_path=new_dict_path,
is_saved=is_saved_dict)
end_time = time.time()
logging.info("there are {} mins spent to create a new dictionary ... ".format((end_time-start_time)/60))
# 根据新词典对文档(或语料)生成对应的词袋向量
start_time = time.time()
bow_vector = [corpora_dictionary.doc2bow(doc_token) for doc_token in documents_token_list]
end_time = time.time()
logging.info("there are {} mins spent to calculate bow-vector ... ".format((end_time - start_time) / 60))
if bow_vector_save_path:
corpora.MmCorpus.serialize(bow_vector_save_path, bow_vector)
return documents_token_list, corpora_dictionary, bow_vector
@staticmethod
def transform_vectorized_corpus(corpora_dictionary,
bow_vector,
model_type="lda",
model_save_path=None):
# 如何没有保存任何模型,重新训练的情况下,可以选择该函数
model_vector = None
if model_type == "lsi":
# LSI(Latent Semantic Indexing)模型,将文本从词袋向量或者词频向量(更好),转为一个低维度的latent空间
# 对于现实语料,目标维度在200-500被认为是"黄金标准"
model_tfidf = models.TfidfModel(bow_vector)
# model_tfidf.save("model_tfidf.tfidf")
tfidf_vector = model_tfidf[bow_vector]
model = models.LsiModel(tfidf_vector,
id2word=corpora_dictionary,
num_topics=config.TOPIC_NUMBER) # 初始化模型
model_vector = model[tfidf_vector]
if model_save_path:
model.save(model_save_path)
elif model_type == "lda":
model = models.LdaModel(bow_vector,
id2word=corpora_dictionary,
num_topics=config.TOPIC_NUMBER) # 初始化模型
model_vector = model[bow_vector]
if model_save_path:
model.save(model_save_path)
elif model_type == "tfidf":
model = models.TfidfModel(bow_vector) # 初始化
# model = models.TfidfModel.load("model_tfidf.tfidf")
model_vector = model[bow_vector] # 将整个语料进行转换
if model_save_path:
model.save(model_save_path)
return model_vector
def classify_stock_news(self,
unseen_raw_document,
database_name,
collection_name,
label_name="60DaysLabel",
topic_model_type="lda",
classifier_model="svm",
ori_dict_path=None,
bowvec_save_path=None,
is_saved_bow_vector=False):
historical_raw_documents_list = []
Y = []
for row in self.database.get_collection(database_name, collection_name).find():
if label_name in row.keys():
if row[label_name] != "":
historical_raw_documents_list.append(row["Article"])
Y.append(row[label_name])
logging.info("fetch symbol '{}' historical news with label '{}' from [DB:'{}' - COL:'{}'] ... "
.format(collection_name, label_name, database_name, collection_name))
le = preprocessing.LabelEncoder()
Y = le.fit_transform(Y)
logging.info("encode historical label list by sklearn preprocessing for training ... ")
label_name_list = le.classes_ # ['中性' '利好' '利空'] -> [0, 1, 2]
# 根据历史新闻数据库创建词典,以及计算每个历史新闻的词袋向量;如果历史数据库创建的字典存在,则加载进内存
# 用未见过的新闻tokens去更新该词典
if not os.path.exists(ori_dict_path):
if not os.path.exists(bowvec_save_path):
_, _, historical_bow_vec = self.create_bag_of_word_representation(historical_raw_documents_list,
new_dict_path=ori_dict_path,
bow_vector_save_path=bowvec_save_path,
is_saved_dict=True)
logging.info("create dictionary of historical news, and serialized in path -> {} ... ".format(ori_dict_path))
logging.info("create bow-vector of historical news, and serialized in path -> {} ... ".format(bowvec_save_path))
else:
_, _, _ = self.create_bag_of_word_representation(historical_raw_documents_list,
new_dict_path=ori_dict_path,
is_saved_dict=True)
logging.info("create dictionary of historical news, and serialized in path -> {} ... ".format(ori_dict_path))
else:
if not os.path.exists(bowvec_save_path):
_, _, historical_bow_vec = self.create_bag_of_word_representation(historical_raw_documents_list,
new_dict_path=ori_dict_path,
bow_vector_save_path=bowvec_save_path,
is_saved_dict=True)
logging.info("historical news dictionary existed, which saved in path -> {}, but not the historical bow-vector"
" ... ".format(ori_dict_path))
else:
historical_bow_vec_mmcorpus = corpora.MmCorpus(bowvec_save_path) # type ->
historical_bow_vec = []
for _bow in historical_bow_vec_mmcorpus:
historical_bow_vec.append(_bow)
logging.info("both historical news dictionary and bow-vector existed, load historical bow-vector to memory ... ")
start_time = time.time()
updated_dictionary_with_old_and_unseen_news, unssen_documents_token_list = self.renew_dictionary(ori_dict_path,
[unseen_raw_document],
is_saved=True)
end_time = time.time()
logging.info("renew dictionary with unseen news tokens, and serialized in path -> {}, "
"which took {} mins ... ".format(ori_dict_path, (end_time-start_time)/60))
unseen_bow_vector = [updated_dictionary_with_old_and_unseen_news.doc2bow(doc_token) for doc_token in
unssen_documents_token_list]
updated_bow_vector_with_old_and_unseen_news = []
updated_bow_vector_with_old_and_unseen_news.extend(historical_bow_vec)
updated_bow_vector_with_old_and_unseen_news.extend(unseen_bow_vector)
# 原先updated_bow_vector_with_old_and_unseen_news是list类型,
# 但是经过下面序列化后重新加载进来的类型是gensim.corpora.mmcorpus.MmCorpus
if is_saved_bow_vector and bowvec_save_path:
corpora.MmCorpus.serialize(bowvec_save_path,
updated_bow_vector_with_old_and_unseen_news) # 保存更新后的bow向量,即包括新旧新闻的bow向量集
logging.info("combined bow vector(type -> 'list') generated by historical news with unseen bow "
"vector to create a new one ... ")
if topic_model_type == "lsi":
start_time = time.time()
updated_tfidf_model_vector = self.transform_vectorized_corpus(updated_dictionary_with_old_and_unseen_news,
updated_bow_vector_with_old_and_unseen_news,
model_type="tfidf") # type ->
end_time = time.time()
logging.info("regenerated TF-IDF model vector by updated dictionary and updated bow-vector, "
"which took {} mins ... ".format((end_time-start_time)/60))
start_time = time.time()
model = models.LsiModel(updated_tfidf_model_vector,
id2word=updated_dictionary_with_old_and_unseen_news,
num_topics=config.TOPIC_NUMBER) # 初始化模型
model_vector = model[updated_tfidf_model_vector] # type ->
end_time = time.time()
logging.info("regenerated LSI model vector space by updated TF-IDF model vector space, "
"which took {} mins ... ".format((end_time-start_time)/60))
elif topic_model_type == "lda":
start_time = time.time()
model_vector = self.transform_vectorized_corpus(updated_dictionary_with_old_and_unseen_news,
updated_bow_vector_with_old_and_unseen_news,
model_type="lda")
end_time = time.time()
logging.info("regenerated LDA model vector space by updated dictionary and bow-vector, "
"which took {} mins ... ".format((end_time-start_time)/60))
# 将gensim.interfaces.TransformedCorpus类型的lsi模型向量转为numpy矩阵
start_time = time.time()
latest_matrix = corpus2dense(model_vector,
num_terms=model_vector.obj.num_terms).T
end_time = time.time()
logging.info("transform {} model vector space to numpy.adarray, "
"which took {} mins ... ".format(topic_model_type.upper(), (end_time-start_time)/60))
# 利用历史数据的话题模型向量(或特征),进一步训练新闻分类器
start_time = time.time()
train_x, train_y, test_x, test_y = utils.generate_training_set(latest_matrix[:-1, :], Y)
clf = self.classifier.train(train_x, train_y, test_x, test_y, model_type=classifier_model)
end_time = time.time()
logging.info("finished training by sklearn {} using latest {} model vector space, which took {} mins ... "
.format(classifier_model.upper(), topic_model_type.upper(), (end_time-start_time)/60))
label_id = clf.predict(latest_matrix[-1, :].reshape(1, -1))[0]
return label_name_list[label_id]
if __name__ == "__main__":
    # Manual smoke test: classify a few unseen news articles for one stock
    # symbol against its historically labelled news.
    label_name = "3DaysLabel"
    database_name = "stocknews"
    # sh600004 has little data and is useful for quickly exercising the whole
    # code path; sz000001 has much more data (slower to process) and suits
    # later, fuller case studies.
    collection_name = "sz000001"
    # NOTE(review): classifier_save_path is defined but never used below.
    classifier_save_path = "{}_classifier.pkl".format(collection_name)
    ori_dict_path = "{}_docs_dict.dict".format(collection_name)
    bowvec_save_path = "{}_bowvec.mm".format(collection_name)
    # Classify (previously unseen) news articles.
    # The commented-out samples below are 白云机场 (600004.SH) announcements,
    # kept in the original Chinese as alternative test inputs for sh600004.
    # unseen_raw_documents_list = ["智通财经APP讯,白云机场(600004.SH)发布公告,公司2020年11月起降40278架次,\
    #                             同比下降2.47%;旅客吞吐量约501.4万人次,同比下降19.31%;货邮吞吐量约17.32万\
    #                             吨,同比下降1.27%。此外,公司2020年累计起降约33.2万架次,同比下降26.07%;旅\
    #                             客吞吐量约3890.14万人次,同比下降42.00%;货邮吞吐量约158.12万吨,同比下降9.14%。",
    #                             "格隆汇 9 月 1日丨白云机场(600004.SH)公布,公司收到中国证券监督管理委员会于2020\
    #                             年8月20日出具的《中国证监会行政许可项目审查一次反馈意见通知书》(202137号)。根据\
    #                             《反馈意见》的相关要求,白云机场控股股东广东省机场管理集团有限公司(“机场集团”)\
    #                             于2020年8月31日出具了《广东省机场管理集团有限公司关于不存在减持广州白云国际机场股\
    #                             份有限公司股票行为或减持计划的承诺函》,具体内容如下:鉴于机场集团拟以现金的方式参\
    #                             与认购本次白云机场非公开发行的A股股票。机场集团现作出如下承诺:1、自白云机场本次发\
    #                             行定价基准日(即2020年4月28日)前六个月至本承诺函出具之日,机场集团及机场集团控制的关\
    #                             联方未出售或以任何方式减持白云机场的任何股票。2、自本承诺函出具之日起至白云机场本次发\
    #                             行完成后六个月期间内,机场集团及机场集团控制的关联方将不会出售或以任何方式减持所持有的\
    #                             白云机场的任何股票,也不存在减持白云机场股票的计划。3、机场集团及机场集团控制的关联方\
    #                             不存在违反《中华人民共和国证券法》第四十四条的情形。如有违反,机场集团因减持股票所得收\
    #                             益将归白云机场所有。4、本承诺函自签署之日起对机场集团具有约束力,若机场集团或机场集团\
    #                             控制的关联方违反上述承诺发生减持情况,则减持所得全部收益归白云机场所有,机场集团依法\
    #                             承担由此产生的法律责任。",
    #                             "格隆汇11月27日丨白云机场(600004.SH)公布,为增强上市公司经营独立性、业务及资产完整性,\
    #                             提升公司盈利能力与运行保障能力,扩展白云机场物流业务发展空间,同时减少关联交易,确保上\
    #                             市公司利益最大化,公司拟实施如下交易:机场集团以所持有的航合公司100%的股权以及铂尔曼酒\
    #                             店、澳斯特酒店相应的经营性资产及负债与上市公司所持有的物流公司51%的股权进行资产置换,差\
    #                             额部分以现金补足。其中航合公司100%股权作价7.54亿元,铂尔曼酒店经营性资产及负债作价2.28\
    #                             亿元,澳斯特酒店经营性资产及负债作价3950.01万元,物流公司51%股权作价8.57亿元,上市公司\
    #                             需向机场集团以现金方式支付差额1.64亿元。本次交易完成后,公司将持有航合公司100%股权、铂\
    #                             尔曼酒店和澳斯特酒店经营性资产及负债、物流公司49%股权;机场集团将持有物流公司51%股权。\
    #                             本次交易除上述资产置换外,还包括:(1)上市公司与机场集团重新划分国内航空主业收入中旅客服\
    #                             务费(以下简称“旅客服务费”)的分成比例,由上市公司占85%、机场集团占15%,变更为上市公司\
    #                             占100%,机场集团不再享有旅客服务费分成,2018年15%旅客服务费对应金额为1.19亿元;及(2)上\
    #                             市公司将按物流公司年营业收入的4%向物流公司收取经营权使用费。2018年,模拟计算物流公司营\
    #                             业收入4%对应的经营权使用费为2536.07万元。本次资产置换交易完成后,上市公司2018年备考口径\
    #                             净利润、归母净利润、净资产、归母净资产和每股收益都将增厚约5%,2018年备考每股收益将从\
    #                             0.5457元每股增厚至0.5717元每股。为充分保障上市公司及中小股东利益,机场集团同意,自本次\
    #                             资产置换交割之日起五年内,上市公司享有一次回购物流公司股权的权利,即上市公司有权要求机\
    #                             场集团将本次交易取得的全部物流公司股权(对应同等金额的注册资本金额,包括在此基础上进行\
    #                             配股、转增、折股等所取得的股权)按届时评估值转让给上市公司。因此,上市公司在本次资产置\
    #                             换中拥有充分的主动权,可以选择重新取得物流公司的控制权。据悉,旅客服务费是公司主营航空\
    #                             性业务收入的重要组成部分,对业务完整性具有重要意义。旅客服务费全部由上市公司享有后,将\
    #                             较大幅度增加上市公司的收入、利润和现金流水平。受益于粤港澳大湾区规划及白云机场T2航站楼\
    #                             启用,旅客吞吐量逐年提升。未来随着白云机场的T3航站楼及新跑道的建设推进,旅客吞吐量还将\
    #                             进一步提升,15%旅客服务费对应收入将随之提升,并为公司贡献更多业绩增长空间。"]
    # Active test inputs: three 平安银行 (000001.SZ) announcements for sz000001.
    unseen_raw_documents_list = ["格隆汇6月23日丨平安银行(000001.SZ)公布,近日收到《中国银保监会关于平安银行变更注册资本\
                                 的批复》(银保监复〔2020〕342号),中国银行保险监督管理委员会同意本行将注册资本由人民币\
                                 17, 170, 411, 366元增加至19, 405, 918, 198元,并修改本行章程相应条款。",
                                 "平安银行(000001,股吧)(000001.SZ)公布,公司于2020年8月19日收到《中国银保监会关于平安理\
                                 财有限责任公司开业的批复》(银保监复〔2020〕513号),中国银行保险监督管理委员会(简称“中\
                                 国银保监会”)已批准公司全资子公司平安理财有限责任公司(简称“平安理财”)开业。根据中国银\
                                 保监会批复,平安理财注册资本为50亿元人民币,注册地为深圳市,主要从事发行公募理财产品、\
                                 发行私募理财产品、理财顾问和咨询等资产管理相关业务。 近年来,公司以打造“中国最卓越\
                                 、全球领先的智能化零售银行”为战略目标,坚持“科技引领、零售突破、对公做精”十二字策略\
                                 方针,强化“综合金融”、“科技赋能”两大核心优势,打造数字化银行、生态银行、平台银行三\
                                 张名片,推动发展迈向新台阶。在此基础上,稳步推进资产管理和理财业务转型,综合服务能力不\
                                 断提升,规模、质量、效益实现协调发展。设立平安理财是本行严格落实监管要求、促进理财业务\
                                 健康发展、推动理财业务回归本源的重要举措。平安理财将秉持“受人之托,代客理财”的服务宗\
                                 旨,深耕理财市场,为客户提供更优质的资管产品和财富管理服务,助力实体经济高质量发展。下\
                                 一步,公司将按照法律法规相关要求严格履行有关程序,推动平安理财尽快开业运营。",
                                 "格隆汇5月26日丨平安银行(000001.SZ)公布,经中国银行保险监督管理委员会和中国人民银行批准\
                                 ,公司于近日在全国银行间债券市场成功发行了总额为300亿元人民币的小型微型企业贷款专项金融\
                                 债券。该期债券发行总规模为人民币300亿元,为3年期固定利率债券,票面利率为2.30%,募集资金\
                                 将依据适用法律和监管部门的批准,专项用于发放小型微型企业贷款,其中部分将用于发放与新冠\
                                 肺炎疫情防控相关的小微企业贷款,加大对小型微型企业信贷支持力度,推动小型微型企业业务稳\
                                 健、健康发展。"]
    topicmodelling = TopicModelling()
    # Classify each unseen article and log the predicted label.
    for unseen_doc in unseen_raw_documents_list:
        chn_label = topicmodelling.classify_stock_news(unseen_doc,
                                                       database_name,
                                                       collection_name,
                                                       label_name=label_name,
                                                       topic_model_type="lsi",
                                                       classifier_model="rdforest",  # rdforest / svm
                                                       ori_dict_path=ori_dict_path,
                                                       bowvec_save_path=bowvec_save_path)
        logging.info("document '{}...' was classified with label '{}' for symbol {} ... ".format(unseen_doc[:20], chn_label, collection_name))
    # Benchmark log — train/test accuracy under different topic models and
    # preprocessing schemes (English translations of the original notes):
    # lsi Tue, 15 Dec 2020 14:54:08 classifier.py[line:54] INFO train_pred: 0.9829 test_pred: 0.703 (only stop words, tabs and spaces removed) 30DaysLabel
    # lsi Tue, 15 Dec 2020 17:00:58 classifier.py[line:54] INFO train_pred: 0.9852 test_pred: 0.7492 (also removed tokens without Chinese characters and single-character tokens) 30DaysLabel
    # lda Tue, 15 Dec 2020 17:29:56 classifier.py[line:54] INFO train_pred: 0.9498 test_pred: 0.7426 (also removed tokens without Chinese characters and single-character tokens) 30DaysLabel
    # lsi Wed, 16 Dec 2020 15:57:28 classifier.py[line:54] INFO train_pred: 0.9872 test_pred: 0.7478 (after revising create_dictionary) 30DaysLabel
    # lsi Wed, 16 Dec 2020 17:14:57 classifier.py[line:54] INFO train_pred: 0.9777 test_pred: 0.7247 (after revising create_dictionary) 3DaysLabel
    # lsi Wed, 16 Dec 2020 17:30:15 classifier.py[line:54] INFO train_pred: 0.9883 test_pred: 0.7123 (after revising create_dictionary) 60DaysLabel
================================================
FILE: legacy_v1/src/__init__.py
================================================
================================================
FILE: legacy_v1/src/history_spyder_startup.bat
================================================
REM Launch the historical-data crawlers that live in the Gon package.
cd ./Gon
REM Run the stock-price history crawler in this window (blocking call).
python ./history_starter_stock_price.py
REM "start" spawns each news-site crawler in its own console window, so the
REM three sites (cnstock, nbd, jrj) are crawled in parallel.
start python ./history_starter_cnstock.py
start python ./history_starter_nbd.py
start python ./history_starter_jrj.py
================================================
FILE: legacy_v1/src/main.py
================================================
"""Legacy v1 entry point: crawl historical data, clean it, and build the
per-stock labelled news database.

Pipeline:
  1. crawl historical stock prices and news (cnstock, jrj, nbd)
  2. deduplicate each news collection
  3. drop rows containing null values
  4. build one collection per stock, tagging each news item with a
     positive / negative / neutral label
  5. (next step, handled by the realtime startup scripts) start the
     realtime crawlers
"""
import time
import logging

from Kite import config
from Gon.jrjspyder import JrjSpyder
from Gon.nbdspyder import NbdSpyder
from Gon.cnstockspyder import CnStockSpyder
from Gon.stockinfospyder import StockInfoSpyder
from Killua.denull import DeNull
from Killua.deduplication import Deduplication
from Killua.buildstocknewsdb import GenStockNewsDB

# Bug fix: without a configured handler the root logger only emits WARNING
# and above, so the logging.info calls below were silently dropped.
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s",
                    datefmt="%a, %d %b %Y %H:%M:%S")

# 1. Crawl historical data.
stock_info_spyder = StockInfoSpyder(config.STOCK_DATABASE_NAME, config.COLLECTION_NAME_STOCK_BASIC_INFO)
stock_info_spyder.get_historical_news(start_date="2020-01-01")

cnstock_spyder = CnStockSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
for url_to_be_crawled, type_chn in config.WEBSITES_LIST_TO_BE_CRAWLED_CNSTOCK.items():
    logging.info("start crawling {} ...".format(url_to_be_crawled))
    cnstock_spyder.get_historical_news(url_to_be_crawled, category_chn=type_chn)
    logging.info("finished ...")
    # Pause between category pages to avoid hammering the site.
    time.sleep(30)

jrj_spyder = JrjSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)
jrj_spyder.get_historical_news(config.WEBSITES_LIST_TO_BE_CRAWLED_JRJ, start_date="2020-01-01")

nbd_spyder = NbdSpyder(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
nbd_spyder.get_historical_news(60)

# 2. Deduplicate the crawled historical news collections.
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
Deduplication(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()

# 3. Drop rows containing null values from the historical data.
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_NBD).run()
DeNull(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ).run()

# 4. Build a new database: for every stock, gather all news mentioning it and
#    tag each item with a positive ("利好") / negative ("利空") /
#    neutral ("中性") label.
gen_stock_news_db = GenStockNewsDB()
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_CNSTOCK)
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_NBD)
gen_stock_news_db.get_all_news_about_specific_stock(config.DATABASE_NAME, config.COLLECTION_NAME_JRJ)

# 5. Start realtime news crawling (see realtime_spyder_startup scripts).
================================================
FILE: legacy_v1/src/realtime_spyder_startup.bat
================================================
@echo off
REM Interactive launcher for the realtime crawler processes.
REM Every choice first starts the shared Redis-queue worker, then the chosen
REM spider(s); "start" runs each script in its own console window.
:again
REM NOTE(review): the ":again" label above is never targeted by a
REM "goto again" anywhere in this script, so the menu does not loop after an
REM invalid choice - confirm whether a trailing goto was intended.
cls
echo =========================== Please select programs below to run ===========================
echo 1 ./Gon/realtime_starter_cnstock.py
echo 2 ./Gon/realtime_starter_jrj.py
echo 3 ./Gon/realtime_starter_nbd.py
echo 4 ./Gon/realtime_starter_stock_price.py
echo 5 run all
echo.
echo Please input number 1-5:
set /p num=
if "%num%"=="1" (
    cd ./Gon
    start python ./realtime_starter_redis_queue.py
    start python ./realtime_starter_cnstock.py
)
if "%num%"=="2" (
    cd ./Gon
    start python ./realtime_starter_redis_queue.py
    start python ./realtime_starter_jrj.py
)
if "%num%"=="3" (
    cd ./Gon
    start python ./realtime_starter_redis_queue.py
    start python ./realtime_starter_nbd.py
)
if "%num%"=="4" (
    cd ./Gon
    start python ./realtime_starter_redis_queue.py
    start python ./realtime_starter_stock_price.py
)
REM Option 5 launches the queue worker and all four spiders at once.
if "%num%"=="5" (
    cd ./Gon
    start python ./realtime_starter_redis_queue.py
    start python ./realtime_starter_cnstock.py
    start python ./realtime_starter_nbd.py
    start python ./realtime_starter_jrj.py
    start python ./realtime_starter_stock_price.py
)
================================================
FILE: legacy_v1/src/realtime_spyder_stopall.bat
================================================
REM Stop all running realtime crawler tasks by launching the kill script
REM from the Gon package in a separate console window.
cd ./Gon
start python ./kill_realtime_spyder_tasks.py
================================================
FILE: reset_all_data.sh
================================================
#!/bin/bash
# 一键清空所有数据并重新开始爬取
set -e
echo "=========================================="
echo " FinnewsHunter 数据重置脚本"
echo "=========================================="
echo ""
echo "⚠️ 警告:此操作将删除所有新闻和任务数据!"
echo "⚠️ 此操作不可恢复!"
echo ""
read -p "确认要清空所有数据吗?(yes/no): " confirm
if [ "$confirm" != "yes" ]; then
echo "❌ 操作已取消"
exit 0
fi
echo ""
echo "开始清空数据..."
echo ""
# 1. 清空PostgreSQL数据
echo "[1/4] 清空PostgreSQL数据..."
docker exec finnews_postgres psql -U finnews -d finnews_db < str` method.
* Concrete Classes: Implements wrappers for various models like `DISCVFINLLMChatGLM26B`, `DISCVFINLLMBaichuan13BChat`, etc. These classes handle model-specific loading (including **LoRA** fine-tuning via `peft.PeftModel`), tokenization, and the actual generation call.
* **`evaluate.py`**: **Evaluation Logic and Prompt Engineering**.
* Multiple `*Evaluator` Classes (e.g., `FinFEEvaluator`, `FinQAEvaluator`): Each class is responsible for a specific financial task (e.g., sentiment analysis, QA).
* `__init__`: Loads the task-specific evaluation data and few-shot instruction samples.
* `build_zero_shot_prompt` / `build_few_shot_prompt`: Implements prompt engineering by constructing the input text based on predefined templates and few-shot examples.
* `evaluate`: Calculates the final metric (e.g., accuracy for sentiment, F1 for QA) by comparing model predictions (`preds`) with ground truth (`golds`).
* `run_evaluation`: The main evaluation loop, iterating over all data samples, generating responses using the injected `llm.generate()` method, and calculating both zero-shot and few-shot metrics.
* **`autoeval.py`**: **Evaluation Orchestration**.
* `model_lists` and `Eval_datasets`: Dictionaries mapping string names to the respective model and evaluator classes, implementing a **Factory Pattern**.
* `main` block: Parses command-line arguments for model name, LoRA path, and dataset. It instantiates the chosen `llm` and `evaluator` and calls `evaluator().run_evaluation(llm)`.
* **`preprocess.py`**: **Data Preparation**.
* `BBTFinCUGE` class: Manages the downloading and processing of the raw BBT-FinCUGE datasets.
* `download_all()`: Uses `requests` to fetch raw JSON data from a GitHub repository.
* `process_*` methods (e.g., `process_finfe`): Converts the raw dataset format into a standardized list of instances with `id`, `input`, `gold_answer`, and `source` fields.
* **`utils.py`**: **Utility Functions**.
* `write_json`, `load_json`: Standardized JSON file I/O.
* `_mixed_segmentation`, `_remove_punctuation`: Text cleaning and tokenization utilities, crucial for Chinese NLP tasks, using `nltk.word_tokenize`.
* `_find_lcs`, `_compute_f1_score`: Implements the Longest Common Subsequence (LCS) algorithm and F1 score calculation, which is the core metric for generative tasks like QA.
### Dependencies and Error/Performance
**Dependencies**: `transformers`, `peft`, `torch`, `argparse`, `tqdm`, `requests`, `inspect`, `random`, `nltk`.
**Performance**: The use of `torch.float16` and `device_map="auto"` in model loading across all modules is a key performance optimization for large models on GPU. The `tqdm` library is used in `evaluate.py` to provide progress bars, enhancing user experience during long evaluation runs.
**Error Handling**: Basic file existence checks are present in `preprocess.py` (`if not os.path.exists(file_path)`). The `evaluate.py` includes assertions (`assert len(golds) == len(preds)`) to ensure data integrity before metric calculation.
### Module PlantUML Diagrams
### Module 1: Root/Demo Module
```plantuml
@startuml
title Root/Demo Module (cli_demo.py & web_demo.py)
class AutoModelForCausalLM
class AutoTokenizer
class GenerationConfig
class torch
class streamlit as st
class colorama
package "Demo Scripts" {
class cli_demo {
+ init_model()
+ clear_screen()
+ main()
}
class web_demo {
+ @st.cache_resource init_model()
+ clear_chat_history()
+ init_chat_history()
+ main()
}
}
cli_demo ..> AutoModelForCausalLM : loads
cli_demo ..> AutoTokenizer : loads
cli_demo ..> GenerationConfig : loads
cli_demo ..> torch : uses
cli_demo ..> colorama : uses
web_demo ..> AutoModelForCausalLM : loads
web_demo ..> AutoTokenizer : loads
web_demo ..> GenerationConfig : loads
web_demo ..> torch : uses
web_demo ..> st : uses
AutoModelForCausalLM <.. cli_demo : model.chat()
AutoModelForCausalLM <.. web_demo : model.chat()
@enduml
```
### Module 2: Evaluation/Core Logic Module
```plantuml
@startuml
title Evaluation/Core Logic Module (eval/evaluator)
abstract class DISCFINLLMBase {
+ generate(prompt: str): str {abstract}
}
package "LLM Wrappers (finllm.py)" {
class DISCVFINLLMChatGLM26B
class DISCVFINLLMBaichuan13BChat
class FinGPTv3
DISCFINLLMBase <|-- DISCVFINLLMChatGLM26B
DISCFINLLMBase <|-- DISCVFINLLMBaichuan13BChat
DISCFINLLMBase <|-- FinGPTv3
}
package "Data Preprocessing (preprocess.py)" {
class BBTFinCUGE {
+ download_all()
+ process_finfe()
+ process_finqa()
.. other process methods ..
}
}
package "Evaluation Logic (evaluate.py)" {
class FinFEEvaluator {
+ build_zero_shot_prompt()
+ build_few_shot_prompt()
+ evaluate(golds, preds)
+ run_evaluation(llm)
}
class FinQAEvaluator
class FinCQAEvaluator
.. other Evaluators ..
FinFEEvaluator ..> BBTFinCUGE : loads instruct samples
FinFEEvaluator ..> DISCFINLLMBase : calls generate()
}
package "Utilities (utils.py)" {
class Utils {
+ write_json()
+ load_json()
+ _mixed_segmentation()
+ _find_lcs()
+ _compute_f1_score()
}
}
package "Orchestration (autoeval.py)" {
class AutoEval {
+ model_lists
+ Eval_datasets
+ main()
}
}
AutoEval --> DISCFINLLMBase : instantiates model
AutoEval --> FinFEEvaluator : instantiates evaluator
FinFEEvaluator ..> Utils : uses metrics/text processing
BBTFinCUGE ..> Utils : uses load/write_json
@enduml
```
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
The DISC-FinLLM project is structured around a **modular, multi-expert design philosophy** centered on a clear separation of concerns between the LLM interaction, task-specific evaluation, and application demonstration.
The **core abstraction** is the `DISCFINLLMBase` abstract class defined in `finllm.py`. This class establishes a standardized interface (`generate(prompt: str) -> str`) for all underlying Large Language Models (LLMs), effectively decoupling the evaluation and application logic from the specific model implementation (e.g., ChatGLM, Baichuan, Bloomz). This allows the system to be easily extended to support new base models or different fine-tuned versions without modifying the evaluation framework.
The **design philosophy** is a **"Model-as-a-Service"** approach within the evaluation context. The LLM is treated as a black-box component that accepts a prompt and returns a response. The complexity of model loading, LoRA weight merging, and device management is encapsulated within the concrete model wrapper classes (e.g., `DISCVFINLLMBaichuan13BChat`). This encapsulation promotes code reusability and maintainability. Furthermore, the project implicitly follows a **Multi-Expert System** design, where the four data files (`consulting_part.json`, `task_part.json`, etc.) suggest the model is fine-tuned for distinct financial sub-tasks, which is then validated by the corresponding task-specific evaluators in `evaluate.py`.
The **lifecycle management** of the application is straightforward:
1. **Data Preparation**: The `preprocess.py` script manages the initial lifecycle phase by downloading and transforming raw BBT-FinCUGE data into a standardized format for evaluation.
2. **Model Loading**: The model is loaded once at the start of the application, either via `init_model()` in the demo scripts or via the `autoeval.py` orchestrator. Crucially, the use of `torch.float16` and `device_map="auto"` ensures efficient, memory-optimized loading onto available hardware.
3. **Execution**:
* **Demo Lifecycle**: The demo scripts maintain a continuous loop, managing conversation history (`messages` list) and repeatedly calling the model's `chat` method for each user turn.
* **Evaluation Lifecycle**: The `autoeval.py` script orchestrates the evaluation, instantiating the chosen model and evaluator, running the full `run_evaluation` loop, and finally writing the metrics to a JSON file.
#### 3.1.2. Component Interactions
The project exhibits two primary interaction flows: the **Demonstration Flow** and the **Evaluation Flow**.
## 1. Demonstration Flow (e.g., `cli_demo.py`)
This flow is a direct, synchronous interaction between the user interface and the LLM.
1. **Initialization**: `cli_demo.py` calls `init_model()` to load the model and tokenizer.
2. **User Input**: The user provides a `prompt`.
3. **Request**: The script appends the user's prompt to the `messages` history.
4. **Generation**: The script calls the model's custom `model.chat(tokenizer, messages, stream=True)` method.
5. **Response**: The model generates a response, which is either printed as a stream (in `cli_demo.py`) or updated in a placeholder (in `web_demo.py`).
6. **History Update**: The model's response is appended to the `messages` history, maintaining the conversational context.
## 2. Evaluation Flow (`autoeval.py` Orchestration)
This flow is more complex, involving multiple components to systematically test the LLM.
1. **Orchestration**: `autoeval.py` instantiates a specific `DISCFINLLMBase` implementation (`llm`) and one or more `*Evaluator` instances.
2. **Data Access**: The `*Evaluator` (e.g., `FinFEEvaluator`) loads its task-specific evaluation data (`finfe-eval.jsonl`) and few-shot samples (`instruct_samples.json`) using helper functions from `utils.py`.
3. **Prompt Engineering**: Inside `*Evaluator.run_evaluation()`, for each data sample, the appropriate prompt construction method (`build_zero_shot_prompt` or `build_few_shot_prompt`) is called. This is where the task-specific instruction and context are formatted for the LLM.
4. **LLM Interaction**: The evaluator calls `llm.generate(input_text)` on the model wrapper. This is the critical communication point, abstracting the underlying model's API.
5. **Metric Calculation**: The evaluator collects the model's predictions (`preds`) and compares them to the ground truth (`golds`). It uses utility functions from `utils.py` (e.g., `_remove_punctuation`, `_find_lcs`) to clean text and calculate metrics like F1 score or accuracy.
6. **Result Reporting**: The final metrics are returned to `autoeval.py`, which then aggregates and writes the results to a JSON file using `utils.write_json`.
The communication pattern between the `*Evaluator` and the `DISCFINLLMBase` is a clear example of the **Strategy Pattern**, where the evaluation logic (context) uses the model wrapper (strategy) to perform the generation task.
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml
title DISC-FinLLM Overall Architecture
skinparam componentStyle rectangle
package "Application Layer" {
[cli_demo.py] as CLI
[web_demo.py] as WEB
}
package "Core Model Abstraction" {
abstract class DISCFINLLMBase
[Model Wrappers (finllm.py)] as WRAPPER
DISCFINLLMBase <|-- WRAPPER
}
package "Evaluation Framework" {
[autoeval.py] as ORCHESTRATOR
[evaluate.py] as EVAL_LOGIC
[preprocess.py] as PREPROCESS
[utils.py] as UTILS
[Task Evaluators (e.g., FinFEEvaluator)] as EVALUATOR
EVAL_LOGIC ..> EVALUATOR
}
package "External Dependencies" {
[Hugging Face Transformers] as HF
[PEFT (LoRA)] as PEFT
[BBT-FinCUGE Data] as DATA
}
CLI --> WRAPPER : loads & interacts
WEB --> WRAPPER : loads & interacts
ORCHESTRATOR --> WRAPPER : instantiates LLM
ORCHESTRATOR --> EVALUATOR : instantiates Task Logic
EVALUATOR --> WRAPPER : calls generate()
EVALUATOR --> UTILS : uses metrics/helpers
PREPROCESS --> DATA : downloads
PREPROCESS --> UTILS : uses I/O
WRAPPER --> HF : uses AutoModel/Tokenizer
WRAPPER --> PEFT : loads LoRA weights
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
The codebase, particularly the evaluation framework, leverages several fundamental design patterns to manage complexity and promote extensibility.
## 1. Factory Pattern (Simple Factory)
* **Description**: The Factory Pattern is used to create objects without exposing the instantiation logic to the client.
* **Implementation**: In `autoeval.py`, the dictionaries `model_lists` and `Eval_datasets` act as simple factories.
* **Code Example (`autoeval.py`):**
```python
# Factory for LLM models
model_lists = {
'chatglm-6b': DISCVFINLLMChatGLM6B,
'baichuan-13b-chat': DISCVFINLLMBaichuan13BChat,
# ...
}
# Factory for Evaluators
Eval_datasets = {
'finfe': FinFEEvaluator,
'finqa': FinQAEvaluator,
# ...
}
# Client code instantiates based on string key
llm = model_lists.get(model_name)(device, lora_path)
# ...
evaluator = Eval_datasets.get(eval_data)
```
## 2. Abstract Factory / Template Method Pattern
* **Description**: The Abstract Factory pattern provides an interface for creating families of related or dependent objects without specifying their concrete classes. The Template Method pattern defines the skeleton of an algorithm in the superclass but lets subclasses override specific steps.
* **Implementation**: The `DISCFINLLMBase` abstract class defines the common interface (`generate`), while each concrete model wrapper (e.g., `DISCVFINLLMBaichuan13BChat`) implements the specific steps for model loading, tokenization, and generation logic, which varies significantly between models (e.g., ChatGLM's `chat` method vs. Baichuan's prompt templating).
## 3. Strategy Pattern
* **Description**: The Strategy Pattern defines a family of algorithms, encapsulates each one, and makes them interchangeable.
* **Implementation**: The `*Evaluator` classes (the context) use the `DISCFINLLMBase` instance (`llm`, the strategy) to perform the text generation. The evaluation logic remains the same regardless of which concrete LLM implementation is used.
#### 3.3.2. Project Highlights
The DISC-FinLLM project demonstrates several key design strengths, primarily focused on rigorous evaluation and model flexibility.
* **Comprehensive Evaluation Framework**: The most significant highlight is the dedicated, multi-task evaluation framework. By integrating the BBT-FinCUGE benchmark and creating distinct `*Evaluator` classes for tasks like sentiment analysis (`FinFE`), question answering (`FinQA`), and relation extraction (`FinRE`), the project ensures a **systematic and reproducible assessment** of the LLM's performance across the financial domain.
* **Model Agnosticism via Abstraction**: The use of the `DISCFINLLMBase` abstract class provides excellent **extensibility**. New LLMs (e.g., Llama, Qwen) can be integrated simply by creating a new concrete wrapper class that implements the `generate` method, without altering the core evaluation or demonstration logic.
* **LoRA Fine-Tuning Support**: The model wrappers in `finllm.py` are designed to support **LoRA (Low-Rank Adaptation)** fine-tuning out-of-the-box via the `peft` library. This allows developers to load a base model and merge LoRA weights dynamically, which is crucial for efficient experimentation and deployment of specialized financial models.
* **Dual Interface for Demonstration**: Providing both a **Command-Line Interface (`cli_demo.py`)** and a **Web Interface (`web_demo.py`)** using Streamlit enhances the project's **accessibility and usability**. This dual approach caters to both developers who prefer a quick terminal check and end-users who need a more polished, graphical demonstration.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
While the project is well-structured, several areas could be improved to enhance performance, architectural robustness, and code quality.
## 1. Architectural Optimization: Model Loading
* **Suggestion**: Implement a **Singleton Pattern** or a dedicated **Model Manager** class for the LLM.
* **Reasoning**: Currently, the model loading logic is duplicated across the demo scripts and the evaluation wrappers, and the evaluation wrappers themselves contain repetitive model loading code. A Singleton pattern would ensure the large LLM is loaded only once per process, centralizing resource management and reducing memory overhead.
## 2. Code Quality: Refactoring `evaluate.py`
* **Suggestion**: Introduce a common `BaseEvaluator` class in `evaluate.py` to abstract common methods like `__init__`, `run_evaluation`, and prompt building logic.
* **Reasoning**: The current `evaluate.py` file is excessively long (nearly 1000 lines) due to the high degree of code duplication across the many `*Evaluator` classes. Abstracting the common structure (loading data, iterating samples, calling `llm.generate`, calculating metrics) would significantly reduce file size and improve maintainability.
## 3. Robustness and Error Handling
* **Suggestion**: Enhance error handling, particularly in `preprocess.py` and model loading.
* **Reasoning**: The `preprocess.py` download function only prints an error message on failure (`print('failed to download dataset {}, {}'.format(eval_dataset, e))`) but does not raise an exception or retry. In a production environment, network failures should be handled with retries or graceful failure. Similarly, model loading should include more robust exception handling for missing files or incompatible hardware.
## 4. Performance: Text Processing
* **Suggestion**: Replace the dependency on `nltk` for simple Chinese segmentation and punctuation removal in `utils.py` with a lighter, custom regex-based function or a more modern, dedicated Chinese NLP library like `jieba`.
* **Reasoning**: The current implementation relies on `nltk.word_tokenize`, which may not be optimized for Chinese text and introduces a heavy dependency for simple tasks. A more streamlined approach could improve the performance of the metric calculation step.
#### 3.4.2. Secondary Development Guide
This guide outlines the best path for developers looking to explore, modify, or extend the DISC-FinLLM project.
## 1. Code Exploration and Entry Points
* **Application Flow**: Start with `cli_demo.py` to understand how the model is loaded (`init_model`) and how the chat loop is managed. This is the simplest entry point for testing model responses.
* **Evaluation Flow**: The core logic is orchestrated by `autoeval.py`. Examine this file to see how models and evaluators are instantiated using the Factory Pattern.
* **Model Abstraction**: Study `eval/evaluator/finllm.py`. This file is crucial for understanding how different LLMs are wrapped and how LoRA weights are integrated.
## 2. Extending Model Support
To integrate a new LLM (e.g., Llama-3):
1. Create a new class in `finllm.py` (e.g., `DISCVFINLLMLlama3`) inheriting from `DISCFINLLMBase`.
2. Implement the `__init__` method to handle the specific model and tokenizer loading for Llama-3, including any necessary `trust_remote_code` or LoRA integration.
3. Implement the `generate(prompt: str)` method, ensuring it correctly formats the prompt and calls the model's generation function to return a clean string response.
4. Add the new class to the `model_lists` dictionary in `autoeval.py`.
## 3. Adding a New Evaluation Task
To add a new financial NLP task:
1. Create a new class in `evaluate.py` (e.g., `FinNewTaskEvaluator`) following the structure of existing evaluators.
2. Define the `zero_shot_prompts` and `few_shot_prompts` templates specific to the new task.
3. Implement the `evaluate(golds, preds)` static method to calculate the correct metric (e.g., F1, accuracy, exact match) for the task, leveraging helper functions in `utils.py`.
4. Add the new evaluator class to the `Eval_datasets` dictionary in `autoeval.py`.
## 4. Customizing Data and Metrics
* **Data**: The `preprocess.py` script is the place to modify how raw data is converted into the standardized `input`/`gold_answer` format.
* **Metrics**: The `utils.py` file contains the core logic for text cleaning (`_mixed_segmentation`) and metric calculation (`_compute_f1_score`). Modifications here will affect all generative evaluation tasks.
================================================
FILE: thirdparty/ElegantRL.md
================================================
# ElegantRL - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
The ElegantRL project is structured to separate the core reinforcement learning logic from examples, documentation, and utility components. The core logic resides primarily in the `elegantrl` directory, which is further divided into functional modules: `agents`, `envs`, and `train`.
```
/home/ubuntu/ElegantRL
|____.github/ # GitHub configuration files (e.g., FUNDING.yml)
|____docs/ # Documentation source files (using Sphinx/reStructuredText)
|____elegantrl/ # Core Reinforcement Learning Library
| |______init__.py # Package initialization
| |____agents/ # Implementations of various DRL agents (AgentBase, AgentPPO, AgentSAC, etc.)
| |____envs/ # Custom and wrapper environments (StockTradingEnv, CustomGymEnv, etc.)
| |____train/ # Core training components (config, evaluator, replay_buffer, run)
|____examples/ # Scripts demonstrating how to use the library with different algorithms and environments
|____figs/ # Figures and images used in documentation and README
|____helloworld/ # Simple, single-file examples for quick start and tutorials
|____requirements.txt # Python dependencies
|____rlsolver/ # A separate, specialized solver component, likely for combinatorial optimization (CO) problems
|____unit_tests/ # Test files for agents, environments, and training components
```
The primary focus is on the `elegantrl` directory, which contains the fundamental components of the DRL library. The separation into `agents`, `envs`, and `train` enforces a clear modular design, making the codebase maintainable and extensible. The top-level folders like `examples`, `helloworld`, and `unit_tests` serve to support the core library by providing usage demonstrations and ensuring code quality. The `rlsolver` folder suggests a specialized application of the DRL framework to optimization problems.
### 1.2. Core Folders for Analysis
- **elegantrl/agents**: Contains the base class `AgentBase` and concrete implementations for various Deep Reinforcement Learning (DRL) algorithms, including on-policy (PPO, A2C) and off-policy (SAC, TD3, DDPG, DQN) methods, as well as multi-agent extensions (MADDPG, MAPPO, QMix, VDN).
- **elegantrl/envs**: Houses custom and specialized environment implementations, such as `StockTradingEnv` for financial applications and wrappers for vectorized environments.
- **elegantrl/train**: Manages the training infrastructure, including configuration (`config.py`), the main execution logic (`run.py`), experience storage (`replay_buffer.py`), and performance monitoring (`evaluator.py`).
## Phase 2: Module-by-Module Deep Analysis
### 1. Module: `elegantrl/agents`
**Core Responsibility:** Implements the core logic for Deep Reinforcement Learning (DRL) agents, defining the interaction between the agent and the environment, and managing the policy and value networks.
**Key Files and Functions:**
- **`AgentBase.py`**: Defines the abstract base class `AgentBase` for all DRL agents. It handles initialization parameters (network dimensions, environment info, hyperparameters), device management (CPU/GPU), exploration logic (`explore_env`, `explore_action`), network update boilerplate (`update_net`, `optimizer_backward`, `soft_update`), and utility network classes (`ActorBase`, `CriticBase`, `build_mlp`).
- **`AgentPPO.py`**: Implements the **Proximal Policy Optimization (PPO)** algorithm, an on-policy method. It extends `AgentBase` and includes specific logic for Generalized Advantage Estimation (GAE), ratio clipping, and entropy regularization. It also contains `AgentA2C` as a simpler variant.
- **`AgentSAC.py`**: Implements the **Soft Actor-Critic (SAC)** algorithm, an off-policy, maximum entropy DRL method. It uses an ensemble of critics (`CriticEnsemble`) and includes logic for automatic temperature parameter (`alpha`) adjustment.
- **`AgentTD3.py`**: Implements the **Twin Delayed DDPG (TD3)** algorithm, an off-policy method that improves upon DDPG with clipped double Q-learning and delayed policy updates. It includes `AgentDDPG` as a simpler variant.
- **`AgentDQN.py`**: Implements **Deep Q-Network (DQN)** and its variants (Double DQN, Dueling DQN) for discrete action spaces.
- **`MAgent*.py`**: Contains multi-agent extensions like `MAgentMADDPG`, `MAgentMAPPO`, `MAgentQMix`, and `MAgentVDN`, which adapt single-agent algorithms for multi-agent systems.
**Core Implementation Details:**
- **Network Abstraction**: Agents rely on `ActorBase` and `CriticBase` (defined in `AgentBase.py`) which are essentially wrappers around PyTorch `nn.Module`s built using the `build_mlp` utility.
- **Exploration**: The `explore_env` method is central, handling the collection of trajectories from the environment, distinguishing between single-environment (`_explore_one_env`) and vectorized environment (`_explore_vec_env`) scenarios.
- **Update Logic**: The `update_net` method orchestrates the training. The core difference between on-policy (PPO) and off-policy (SAC, TD3) agents is evident here: PPO calculates advantages and reward sums from the collected batch, while off-policy agents sample from the `ReplayBuffer`.
### 2. Module: `elegantrl/envs`
**Core Responsibility:** Provides custom and specialized environment interfaces, particularly for financial and multi-agent tasks, and handles the creation of vectorized environments.
**Key Files and Functions:**
- **`CustomGymEnv.py`**: A template or wrapper for integrating custom environments that follow the OpenAI Gym/Gymnasium interface.
- **`StockTradingEnv.py`**: A specialized environment for financial reinforcement learning, a key feature of the AI4Finance foundation. It defines the state, action, and reward space for a stock trading problem.
- **`PlanIsaacGymEnv.py`**: Integration with NVIDIA's Isaac Gym for highly parallelized, high-performance simulation environments.
- **`PointChasingEnv.py`**: A simple multi-agent environment used for testing and demonstration of multi-agent algorithms.
**Core Implementation Details:**
- **Standard Interface**: All environments adhere to the standard `reset()` and `step()` methods, ensuring compatibility with the `AgentBase`'s exploration logic.
- **Vectorization**: The concept of a vectorized environment (`VecEnv` in `config.py`) is crucial, allowing multiple environment instances to run in parallel, which is essential for the "Massively Parallel" aspect of ElegantRL.
### 3. Module: `elegantrl/train`
**Core Responsibility:** Manages the overall training workflow, configuration, data storage, and performance evaluation.
**Key Files and Functions:**
- **`config.py`**: Defines the `Config` class, which holds all hyperparameters and environment metadata. It includes logic to automatically determine if an agent is on-policy or off-policy (`get_if_off_policy`) and contains the `VecEnv` and `SubEnv` classes for parallel environment execution using Python's `multiprocessing.Pipe` and `Process`.
- **`replay_buffer.py`**: Implements the `ReplayBuffer` class for off-policy algorithms. It supports both standard sampling and **Prioritized Experience Replay (PER)** using the `SumTree` data structure.
- **`run.py`**: Contains the main entry points for training (`train_agent`, `train_agent_single_process`, `train_agent_multiprocessing`). It defines the `Learner`, `Worker`, and `EvaluatorProc` classes for distributed training using Python's `multiprocessing`.
- **`evaluator.py`**: Implements the `Evaluator` class for logging, saving checkpoints, and calculating performance metrics (average return, steps, loss values). It supports both single and vectorized environment evaluation and includes utilities for plotting the learning curve.
**Core Implementation Details:**
- **Parallelism**: The multi-process architecture in `run.py` is the backbone of ElegantRL's "Massively Parallel" claim. `Worker` processes collect experience from environments, and the `Learner` process updates the agent's networks, communicating via `Pipe`s.
- **Data Flow**: In off-policy training, `Worker`s send collected experience to the `Learner`, which stores it in the `ReplayBuffer` and samples batches for network updates. In on-policy training, the collected experience is used directly for a few epochs of updates before being discarded.
### Module PlantUML Diagrams
### 1. `elegantrl/agents` Module Diagram (Simplified Core)
```puml
@startuml
skinparam classAttributeIconVisible false
abstract class AgentBase {
+ if_discrete: bool
+ if_off_policy: bool
+ net_dims: list
+ state_dim: int
+ action_dim: int
+ device: torch.device
+ act: ActorBase
+ cri: CriticBase
+ act_optimizer: Adam
+ cri_optimizer: Adam
+ explore_env(env, horizon_len)
+ explore_action(state)
+ update_net(buffer)
+ update_objectives(buffer, update_t)
+ soft_update(target_net, current_net, tau)
}
abstract class ActorBase extends nn.Module {
+ net: nn.Sequential
+ forward(state)
+ get_action(state)
}
abstract class CriticBase extends nn.Module {
+ net: nn.Sequential
+ forward(state, action)
+ get_q_values(state, action)
}
class AgentPPO extends AgentBase {
+ ratio_clip: float
+ lambda_gae_adv: float
+ get_advantages(states, rewards, undones, unmasks, values)
}
class AgentSAC extends AgentBase {
+ num_ensembles: int
+ alpha_log: Parameter
}
class AgentTD3 extends AgentBase {
+ update_freq: int
+ policy_noise_std: float
}
class ActorPPO extends ActorBase {
+ action_std_log: Parameter
+ state_norm(state)
+ get_logprob_entropy(state, action)
}
class CriticPPO extends CriticBase {
+ state_norm(state)
}
class CriticEnsemble extends CriticBase {
+ decoder_qs: list
+ get_q_values(state, action)
}
AgentBase <|-- AgentPPO
AgentBase <|-- AgentSAC
AgentBase <|-- AgentTD3
AgentBase <|-- AgentDDPG
AgentBase <|-- AgentDQN
ActorBase <|-- ActorPPO
CriticBase <|-- CriticPPO
CriticBase <|-- CriticEnsemble
AgentPPO *-- ActorPPO : uses
AgentPPO *-- CriticPPO : uses
AgentSAC *-- ActorSAC : uses
AgentSAC *-- CriticEnsemble : uses
AgentTD3 *-- Actor : uses
AgentTD3 *-- CriticTwin : uses
@enduml
```
### 2. `elegantrl/train` Module Diagram (Core Components)
```puml
@startuml
skinparam classAttributeIconVisible false
class Config {
+ num_envs: int
+ agent_class: class
+ env_class: class
+ gamma: float
+ learning_rate: float
+ batch_size: int
+ horizon_len: int
+ buffer_size: int
+ gpu_id: int
+ init_before_training()
+ get_if_off_policy()
}
class SumTree {
+ buf_len: int
+ tree: Tensor
+ update_ids(data_ids, prob)
+ important_sampling(batch_size, beg, end, per_beta)
}
class ReplayBuffer {
+ max_size: int
+ num_seqs: int
+ states: Tensor
+ actions: Tensor
+ if_use_per: bool
+ sum_trees: list[SumTree]
+ update(items)
+ sample(batch_size)
+ sample_for_per(batch_size)
}
class Evaluator {
+ cwd: str
+ total_step: int
+ max_r: float
+ recorder: list
+ evaluate_and_save(actor, steps, exp_r, logging_tuple)
+ save_training_curve_jpg()
}
class SubEnv extends Process {
+ sub_pipe0: Pipe
+ vec_pipe1: Pipe
+ run()
}
class VecEnv {
+ num_envs: int
+ sub_envs: list[SubEnv]
+ sub_pipe1s: list[Pipe]
+ vec_pipe0: Pipe
+ reset()
+ step(action)
}
class Worker extends Process {
+ worker_pipe: Pipe
+ learner_pipe: Pipe
+ run()
}
class Learner extends Process {
+ recv_pipe: Pipe
+ send_pipes: list[Pipe]
+ run()
}
Config *-- ReplayBuffer : configures
ReplayBuffer *-- SumTree : uses (for PER)
Config *-- VecEnv : creates
VecEnv *-- SubEnv : manages
Learner *-- ReplayBuffer : updates
Learner *-- Worker : communicates
Learner *-- EvaluatorProc : communicates
Worker *-- VecEnv : uses
@enduml
```
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
The ElegantRL architecture is built around a set of highly modular and decoupled abstractions, primarily focused on the Actor-Critic paradigm and parallel execution.
1. **Agent (`AgentBase`)**: The central abstraction for any DRL algorithm. It encapsulates the policy (`act`), value function (`cri`), optimization logic, and exploration strategy. Concrete implementations like `AgentPPO` and `AgentSAC` inherit from this base class, ensuring a consistent interface for the training loop.
2. **Network (`ActorBase`, `CriticBase`)**: These define the neural network structures for the policy and value functions, respectively. They are decoupled from the agent logic, allowing for flexible network designs (e.g., `CriticTwin` for TD3, `CriticEnsemble` for SAC).
3. **Configuration (`Config`)**: A single source of truth for all hyperparameters, environment details, and training settings. This abstraction simplifies experiment management and ensures consistency across the entire framework.
4. **Experience Storage (`ReplayBuffer`, `SumTree`)**: Manages the collection and sampling of experience. The inclusion of `SumTree` for Prioritized Experience Replay (PER) highlights the focus on sample efficiency.
5. **Parallelism Components (`Learner`, `Worker`, `VecEnv`)**: These are the core components enabling the "Massively Parallel" design. The `Learner` handles model updates, while `Worker`s handle environment interaction, and `VecEnv` manages multiple environment instances in parallel processes (`SubEnv`).
**Design Philosophy: Massively Parallel and Modular DRL**
ElegantRL's design philosophy is centered on two main pillars:
1. **Decoupled Parallelism**: The framework adopts a clear separation between the **data collection** (exploration) and the **model update** (learning) phases, a design common in high-throughput DRL systems. `Worker` processes run in parallel to collect massive amounts of experience, which is then asynchronously sent to the `Learner` process for efficient GPU-based training. This maximizes hardware utilization and significantly speeds up training.
2. **Modularity and Extensibility**: The codebase is highly modular, with clear boundaries between the `agents`, `envs`, and `train` components. This modularity makes it easy to implement new algorithms (by extending `AgentBase`), integrate new environments, or swap out core components like the `ReplayBuffer`.
**Lifecycle Management**
The training lifecycle is managed by the `run.py` module:
1. **Initialization**: The `Config` object is initialized, and the `Learner`, `Worker`s, and `EvaluatorProc` processes are instantiated.
2. **Exploration (Worker)**: Each `Worker` process continuously interacts with its assigned `VecEnv` instances, collecting trajectories.
3. **Learning (Learner)**: The `Learner` receives batches of experience from all `Worker`s. It stores them in the `ReplayBuffer`, samples a batch, calculates the loss, updates the networks, and soft-updates the target networks.
4. **Synchronization**: The `Learner` periodically sends the updated policy network parameters back to the `Worker`s.
5. **Evaluation (Evaluator)**: The `Evaluator` process runs evaluation episodes, logs performance metrics, and handles model checkpointing.
#### 3.1.2. Component Interactions
The inter-component communication is primarily handled by Python's `multiprocessing.Pipe` for inter-process communication (IPC), enabling the asynchronous and parallel nature of the framework.
| Component | Role | Communication Pattern | Data Flow |
| :--- | :--- | :--- | :--- |
| **Worker** | Experience Collector | Sends data to `Learner` via `Pipe`. Receives model parameters from `Learner` via `Pipe`. | Trajectories (states, actions, rewards, etc.) -> `Learner`. Latest `Actor` state dict -> `Worker`. |
| **Learner** | Model Updater | Receives data from `Worker`s. Sends model to `Worker`s and `Evaluator`. | Trajectories from `Worker`s -> `ReplayBuffer`. Sampled batches from `ReplayBuffer` -> `Agent` for update. |
| **VecEnv** | Parallel Environment Manager | Manages multiple `SubEnv` processes using `Pipe`s. | Actions from `Worker` -> `SubEnv`. New states, rewards, dones from `SubEnv` -> `Worker`. |
| **ReplayBuffer** | Experience Storage | Accessed exclusively by the `Learner` process. | Stores trajectories from `Worker`s. Provides sampled batches to `Learner`'s `Agent`. |
| **Evaluator** | Performance Monitor | Receives training statistics from `Learner` via `Pipe`. | Training metrics (step, avgR, losses) -> `Evaluator`. |
**Key Interaction Flow (Off-Policy Training):**
1. **Exploration**: `Worker` receives the latest `Actor` from `Learner`.
2. **Data Collection**: `Worker` calls `agent.explore_env(VecEnv)`, which executes `VecEnv.step()` across all `SubEnv`s in parallel, collecting a batch of trajectories.
3. **Data Transfer**: `Worker` sends the collected trajectories (e.g., 2048 steps * 8 environments) to the `Learner` via a `Pipe`.
4. **Storage**: `Learner` receives the data and calls `ReplayBuffer.update()`.
5. **Learning**: `Learner` repeatedly calls `ReplayBuffer.sample()` and passes the batch to `agent.update_net()`.
6. **Synchronization**: After a set number of learning steps, `Learner` sends the updated `Actor` weights back to the `Worker`s.
7. **Monitoring**: Periodically, `Learner` sends performance metrics to the `Evaluator` for logging and checkpointing.
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml
skinparam defaultFontName Courier
skinparam classAttributeIconVisible false
skinparam packageStyle rectangle
title ElegantRL Overall Architecture
package "elegantrl.train" {
class Config
class ReplayBuffer
class Evaluator
class Learner extends Process
class Worker extends Process
class VecEnv
class SubEnv extends Process
}
package "elegantrl.agents" {
abstract class AgentBase
abstract class ActorBase
abstract class CriticBase
}
package "elegantrl.envs" {
class Environment
}
' Relationships
' 1. Configuration and Initialization
Config .> AgentBase : configures
Config .> ReplayBuffer : configures
Config .> VecEnv : configures
' 2. Agent Core
AgentBase <|-- AgentPPO
AgentBase <|-- AgentSAC
AgentBase <|-- AgentTD3
AgentBase *-- ActorBase : uses
AgentBase *-- CriticBase : uses
' 3. Training Loop Components
Learner *-- AgentBase : updates
Learner *-- ReplayBuffer : manages
Learner .> Evaluator : sends stats (Pipe)
Worker .> AgentBase : uses for exploration
Worker *-- VecEnv : collects data
' 4. Inter-Process Communication (IPC)
Worker .> Learner : sends data (Pipe)
Learner .> Worker : sends model (Pipe)
' 5. Environment Interaction
VecEnv *-- SubEnv : manages parallel instances
VecEnv .> Environment : wraps/uses
' 6. Data Flow
ReplayBuffer .> AgentBase : samples data
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
ElegantRL leverages several established software and reinforcement learning design patterns to achieve its modularity, stability, and performance goals.
1. **Actor-Critic Pattern (Reinforcement Learning Pattern)**
* **Description**: Separates the policy (Actor) that selects actions from the value function (Critic) that estimates the expected return.
* **Implementation**:
* `AgentBase` is the abstract base for the entire pattern.
* `ActorBase` and `CriticBase` define the network interfaces.
* **Example (AgentPPO.py)**: The `AgentPPO` class explicitly instantiates `self.act = ActorPPO(...)` and `self.cri = CriticPPO(...)`, and the `update_objectives` method uses both to calculate the actor and critic losses.
2. **Target Network Pattern (Reinforcement Learning Pattern)**
* **Description**: Used in off-policy algorithms (DDPG, TD3, SAC) to stabilize training by using a separate, delayed-update copy of the Q-network.
* **Implementation**:
* The `AgentBase` constructor initializes `self.act_target` and `self.cri_target`.
* The static method `AgentBase.soft_update(target_net, current_net, tau)` implements the exponential moving average (EMA) update rule.
* **Example (AgentTD3.py)**: The `update_objectives` method calculates the target Q-value using `next_q = self.cri_target.get_q_values(next_state, next_action).min(dim=1)[0]`.
3. **Factory Method Pattern (Software Design Pattern)**
* **Description**: Defines an interface for creating an object, but lets subclasses alter the type of objects that will be created.
* **Implementation**:
* The `Config` object stores `self.agent_class` and `self.env_class`.
* The `run.py` module uses these classes to instantiate the actual objects: `agent = args.agent_class(...)` and `env = build_env(args.env_class, ...)`.
4. **Strategy Pattern (Software Design Pattern)**
* **Description**: Defines a family of algorithms, encapsulates each one, and makes them interchangeable.
* **Implementation**:
* The core training loop in `run.py` interacts only with the `AgentBase` interface (`agent.explore_env`, `agent.update_net`).
* The specific implementation is encapsulated within the concrete strategy classes (`AgentPPO`, `AgentSAC`), making them interchangeable.
5. **Observer Pattern (Software Design Pattern)**
* **Description**: Defines a one-to-many dependency between objects so that when one object changes state, all its dependents are notified and updated automatically.
* **Implementation**:
* The `Learner` acts as the Subject, generating updated model parameters.
* The `Worker`s and `Evaluator` act as Observers, receiving the updated model parameters (or performance data) via the IPC `Pipe`s.
#### 3.3.2. Project Highlights
ElegantRL's design includes several innovative features that contribute to its high performance and usability:
* **Massively Parallel Architecture (Cloud-Native DRL)**: The core highlight is the clear separation of concerns into `Learner` (GPU-heavy computation) and multiple `Worker`s (CPU-heavy environment interaction), communicating via IPC. This design is highly scalable and is explicitly optimized for cloud-native DRL applications, allowing for efficient utilization of multi-core CPUs and single/multi-GPU setups.
* **Vectorized Environment Support (`VecEnv`)**: The framework natively supports running multiple environment instances in parallel within a single `Worker` process, dramatically increasing the data throughput (samples per second) and reducing the wall-clock time required for training. This is a crucial feature for on-policy algorithms like PPO.
* **Prioritized Experience Replay (PER) with `SumTree`**: The implementation of PER in `replay_buffer.py` using a dedicated `SumTree` data structure is a highlight. It ensures that the most "surprising" or high-error transitions are sampled more frequently, leading to faster convergence and better sample efficiency for off-policy methods.
* **Unified Agent Interface (`AgentBase`)**: By abstracting the core DRL logic into `AgentBase`, the framework provides a clean, consistent API for all algorithms (PPO, SAC, TD3, DQN, etc.). This significantly lowers the barrier to entry for users wanting to compare or switch between different algorithms.
* **Financial Reinforcement Learning Focus**: The inclusion of specialized environments like `StockTradingEnv` and the project's association with the AI4Finance-Foundation indicate a strong focus on applying DRL to complex financial problems, which often require the stability and efficiency ElegantRL provides.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
Based on the code structure and design, the following areas could be considered for improvement:
1. **Standardize Environment Interface**: The `elegantrl/envs` module contains custom environment implementations. While functional, adopting the latest Gymnasium API standards more strictly, possibly through a dedicated wrapper layer, would improve compatibility with the broader RL ecosystem and future-proof the environment integrations.
2. **Configuration Management**: The `Config` class is a simple data container. For large-scale experiments, migrating to a more robust configuration management system (e.g., Hydra, Gin-config) would allow for easier tracking, overriding, and composition of hyperparameter sets, especially for the multi-GPU and multi-process setups.
3. **Network Abstraction for Complex Architectures**: The current network building utility (`build_mlp`) is limited to simple Multi-Layer Perceptrons. Expanding the network module to include more complex, pre-built architectures (e.g., ResNets, attention-based models) or a more flexible network composition API would simplify the implementation of state-of-the-art DRL agents that require specialized network structures.
4. **Asynchronous Communication Overhead**: The reliance on Python's `multiprocessing.Pipe` for IPC, while simple, can introduce serialization/deserialization overhead, especially when transferring large batches of data (tensors) between `Worker` and `Learner`. Investigating more efficient IPC mechanisms like shared memory (e.g., PyTorch's `multiprocessing.shared_memory` or Ray) could further reduce latency and increase the overall throughput.
5. **Type Hinting and Documentation**: While type hints are present, expanding their use, especially in the core `AgentBase` and `run.py` components, along with more comprehensive docstrings, would significantly improve code readability and maintainability for secondary developers.
#### 3.4.2. Secondary Development Guide
For developers looking to extend or build upon the ElegantRL framework, the following guide provides the best path for code exploration and secondary development:
1. **Implement a New Agent (Algorithm)**:
* **Start with `AgentBase.py`**: Create a new class (e.g., `AgentNewRL`) that inherits from `AgentBase`.
* **Define Networks**: Implement the specific Actor and Critic network architectures required by the new algorithm (e.g., `ActorNewRL`, `CriticNewRL`), inheriting from `ActorBase` and `CriticBase`.
* **Override `__init__`**: Initialize the new agent, setting algorithm-specific hyperparameters and instantiating the new networks.
* **Override `update_objectives`**: This is the most critical step. Implement the algorithm's core loss functions and optimization steps here.
2. **Integrate a New Environment**:
* **Follow Gym/Gymnasium Standard**: Ensure the new environment implements the standard `__init__`, `reset`, and `step` methods.
* **Use `elegantrl/envs` as a Template**: If the environment is complex, use `StockTradingEnv.py` as a template for structuring the state, action, and reward logic.
* **Vectorization**: Ensure the environment is compatible with the `VecEnv` wrapper defined in `config.py` for high throughput.
3. **Explore the Training Workflow**:
* **Configuration**: All experiments start with `config.py`. Understand how to set `agent_class`, `env_class`, and key hyperparameters.
* **Execution**: The `run.py` module is the entry point. Focus on the `train_agent_multiprocessing` function to understand how `Learner` and `Worker` processes are launched and communicate.
* **Data Flow**: Trace the data from `Worker.run()` (collection) through the `Pipe` to `Learner.run()` (storage and update) to fully grasp the parallel data pipeline.
4. **Debugging and Monitoring**:
* **Logging**: Use the `Evaluator` in `evaluator.py` to monitor training progress.
* **PyTorch Debugging**: Standard PyTorch debugging techniques can be applied directly within the `update_objectives` methods.
================================================
FILE: thirdparty/FinCast-fts.md
================================================
# FinCast-fts - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
**Project Name:** FinCast-fts
**Project Path:** /home/ubuntu/FinCast-fts
```
/home/ubuntu/FinCast-fts
|____.git/ (EXCLUDE: Git version control metadata)
|____.gitattributes (EXCLUDE: Git configuration)
|____.gitignore (EXCLUDE: Files to ignore for Git)
|____Inference/ (EXCLUDE: Jupyter notebook for model inference and demonstration)
| |____inference_future.ipynb
|____LICENSE (EXCLUDE: Project license)
|____README.md (EXCLUDE: Project documentation)
|____dep_install.sh (EXCLUDE: Script for dependency installation)
|____env_setup.sh (EXCLUDE: Script for environment setup)
|____experiments/ (EXCLUDE: Scripts for running long-horizon benchmarks and evaluations)
| |____long_horizon_benchmarks/
| | |____Freq_map_eval.py
| | |____run_eval_ffm.py
| | |____run_eval_ffm_dataset.py
| | |____run_eval_ffm_stock.py
|____notebooks/ (EXCLUDE: Jupyter notebook for result summary and visualization)
| |____result_summary.ipynb
|____paper.pdf (EXCLUDE: Associated research paper)
|____peft_Fincast/ (CORE: Implementation for Parameter-Efficient Fine-Tuning (PEFT) integration)
| |____peft_injector.py
|____pics/ (EXCLUDE: Example images)
| |____example1_APPL.png
| |____example2_ETHUSD.png
|____requirement_v2.txt (EXCLUDE: Project dependencies list)
|____scripts/ (EXCLUDE: Shell scripts for running PEFT and evaluation)
| |____Fincast_PEFT/
| | |____local_4090_t1.sh
| |____Fincast_eval/
| | |____eval_stock_loop.sh
| | |____eval_stock_loop_supervised42.sh
|____setup.py (EXCLUDE: Python package setup file)
|____src/ (CORE: Main source code directory)
| |______init__.py
| |____data_tools/ (CORE: Data loading, processing, and batch sampling utilities)
| | |____Inference_dataset.py
| | |____TSdataset.py
| | |____batch_sampler.py
| | |____batch_sampler_ddp.py
| |____ffm/ (CORE: Core Financial Foundation Model (FFM) implementation)
| | |______init__.py
| | |____data_loader.py
| | |____ffm_base.py
| | |____ffm_torch_moe.py
| | |____pytorch_patched_decoder_MOE.py
| | |____time_features.py
| | |____xreg_lib.py
| |____st_moe_pytorch/ (CORE: Implementation of the Spatio-Temporal Mixture of Experts (ST-MoE) layer)
| | |______init__.py
| | |____distributed.py
| | |____st_moe_pytorch.py
| |____tools/ (CORE: General utility functions, metrics, model utils, and visualization)
| | |______init__.py
| | |____inference_utils.py
| | |____metrics.py
| | |____model_utils.py
| | |____result_vis_plt.ipynb
| | |____utils.py
| |____unit_test/ (EXCLUDE: Contains a unit test script)
| | |____BS_DDP_tc4.py
```
The project is organized into five core logical modules under the root and `src/` directory: `peft_Fincast` for model adaptation, `src/data_tools` for data pipeline, `src/ffm` for the core model logic, `src/st_moe_pytorch` for the MoE implementation, and `src/tools` for utilities. The rest of the folders contain non-core elements like scripts, notebooks, and documentation.
### 1.2. Core Folders for Analysis
- `/home/ubuntu/FinCast-fts/peft_Fincast`: Implementation for Parameter-Efficient Fine-Tuning (PEFT) integration.
- `/home/ubuntu/FinCast-fts/src/data_tools`: Data loading, processing, and batch sampling utilities.
- `/home/ubuntu/FinCast-fts/src/ffm`: Core Financial Foundation Model (FFM) implementation.
- `/home/ubuntu/FinCast-fts/src/st_moe_pytorch`: Spatio-Temporal Mixture of Experts (ST-MoE) layer implementation.
- `/home/ubuntu/FinCast-fts/src/tools`: General utility functions, metrics, and model utilities.
## Phase 2: Module-by-Module Deep Analysis
## Module Analysis
The FinCast-fts project is structured around a core deep learning model, the Financial Foundation Model (FFM), and its supporting infrastructure for data handling, training utilities, and inference. The architecture is heavily influenced by the TimesFM design, with significant modifications to incorporate a Spatio-Temporal Mixture of Experts (ST-MoE) layer.
### 1. Module: `peft_Fincast` (Parameter-Efficient Fine-Tuning)
* **Files**: `peft_injector.py`
* **Core Responsibility**: This module is responsible for integrating Parameter-Efficient Fine-Tuning (PEFT), specifically LoRA (Low-Rank Adaptation) or DoRA, into the pre-trained FFM. This allows for efficient fine-tuning of the large model on downstream tasks by only training a small fraction of new parameters.
* **Key Implementation Details**:
* **`wrap_with_peft` Function**: The main entry point, which takes the base model and LoRA hyperparameters (`lora_r`, `lora_alpha`, `lora_dropout`, `lora_targets_preset`). It uses the external `peft` library's `LoraConfig` and `get_peft_model` to inject the adapters.
* **Target Selection (`_default_targets`)**: Defines presets for selecting which linear layers (`nn.Linear`) within the FFM should receive LoRA adapters. Presets include:
* `attn`: Targets the attention mechanism's query/key/value projection (`qkv_proj`) and output projection (`o_proj`).
* `attn_mlp`: Extends `attn` to include the feed-forward layers in both the input and horizon blocks.
* `attn_mlp_gating`: Further extends to include the MoE gating mechanism (`moe.gate.to_gates`), indicating a focus on routing behavior.
* `experts_heavy`: Targets the most parameters by including the experts themselves (`experts.experts`, `gate_proj`, `down_proj`).
### 2. Module: `src/data_tools` (Data Handling and Batching)
* **Files**: `Inference_dataset.py`, `TSdataset.py`, `batch_sampler.py`, `batch_sampler_ddp.py`
* **Core Responsibility**: Manages the entire data pipeline, from reading raw CSV files to preparing batched, windowed, and optionally masked time-series data for both training and inference.
* **Key Implementation Details**:
* **`TimeSeriesDataset_MultiCSV_train_Production` (`TSdataset.py`)**: The primary training dataset class. It reads multiple CSVs, converts multi-column data into a collection of univariate series, applies Z-score normalization (`sklearn.preprocessing.StandardScaler`), and generates sliding windows with a configurable stride (`data_slice_interval`) and variable context lengths (`possible_context_lengths`). It also implements input masking (`mask_ratio`) for potential pre-training objectives.
* **`TimeSeriesDataset_SingleCSV_Inference` (`Inference_dataset.py`)**: A specialized dataset for inference on a single CSV, supporting both "last window" and "sliding window" modes. It returns metadata for traceability, which is crucial for post-inference analysis and plotting.
* **`GroupByLengthBatchSampler_Production` (`batch_sampler.py`)**: A custom PyTorch `BatchSampler` that groups samples by their context length (`get_length`). This is a critical optimization, as it eliminates the need for padding within a batch, maximizing GPU efficiency for the Transformer architecture.
* **`GroupByLengthBatchSampler_DDP` (`batch_sampler_ddp.py`)**: Extends the batch sampler for Distributed Data Parallel (DDP) training, ensuring that all ranks process a synchronized, deterministically shuffled subset of the data.
### 3. Module: `src/ffm` (Financial Foundation Model Core)
* **Files**: `data_loader.py`, `ffm_base.py`, `ffm_torch_moe.py`, `pytorch_patched_decoder_MOE.py`, `time_features.py`, `xreg_lib.py`
* **Core Responsibility**: Contains the model definition, configuration, base API, and components for handling time-series features and external regressors.
* **Key Implementation Details**:
* **`FFmBase` (`ffm_base.py`)**: Defines the abstract interface for the FFM API, including shared utilities like `_normalize` and `_renormalize` for per-time-series normalization. It also includes the complex logic for integrating **eXogenous Regressors (XReg)**, supporting two modes: "timesfm + xreg" (forecast residuals) and "xreg + timesfm" (forecast on residuals).
* **`FFmTorch` (`ffm_torch_moe.py`)**: The concrete PyTorch implementation of the FFM API. It initializes the core model (`PatchedTimeSeriesDecoder_MOE`) and implements the inference loop (`_forecast`), handling checkpoint loading (including compiled models) and device placement (CPU/GPU).
* **`PatchedTimeSeriesDecoder_MOE` (`pytorch_patched_decoder_MOE.py`)**: The main model class. It implements the Transformer-based decoder architecture, which operates on time-series patches.
* **Patching**: Input time-series are reshaped into patches (`[B, N, P]`) before being passed to the transformer.
* **Feature Injection**: It uses a `ResidualBlock` (`input_ff_layer`) to project the concatenated time-series patch and padding mask (`[P*2]`) into the model's hidden dimension.
* **Frequency Embedding**: A learnable embedding (`freq_emb`) is added to the input to condition the model on the time-series frequency (e.g., high, medium, low).
* **Output Head**: A final `ResidualBlock` (`horizon_ff_layer`) projects the transformer output to the prediction horizon, outputting both the mean and multiple quantiles.
* **`TimesFMDecoderLayer` (`pytorch_patched_decoder_MOE.py`)**: The core building block of the transformer stack. It consists of:
* **Attention**: `TimesFMAttention` (a standard multi-head attention with RMSNorm).
* **Mixture of Experts (MoE)**: `SparseMoEBlock` (from `st_moe_pytorch`) is used as the feed-forward network, which is the key architectural innovation.
* **`TimeCovariates` (`time_features.py`)**: Extracts a rich set of time-based features (minute, hour, day of week/month/year, month/week of year) and optional holiday features, which are then normalized.
### 4. Module: `src/st_moe_pytorch` (Spatio-Temporal MoE)
* **Files**: `distributed.py`, `st_moe_pytorch.py`
* **Core Responsibility**: Provides the implementation for the Mixture of Experts (MoE) layer, which is integrated into the FFM's transformer blocks. This module is adapted from a general-purpose MoE library.
* **Key Implementation Details**:
* **`MoE` (`st_moe_pytorch.py`)**: The main MoE class, composed of a `TopNGating` router and an `Experts` container.
* **`TopNGating`**: The router computes raw gate logits, applies Gumbel noise (during training), and uses a differentiable top-K selection to choose the top `top_n` experts for each token. It also calculates auxiliary losses (`balance_loss`, `router_z_loss`) to encourage balanced expert usage.
* **`Experts`**: A container for the individual `Expert` modules (which are simple MLPs). It handles the dispatching of tokens to the selected experts and combining the outputs.
* **`SparseMoEBlock`**: Wraps the `MoE` layer, adding pre- and post-feed-forward layers (`ff_before`, `ff_after`) and a residual connection, which is noted in the source code as a stabilization technique.
* **`distributed.py`**: Contains utility functions (`all_gather_variable_dim`, `AllGatherFunction`) for handling distributed communication (All-Gather) of variable-sized tensors, necessary for efficient distributed training of MoE models.
### 5. Module: `src/tools` (Utilities)
* **Files**: `inference_utils.py`, `metrics.py`, `model_utils.py`, `utils.py`
* **Core Responsibility**: Provides miscellaneous utilities for model loading, evaluation, metrics calculation, and visualization.
* **Key Implementation Details**:
* **`inference_utils.py`**: Contains the high-level `FinCast_Inference` class, which orchestrates the entire inference process: dataset creation, model loading, running the `DataLoader`, and post-processing the results. It also includes functions for plotting (`plot_last_outputs`) and saving outputs to CSV.
* **`metrics.py`**: Implements standard time-series evaluation metrics using NumPy, including MAE, MSE, RMSE, MAPE, MSPE, RSE, and CORR.
* **`model_utils.py`**: Simple helper to instantiate the FFM model (`FFM`) and its configuration (`FFmHparams`) from a checkpoint path.
* **`utils.py`**: Provides logging and parameter counting utilities (`log_model_statistics`) for tracking model size and configuration.
---
## Module PlantUML Diagrams
### 1. Module: `peft_Fincast`
```puml
@startuml peft_Fincast
skinparam classAttributeIconSize 0
package "peft_Fincast" {
class peft_injector {
+ wrap_with_peft(model, ...)
-- Private --
- _default_targets(model, preset)
- resolve_linear_targets(model, patterns)
- _unfreeze_all_params(model)
}
}
package "External: peft" {
class LoraConfig
class get_peft_model
}
package "External: torch" {
class nn.Module
class nn.Linear
}
peft_injector ..> LoraConfig : uses
peft_injector ..> get_peft_model : uses
peft_injector ..> nn.Module : operates on
peft_injector ..> nn.Linear : targets
@enduml
```
### 2. Module: `src/data_tools`
```puml
@startuml data_tools
skinparam classAttributeIconSize 0
package "data_tools" {
class TimeSeriesDataset_MultiCSV_train_Production {
+ __init__(...)
+ __len__()
+ get_length(idx)
+ __getitem__(idx)
-- Private --
- _read_csvs()
- _prepare_index_records()
}
class TimeSeriesDataset_SingleCSV_Inference {
+ __init__(...)
+ __len__()
+ get_length(idx)
+ __getitem__(idx)
-- Private --
- _make_meta(series_idx, window_start)
}
class GroupByLengthBatchSampler_Production {
+ __init__(dataset, batch_size, ...)
+ __iter__()
+ __len__()
}
class GroupByLengthBatchSampler_DDP {
+ __init__(dataset, batch_size, ...)
+ __iter__()
+ __len__()
+ set_epoch(epoch)
}
object function {
+ freq_reader(file_path, freq_dict, mode)
}
}
TimeSeriesDataset_MultiCSV_train_Production ..> function : uses freq_reader
TimeSeriesDataset_SingleCSV_Inference ..> function : uses freq_reader
GroupByLengthBatchSampler_Production ..> TimeSeriesDataset_MultiCSV_train_Production : operates on
GroupByLengthBatchSampler_DDP ..> TimeSeriesDataset_MultiCSV_train_Production : operates on
TimeSeriesDataset_MultiCSV_train_Production .up.|> torch.utils.data.Dataset
TimeSeriesDataset_SingleCSV_Inference .up.|> torch.utils.data.Dataset
GroupByLengthBatchSampler_DDP .up.|> torch.utils.data.Sampler
GroupByLengthBatchSampler_Production .up.|> torch.utils.data.BatchSampler
@enduml
```
### 3. Module: `src/ffm` (Core Model)
```puml
@startuml ffm_core
skinparam classAttributeIconSize 0
package "ffm" {
class FFmHparams << (D,orchid) dataclass >> {
+ context_len : int
+ horizon_len : int
+ num_experts : int
+ gating_top_n : int
+ ...
}
abstract class FFmBase {
+ __init__(hparams, checkpoint, ...)
+ forecast(...)
+ forecast_on_df(...)
-- Private --
- _preprocess(inputs, freq)
- _forecast(...)
}
class FFmTorch {
+ __init__(hparams, checkpoint, ...)
+ load_from_checkpoint_ffm(checkpoint)
+ model_eval_mode()
-- Private --
- _forecast(...)
}
class PatchedTimeSeriesDecoder_MOE {
+ config : FFMConfig
+ input_ff_layer : ResidualBlock
+ horizon_ff_layer : ResidualBlock
+ stacked_transformer : StackedDecoder
+ decode(...)
+ forward(...)
-- Private --
- _preprocess_input(...)
- _postprocess_output(...)
- _forward_transform(...)
- _reverse_transform(...)
}
class TimeSeriesdata << (T,yellow) TensorFlow >> {
+ __init__(...)
+ train_gen()
+ test_val_gen(mode, shift)
+ tf_dataset(mode, shift)
}
class TimeCovariates {
+ __init__(datetimes, ...)
+ get_covariates()
}
class BatchedInContextXRegLinear {
+ fit(...)
+ create_covariate_matrix(...)
}
}
FFmTorch --|> FFmBase
FFmTorch o-- PatchedTimeSeriesDecoder_MOE : wraps
FFmBase o-- FFmHparams : config
FFmBase ..> BatchedInContextXRegLinear : uses for XReg
TimeSeriesdata ..> TimeCovariates : uses
PatchedTimeSeriesDecoder_MOE ..> FFMConfig : config
PatchedTimeSeriesDecoder_MOE ..> StackedDecoder : contains
PatchedTimeSeriesDecoder_MOE ..> ResidualBlock : contains
@enduml
```
### 4. Module: `src/st_moe_pytorch` (Spatio-Temporal MoE)
```puml
@startuml st_moe_pytorch
skinparam classAttributeIconSize 0
package "st_moe_pytorch" {
class MoE {
+ gate : TopNGating
+ experts : Experts
+ forward(x, ...) : MixtureOfExpertsReturn
}
class SparseMoEBlock {
+ moe : MoE
+ ff_before : Expert
+ ff_after : Expert
+ forward(x, ...) : MixtureOfExpertsReturn
}
class TopNGating {
+ to_gates : nn.Linear
+ forward(x, ...) : dispatch_tensor, combine_tensor, ...
}
class Experts {
+ experts : ModuleList
+ forward(x, ...)
}
class Expert {
+ gate_proj : nn.Linear
+ down_proj : nn.Linear
+ forward(x, paddings)
}
class AllGatherFunction << (F,darkgreen) Distributed >>
class AllGather << (M,darkgreen) Distributed >>
}
SparseMoEBlock o-- MoE
MoE o-- TopNGating
MoE o-- Experts
Experts o-- Expert
TopNGating ..> AllGather : uses (indirectly via distributed utils)
@enduml
```
### 5. Module: `src/tools` (Utilities)
```puml
@startuml tools
skinparam classAttributeIconSize 0
package "tools" {
class FinCast_Inference {
+ __init__(config)
+ run_inference(...)
-- Private --
- _make_inference_loader(...)
}
object function {
+ plot_last_outputs(...)
+ _save_outputs_to_csv(...)
+ get_model_api(...)
+ log_model_statistics(...)
+ MAE, MSE, RMSE, MAPE, RSE, CORR
}
}
FinCast_Inference ..> data_tools.TimeSeriesDataset_SingleCSV_Inference : creates
FinCast_Inference ..> ffm.FFmTorch : loads model API
FinCast_Inference ..> function : uses utilities
@enduml
```
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
## Core Abstractions, Design Philosophy, and Lifecycle Management
The FinCast-fts project implements a sophisticated architecture for financial time-series forecasting, centered on the **Financial Foundation Model (FFM)**. Its design is characterized by a set of powerful abstractions and a clear philosophy focused on scalability, efficiency, and predictive richness.
### Core Abstractions
The system is built upon four primary abstractions that govern how time-series data is processed and modeled:
1. **Time-Series Patch**: The fundamental unit of data input is not a single time step but a **patch** (defined by `patch_len`, typically 32). The input time-series is segmented into a sequence of overlapping or non-overlapping patches, transforming the 1D series into a 2D sequence (`[N_patches, Patch_len]`). This patching mechanism is a core component of the underlying TimesFM architecture, enabling the Transformer to process local temporal patterns efficiently.
2. **Frequency Embedding**: The model explicitly handles time-series data of varying frequencies (e.g., high, medium, low) by introducing a **Frequency Embedding** (`freq_emb`). This categorical embedding is added to the input representation, allowing the single FFM to condition its internal weights and attention mechanisms based on the inherent periodicity and characteristics of the input data.
3. **Spatio-Temporal Mixture of Experts (ST-MoE)**: This is the central architectural innovation. The traditional Feed-Forward Network (FFN) within the Transformer block is replaced by a **SparseMoEBlock**. This abstraction allows the model to scale its parameter count dramatically (via multiple "experts") while maintaining a constant computational cost during inference. For any given input token (patch), a router selects only the top $K$ experts to process the data, enabling high capacity with sparse activation.
4. **Quantile Forecast**: The model's output is abstracted beyond a simple point prediction (mean/median). The final layer predicts a full set of **Quantiles** (e.g., 0.1, 0.5, 0.9), providing a complete predictive distribution. This is essential for financial applications where risk assessment and uncertainty quantification are critical.
### Design Philosophy
The project's design adheres to three key philosophical tenets:
* **Foundation Model Paradigm**: The FFM is designed as a large, pre-trained model capable of zero-shot or few-shot generalization across diverse financial time-series datasets. The goal is to capture universal temporal patterns and financial market dynamics, making it a powerful base model for various downstream tasks.
* **Efficiency and Scalability**: The combination of **ST-MoE** and **Parameter-Efficient Fine-Tuning (PEFT)** drives the efficiency philosophy. ST-MoE ensures that the model can scale its capacity (number of experts) without a proportional increase in computational load. PEFT, implemented via the `peft_Fincast` module, allows for rapid, low-resource fine-tuning by only training small, low-rank adapters (LoRA) instead of the entire massive model.
* **Data-Centric Optimization**: The use of the custom `GroupByLengthBatchSampler` is a pragmatic design choice to maximize hardware utilization. By grouping time-series samples by their context length, the system eliminates the need for zero-padding within batches, ensuring that all computation is meaningful and accelerating the training process significantly.
### Lifecycle Management
The project's lifecycle is clearly delineated across its modules:
| Phase | Module(s) Responsible | Key Components |
| :--- | :--- | :--- |
| **Data Ingestion & Preparation** | `src/data_tools`, `src/ffm/time_features.py` | `TSdataset`, `Inference_dataset`, `TimeCovariates` |
| **Training Optimization** | `src/data_tools` | `GroupByLengthBatchSampler_Production`, `GroupByLengthBatchSampler_DDP` |
| **Model Definition & Training** | `src/ffm`, `src/st_moe_pytorch` | `PatchedTimeSeriesDecoder_MOE`, `MoE`, `Expert` |
| **Model Adaptation** | `peft_Fincast` | `peft_injector.py` (LoRA/DoRA) |
| **Inference & Evaluation** | `src/tools` | `FinCast_Inference`, `metrics.py`, `plot_last_outputs` |
The `FinCast_Inference` class acts as the central orchestrator for the inference lifecycle, managing the loading of the model, the data flow from the `Inference_dataset`, and the final post-processing and visualization of the quantile forecasts.
#### 3.1.2. Component Interactions
## Component Interactions, Data Flow, and Communication Patterns
The FinCast-fts architecture is a tightly integrated system where data flows sequentially from raw input through data preparation, model processing, and finally to output generation. The core interaction pattern is a pipeline-style data transformation, with a critical internal loop governed by the Mixture of Experts (MoE) mechanism.
### 1. Data Flow Pipeline
The overall data flow can be broken down into three main stages:
| Stage | Source Module | Destination Module | Data Transformation |
| :--- | :--- | :--- | :--- |
| **Input & Preprocessing** | Raw CSV Files | `src/data_tools` | Raw time-series data is read, normalized (Z-score), and segmented into context windows and future horizons. Time features (e.g., day of week, month) are extracted by `TimeCovariates` and potentially used as eXogenous Regressors (XReg). |
| **Model Forward Pass** | `src/data_tools` (Batches) | `src/ffm` (Model) | Batches of time-series windows (`x_context`, `x_padding`, `freq`) are fed into the `PatchedTimeSeriesDecoder_MOE`. The input is patched, normalized, and embedded with frequency information. |
| **Output & Post-processing** | `src/ffm` (Forecasts) | `src/tools` | The model outputs a tensor of mean and quantile forecasts. This is denormalized, sliced to the required horizon, and then processed by `FinCast_Inference` for saving to CSV or visualization (`plot_last_outputs`). |
### 2. Core Model Interaction: The Transformer Block with MoE
The most complex interaction occurs within the `PatchedTimeSeriesDecoder_MOE` (the FFM). Each layer of the `StackedDecoder` (a `TimesFMDecoderLayer`) involves a sequence of interactions:
1. **Input**: The hidden state (`hidden_states`) from the previous layer enters the current layer.
2. **Attention**: The hidden state first passes through the **TimesFMAttention** module. This is a standard self-attention mechanism, where the input interacts with itself to capture long-range temporal dependencies.
3. **Normalization**: The output of the attention block is normalized using **RMSNorm** before entering the MoE block.
4. **MoE Routing (Sparse Activation)**:
* The normalized hidden state enters the **SparseMoEBlock**.
* The **TopNGating** module (the router) calculates the probability of sending the token (patch) to each expert.
* It selects the top $K$ experts (e.g., $K=2$) based on these probabilities.
* A **Dispatch Tensor** is created, which maps each token to its selected expert(s) and their position within the expert's mini-batch.
5. **Expert Computation**:
* The tokens are dispatched to the **Experts** module.
* Each expert (a simple MLP) processes its assigned subset of tokens in parallel.
6. **MoE Combination**:
* The **Combine Tensor** (containing the weights from the router) is used to aggregate the outputs from the activated experts back into the original sequence order and dimension.
7. **Output**: The combined output is added to the input via a residual connection, and the process repeats for the next layer.
This sparse activation pattern is the key communication pattern: it ensures that only a small, dynamic subset of the model's total parameters is activated for any given input, enabling the model's high capacity.
### 3. Communication Patterns (Distributed)
The `src/st_moe_pytorch/distributed.py` module reveals the project's design for handling distributed training (DDP), which is essential for scaling MoE models:
* **All-Gather for Variable-Sized Tensors**: The `AllGather` class and its underlying `AllGatherFunction` are designed to collect tensors from all Distributed Data Parallel (DDP) ranks. Crucially, it handles **variable sequence lengths** (`all_gather_variable_dim`).
* In a typical MoE setup, the tokens dispatched to an expert on one GPU might have a different batch size than the tokens dispatched to the same expert on another GPU.
* The `AllGather` mechanism ensures that the necessary data is collected across all ranks, padded to a uniform size (`max_size`), and then unpadded after the operation, allowing for correct processing and gradient flow in a distributed environment.
This pattern is a low-level optimization to ensure that the MoE's routing and expert computation can be correctly synchronized and scaled across multiple GPUs.
### 4. External Regressor (XReg) Interaction
The `FFmBase` class includes complex logic for integrating external regressors using `xreg_lib.py`. This interaction is highly configurable:
* **Data Preparation**: The `BatchedInContextXRegLinear` class prepares the time-series data (`targets`) and the external covariates (numerical, categorical, static, dynamic) into a flattened, batched matrix format (`x_train`, `x_test`).
* **Two-Way Interaction**:
* **Mode 1 (`timesfm + xreg`)**: The FFM forecasts the time-series, and the XReg model is trained on the *residuals* (the difference between the FFM's forecast and the true value). The final forecast is the FFM output plus the XReg residual forecast.
* **Mode 2 (`xreg + timesfm`)**: The XReg model is trained on the *raw time-series*. The FFM is then trained on the *residuals* (the difference between the XReg model's forecast and the true value). The final forecast is the XReg output plus the FFM residual forecast.
This flexible interaction pattern allows the FFM to focus on complex, non-linear temporal dependencies while offloading the modeling of linear, exogenous effects to a simpler, more interpretable linear regression model.
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml FinCast_Architecture_v4
!theme toy
title FinCast-fts Overall Architecture
' Define Modules (Packages)
package "Data Pipeline (src/data_tools)" as Data {
class TSdataset
class Inference_dataset
class BatchSampler
}
package "Model Core (src/ffm)" as FFM {
class FFmTorch
class PatchedTimeSeriesDecoder_MOE
class TimeCovariates
class BatchedInContextXRegLinear
}
package "MoE Implementation (src/st_moe_pytorch)" as MoE {
class SparseMoEBlock
class MoE_Router
class Expert_MLP
}
package "Utilities & Inference (src/tools)" as Tools {
class FinCast_Inference
class Metrics
}
package "Adaptation (peft_Fincast)" as PEFT {
class peft_injector
}
' External Entities
[Raw CSV Data] as RawData
[External Libraries] as ExtLibs
' 1. Data Flow
RawData --> TSdataset : Reads
TSdataset --> FFmTorch : Supplies Batches
' 2. Model Instantiation and Configuration
FFmTorch o-- PatchedTimeSeriesDecoder_MOE : Instantiates
' 3. Model Structure (FFM)
PatchedTimeSeriesDecoder_MOE o-- SparseMoEBlock : Uses (in Transformer Layer)
PatchedTimeSeriesDecoder_MOE ..> TimeCovariates : Uses for Time Features
PatchedTimeSeriesDecoder_MOE ..> BatchedInContextXRegLinear : Uses for XReg
' 4. MoE Structure
SparseMoEBlock o-- MoE_Router : Routes Tokens
SparseMoEBlock o-- Expert_MLP : Executes Computation
' 5. Inference and Output
FinCast_Inference ..> Inference_dataset : Uses Dataset
FinCast_Inference ..> FFmTorch : Calls Forecast API
FinCast_Inference ..> Metrics : Calculates Performance
FFmTorch --> FinCast_Inference : Returns Forecasts
' 6. Adaptation
peft_injector ..> PatchedTimeSeriesDecoder_MOE : Wraps Model for Fine-Tuning
' 7. Data Flow within Data Module
TSdataset ..> BatchSampler : Uses for Batching
' 8. External Dependencies
ExtLibs .up.> MoE : (einops, torch.distributed)
ExtLibs .up.> PEFT : (peft library)
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
## Design Patterns
The FinCast-fts codebase employs several established software design patterns and specialized architectural patterns common in deep learning to achieve modularity, flexibility, and performance.
### 1. Architectural Pattern: Mixture of Experts (MoE)
The core architectural pattern is the **Mixture of Experts (MoE)**, which is implemented in the `src/st_moe_pytorch` module and integrated into the FFM's transformer layers.
* **Pattern**: Replaces the standard Feed-Forward Network (FFN) with a collection of expert networks and a trainable gating network (router).
* **Implementation**:
* The `MoE` class in `st_moe_pytorch.py` encapsulates the entire mechanism.
* The `TopNGating` component acts as the router, using a soft-max over logits to determine the weight of each expert for a given token.
* The `Expert` class represents the individual, specialized MLPs.
* **Code Example (from `st_moe_pytorch.py`):**
```python
# MoE class initialization
self.gate = TopNGating(...)
self.experts = Experts(...)
# MoE forward pass
dispatch_tensor, combine_tensor, balance_loss, router_z_loss = self.gate(x, ...)
expert_inputs = einsum('b n d, b n e c -> b e c d', x, dispatch_tensor)
expert_outputs = self.experts(expert_inputs, ...)
output = einsum('b e c d, b n e c -> b n d', expert_outputs, combine_tensor)
```
### 2. Structural Pattern: Adapter
The **Adapter Pattern** is used to reconcile the core model implementation with the desired external API interface.
* **Pattern**: Converts the interface of a class into another interface clients expect.
* **Implementation**: The `FFmTorch` class (`ffm_torch_moe.py`) acts as an adapter, inheriting from the abstract `FFmBase` (`ffm_base.py`) and wrapping the concrete PyTorch model (`PatchedTimeSeriesDecoder_MOE`). This allows the model to conform to the TimesFM-inspired API (`forecast`, `forecast_on_df`) while using a custom PyTorch implementation.
### 3. Behavioral Pattern: Strategy
The integration of eXogenous Regressors (XReg) follows the **Strategy Pattern**, allowing the user to select one of two distinct XReg integration methods at runtime.
* **Pattern**: Defines a family of algorithms, encapsulates each one, and makes them interchangeable.
* **Implementation**: The `FFmBase` class's `forecast_with_xreg` method accepts an `xreg_mode` parameter (`"timesfm + xreg"` or `"xreg + timesfm"`), which determines the strategy for combining the FFM forecast with the linear regressor (`BatchedInContextXRegLinear`).
### 4. Creational Pattern: Factory Method
A simple form of the **Factory Method Pattern** is used for model instantiation.
* **Pattern**: Defines an interface for creating an object, but lets subclasses decide which class to instantiate.
* **Implementation**: The `get_model_FFM` function in `src/tools/model_utils.py` centralizes the logic for creating the FFM model instance (`FFM`) and its configuration (`FFmHparams`) from a checkpoint path, abstracting the complex setup from the main inference logic.
### 5. Idiomatic Pattern: Skip Connections (Residual Block)
The **Residual Block** pattern is fundamental to the stability and training of deep neural networks.
* **Pattern**: Adds the input of a layer to its output, bypassing one or more layers.
* **Implementation**:
* The `ResidualBlock` class in `pytorch_patched_decoder_MOE.py` explicitly implements this pattern for the input and horizon feed-forward layers.
* The `TimesFMDecoderLayer` and `SparseMoEBlock` also utilize residual connections around their main computational units (attention and MoE).
* **Code Example (from `pytorch_patched_decoder_MOE.py`):**
```python
class ResidualBlock(nn.Module):
# ... (hidden_layer, output_layer, residual_layer defined)
def forward(self, x):
hidden = self.hidden_layer(x)
output = self.output_layer(hidden)
residual = self.residual_layer(x)
return output + residual # The skip connection
```
#### 3.3.2. Project Highlights
## Project Highlights
The FinCast-fts project showcases several innovative features and design choices that contribute to its effectiveness, extensibility, and efficiency in financial time-series forecasting.
* **Spatio-Temporal Mixture of Experts (ST-MoE) Integration**:
* **Highlight**: The core innovation is the seamless integration of the MoE architecture into the Transformer decoder, replacing the standard FFN. This allows the model to achieve a massive parameter count (high capacity) while maintaining a low, constant computational cost during the forward pass (sparse activation).
* **Benefit**: This is crucial for foundation models, as it enables the FFM to learn highly specialized patterns (experts) for different types of time-series or market regimes without becoming prohibitively slow or expensive to run. The `st_moe_pytorch` module, with its custom `TopNGating` and auxiliary loss functions, ensures the experts are used efficiently and balanced during training.
* **Efficient Training via Length-Based Batching**:
* **Highlight**: The use of the custom `GroupByLengthBatchSampler` in `src/data_tools` is a significant performance optimization. This sampler groups time-series samples with identical context lengths into the same batch.
* **Benefit**: In a Transformer architecture, padding is a major source of wasted computation. By eliminating intra-batch padding, the project maximizes the utilization of GPU memory and compute, leading to faster training times and higher throughput, especially when dealing with time-series of varying lengths.
* **Parameter-Efficient Fine-Tuning (PEFT) Support**:
* **Highlight**: The dedicated `peft_Fincast` module provides first-class support for PEFT techniques like LoRA and DoRA. It includes predefined presets (`attn`, `attn_mlp_gating`, `experts_heavy`) to target specific layers for adapter injection.
* **Benefit**: This design choice directly addresses the challenge of fine-tuning large foundation models. Instead of retraining the entire FFM, users can fine-tune a small set of parameters (the adapters) for a new task, drastically reducing training time, memory footprint, and storage requirements for task-specific models. This enhances the model's **extensibility** to new financial datasets.
* **Comprehensive Time-Series Feature Engineering**:
* **Highlight**: The `TimeCovariates` class in `src/ffm/time_features.py` extracts a rich, normalized set of temporal features (e.g., minute-of-hour, day-of-year, holiday proximity).
* **Benefit**: This feature set provides the model with explicit, high-quality information about the time context, which is vital for financial data where seasonality and calendar effects (like holidays) are strong predictors. This design improves the model's **flexibility** and predictive power across different time granularities.
* **Quantile Forecasting for Risk Management**:
* **Highlight**: The model's output head is designed to predict not just the mean, but a full distribution of quantiles (e.g., 0.1 to 0.9).
* **Benefit**: In finance, point forecasts are often insufficient. By providing a full predictive distribution, the FFM enables advanced risk management, Value-at-Risk (VaR) calculations, and confidence interval estimation, making the model's output more **actionable** for trading and investment strategies.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
## Improvement Suggestions
Based on the comprehensive analysis of the FinCast-fts codebase, the following suggestions are proposed to address potential performance bottlenecks, optimize the architecture, and enhance code quality.
### 1. Performance Bottlenecks and Optimization
| Area | Suggestion | Rationale and Impact |
| :--- | :--- | :--- |
| **Data Loading (CPU)** | Implement a more efficient data loading mechanism for large-scale datasets, potentially using Apache Arrow or Parquet format instead of CSV. | The current implementation in `TSdataset.py` relies on `pd.read_csv` and `np.vstack`, which can be slow and memory-intensive for massive financial datasets. Using columnar formats and memory-mapped files can significantly reduce I/O overhead and memory usage. |
| **XReg Solver** | Replace the JAX-based `BatchedInContextXRegLinear` with a PyTorch-native or highly optimized C++/CUDA linear algebra solver (e.g., using `torch.linalg.solve`). | The current XReg implementation in `xreg_lib.py` uses JAX, which introduces a dependency on a separate ecosystem and requires data transfer between PyTorch (model) and JAX (XReg). A unified PyTorch solution would eliminate this overhead and simplify the dependency stack. |
| **MoE Dispatch** | Optimize the MoE dispatch and combine operations for GPU. | The `st_moe_pytorch` module relies heavily on `einsum` and tensor manipulation (`rearrange`, `pack`, `unpack`). While flexible, these operations can be less performant than highly optimized custom CUDA kernels used in production-grade MoE implementations (e.g., Fairseq's Fused MoE). Investigating a fused kernel implementation for the dispatch/combine steps could yield significant speedups. |
### 2. Architecture Optimization
* **Decouple FFM from XReg**: The tight coupling of the FFM (`FFmBase`) with the XReg logic makes the core model API complex. It is recommended to separate the XReg functionality into a standalone wrapper class that takes a trained FFM model and applies the XReg logic externally. This would simplify the `FFmBase` interface and make the core model more modular.
* **Standardize Configuration Management**: The current configuration is spread across `FFmHparams` (dataclass) and `FFMConfig` (dataclass). It is recommended to consolidate all hyperparameters into a single, canonical configuration class (e.g., using `dataclasses` or `pydantic`) and pass this single object throughout the system. This improves clarity and reduces the risk of inconsistent parameter settings.
* **Refactor `pytorch_patched_decoder_MOE.py`**: This file is excessively large (over 800 lines) and contains multiple classes (`FFMConfig`, `TimesFMAttention`, `TimesFMDecoderLayer`, `PatchedTimeSeriesDecoder_MOE`). Breaking this file into smaller, more focused modules (e.g., `attention.py`, `decoder_layer.py`, `model.py`) would significantly improve code navigation and maintainability.
### 3. Code Quality and Maintainability
* **Type Hinting and Docstrings**: While type hints are present, consistency can be improved, especially in utility functions and complex tensor manipulation code. Comprehensive docstrings following a standard format (e.g., Google or NumPy style) should be added to all public methods and classes, particularly in the `st_moe_pytorch` module, which is complex due to its distributed nature.
* **Remove Redundant TensorFlow Code**: The `src/ffm/data_loader.py` file contains a TensorFlow-based data loader (`TimeSeriesdata`). Since the rest of the project is PyTorch-native, this file appears to be vestigial code from the original TimesFM project. It should be removed or clearly marked as deprecated to avoid confusion and unnecessary dependencies.
* **Consistent Naming Conventions**: The project uses a mix of naming conventions (e.g., `FFmTorch`, `PatchedTimeSeriesDecoder_MOE`, `peft_injector`). Adopting a consistent style (e.g., all classes using `PascalCase` and all functions using `snake_case`) across all modules would enhance readability.
#### 3.4.2. Secondary Development Guide
## Secondary Development Guide
This guide provides a structured approach for exploring the FinCast-fts codebase and conducting secondary development, such as fine-tuning, adding new features, or integrating new data sources.
### 1. Code Exploration Path
To understand the project, follow the data flow and model architecture sequentially:
1. **Data Preparation (`src/data_tools`)**:
* Start with `src/data_tools/TSdataset.py` to understand how raw CSV data is converted into univariate time-series and how sliding windows are generated for training.
* Examine `src/data_tools/batch_sampler.py` to grasp the length-based batching optimization, which is crucial for efficient training.
2. **Model Core and Configuration (`src/ffm`)**:
* Review `src/ffm/ffm_base.py` and `src/ffm/ffm_torch_moe.py` to understand the high-level API and model loading process.
* The core model logic is in `src/ffm/pytorch_patched_decoder_MOE.py`. Focus on the `PatchedTimeSeriesDecoder_MOE` class, particularly the `_preprocess_input` method (patching, normalization) and the `forward` method (Transformer stack, frequency embedding).
3. **Architectural Innovation (`src/st_moe_pytorch`)**:
* Deep dive into `src/st_moe_pytorch/st_moe_pytorch.py`. This module defines the MoE mechanism. Understanding the `TopNGating` (router) and `MoE` (expert dispatch/combine) is key to modifying the model's capacity or routing behavior.
### 2. Best Practices for Fine-Tuning (PEFT)
The recommended path for secondary development is **Parameter-Efficient Fine-Tuning (PEFT)** using the provided `peft_Fincast` module.
* **Select a Target Preset**: Use the `peft_injector.py` to wrap your pre-trained FFM. Start with a minimal preset like `"attn"` or `"attn_mlp"` to ensure stability. For maximum capacity increase, use `"experts_heavy"`.
* **Hyperparameter Tuning**: Focus on tuning the LoRA rank (`lora_r`) and alpha (`lora_alpha`). A higher rank increases the number of trainable parameters and model capacity but also increases memory usage.
* **Training Loop**: The fine-tuning process should be identical to the original training loop, but only the LoRA adapter parameters will have `requires_grad=True`.
### 3. Adding New Features
* **New Time Features**: To add a new temporal covariate (e.g., lunar cycle, specific market hours), modify the `TimeCovariates` class in `src/ffm/time_features.py`. Ensure the new feature is correctly normalized and added to the output DataFrame.
* **New Exogenous Regressors (XReg)**: If you are adding new external data (e.g., sentiment scores, macroeconomic indicators), ensure they are prepared in the `FFmBase`'s `forecast_with_xreg` method and integrated into the `BatchedInContextXRegLinear` in `src/ffm/xreg_lib.py`. This requires providing the new data as `dynamic_numerical_covariates` or `static_numerical_covariates` to the XReg fitting process.
* **Custom Expert**: To experiment with a different expert architecture (e.g., a different activation function or a deeper MLP), modify the `Expert` class definition in `src/st_moe_pytorch/st_moe_pytorch.py`. Ensure the input and output dimensions remain consistent with the model's `hidden_size`.
================================================
FILE: thirdparty/FinGPT.md
================================================
# FinGPT - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
The FinGPT repository is structured as a collection of distinct, yet related, sub-projects, each focusing on a specific financial application of Large Language Models (LLMs). This modular structure facilitates independent development and deployment of different FinLLM capabilities.
```
/home/ubuntu/FinGPT/
├── fingpt/
│ ├── FinGPT_Benchmark/ # Module 1: Benchmarking and Fine-tuning Utilities
│ │ ├── benchmarks/ # Contains scripts for various financial NLP benchmarks (e.g., ConvFinQA, FiQA).
│ │ ├── data/ # Data download and preparation scripts for benchmarks.
│ │ ├── train_lora.py # Script for LoRA-based fine-tuning of models on benchmark datasets.
│ │ └── utils.py # Utility functions for model path parsing, dataset loading, and tokenization.
│ ├── FinGPT_FinancialReportAnalysis/ # Module 2: Financial Report Analysis (RAG)
│ │ ├── reportanalysis.ipynb # Jupyter notebook demonstrating the RAG analysis flow.
│ │ └── utils/ # Core RAG implementation, including document formatting and clustering (Raptor).
│ │ ├── earning_calls.py # Utilities for processing earning call transcripts.
│ │ ├── format_pdf.py # Utilities for formatting PDF documents.
│ │ └── rag.py # Core implementation of the Recursive Abstractive Clustering (Raptor) RAG system.
│ ├── FinGPT_Forecaster/ # Module 3: Financial Forecasting
│ │ ├── AAAI-Good-Data/ # Sub-module for a specific dataset/training configuration (e.g., AAAI paper data).
│ │ ├── FinGPT-Forecaster-Chinese/ # Sub-module for Chinese-specific forecasting data and models.
│ │ ├── app.py # Streamlit or Flask application for the forecaster interface.
│ │ ├── data_pipeline.py # Script for data acquisition, prompt generation, and dataset creation.
│ │ ├── data.py # Core data preparation functions.
│ │ ├── indices.py # Definitions of financial indices (DOW, EURO-STOXX, CRYPTO).
│ │ └── prompt.py # Functions for generating prompts for the LLM.
│ ├── FinGPT_MultiAgentsRAG/ # Module 4: Multi-Agent RAG and Evaluation (Experimental)
│ │ ├── Evaluation_methods/ # Contains evaluation scripts (HaluEval, MMLU, TruthfulQA).
│ │ ├── Fine_tune_model/ # Notebooks for fine-tuning models (e.g., GLM2, Llama2).
│ │ ├── MultiAgents/ # Notebooks demonstrating multi-agent inference.
│ │ └── RAG/ # Notebooks for RAG implementation.
│ ├── FinGPT_Others/ # Module 5: Miscellaneous/Older Projects
│ │ ├── FinGPT_Low_Code_Development/ # Low-code development examples.
│ │ ├── FinGPT_Robo_Advisor/ # Robo-advisor examples.
│ │ └── FinGPT_Trading/ # Trading examples.
│ ├── FinGPT_RAG/ # Module 6: General RAG and Data Scraping
│ │ ├── instruct-FinGPT/ # Scripts for supervised fine-tuning (SFT) and inference.
│ │ └── multisource_retrieval/ # Web scraping and data retrieval utilities.
│ │ ├── external_LLMs/ # Utilities for external LLM integration.
│ │ ├── scrapers/ # Specific web scrapers (Yahoo, CNBC, Google, etc.).
│ │ └── utils/ # Classification and formatting utilities.
│ ├── FinGPT_Sentiment_Analysis_v1/ # Module 7: Sentiment Analysis (Older Version)
│ └── FinGPT_Sentiment_Analysis_v3/ # Module 8: Sentiment Analysis (Latest Version)
│ ├── benchmark/ # Benchmarking notebooks.
│ ├── data/ # Data preparation notebooks.
│ │ └── training_parallel/ # Parallel training scripts (e.g., using DeepSpeed).
├── requirements.txt # Project dependencies.
└── setup.py # Installation script.
```
### 1.2. Core Folders for Analysis
* `/home/ubuntu/FinGPT/fingpt/FinGPT_Benchmark`: Contains the infrastructure for evaluating and fine-tuning FinLLMs on various financial NLP tasks. It includes utilities for data preparation, model loading, and LoRA-based training.
* `/home/ubuntu/FinGPT/fingpt/FinGPT_FinancialReportAnalysis/utils`: Houses the core logic for the RAG system applied to financial documents, notably the **Raptor** (Recursive Abstractive Clustering) implementation for document chunking and summarization.
* `/home/ubuntu/FinGPT/fingpt/FinGPT_Forecaster`: Contains the complete pipeline for financial forecasting, from data acquisition and prompt engineering to dataset creation for model training.
* `/home/ubuntu/FinGPT/fingpt/FinGPT_RAG/multisource_retrieval`: The primary module for web scraping and multi-source data retrieval, which is a critical component for feeding real-time financial news into the LLM.
* `/home/ubuntu/FinGPT/fingpt/FinGPT_Sentiment_Analysis_v3`: The latest implementation for sentiment analysis model training, including parallel training configurations and benchmarking tools.
## Phase 2: Module-by-Module Deep Analysis
### Module 1: FinGPT_Benchmark
- **Core Responsibility**: Provides a standardized environment for fine-tuning and evaluating various base LLMs (Llama2, ChatGLM2, Qwen, etc.) on financial tasks using the LoRA technique.
- **Key Files**:
- `utils.py`: Defines model-specific LoRA target modules (`lora_module_dict`), prompt templates (`template_dict`), model path parsing (`parse_model_name`), and a robust dataset loading mechanism (`load_dataset`) that supports replication and remote/local loading.
- `train_lora.py`: The main training script. It loads the model, tokenizer, and dataset, applies LoRA configuration, and uses the Hugging Face `Trainer` with DeepSpeed for efficient, parallelized fine-tuning. It also integrates with **WandB** for experiment tracking.
- **Implementation Details**: The `tokenize` function in `utils.py` is critical, handling the concatenation of instruction, input, and output, and ensuring the sequence length does not exceed the model's maximum length, a common challenge in LLM fine-tuning. The use of `parse_model_name` centralizes the mapping between a simple model name (e.g., 'llama2') and its corresponding Hugging Face repository path.
### Module 2: FinGPT_FinancialReportAnalysis/utils
- **Core Responsibility**: Implements the **Raptor** (Recursive Abstractive Processing for Tree-Organized Retrieval) RAG framework for processing large financial documents (like earnings call transcripts or PDFs) by recursively clustering and summarizing text chunks to create a hierarchical index.
- **Key Files**:
- `rag.py`: Contains the `Raptor` class. This class uses **UMAP** for dimensionality reduction and **Gaussian Mixture Model (GMM)** with **BIC** for optimal cluster determination. The key methods are `recursive_embed_cluster_summarize` and `text_spliter`, which implement the hierarchical chunking and summarization process.
- `format_pdf.py`: Handles the initial processing and formatting of PDF documents.
- `earning_calls.py`: Contains specific logic for handling earnings call data.
- **Implementation Details**: The `Raptor` class is a sophisticated implementation of hierarchical RAG. It first splits the text using `RecursiveCharacterTextSplitter`, then iteratively applies embedding, UMAP reduction, GMM clustering (using BIC for optimal cluster count), and LLM-based summarization. This recursive process creates a multi-layered knowledge base, significantly improving the context quality for RAG queries on long documents.
### Module 3: FinGPT_Forecaster
- **Core Responsibility**: Manages the end-to-end pipeline for generating structured financial forecasting datasets suitable for LLM fine-tuning.
- **Key Files**:
- `data_pipeline.py`: The orchestrator. It defines the flow: 1) Acquire data for symbols in a given index (DOW, EURO, CRYPTO) via `prepare_data_for_symbol`. 2) Generate prompts and query an external LLM (GPT-4) for forecasts/rationales via `query_gpt4`. 3) Transform the results into a final training dataset via `create_dataset`.
- `indices.py`: Simple file defining lists of stock/crypto symbols for different indices.
- `prompt.py`: Contains the logic for constructing the detailed, structured prompts used to query the external LLM for forecasting.
- **Implementation Details**: The pipeline is a strong example of using an LLM for data labeling and rationale generation. The `query_gpt4` function is the bottleneck, as it relies on an external, non-deterministic API call to enrich the raw financial data with LLM-generated forecasts and explanations, which are then used as the "output" for the fine-tuning dataset.
### Module 4: FinGPT_RAG/multisource_retrieval
- **Core Responsibility**: A comprehensive web scraping and data retrieval layer designed to gather real-time financial news from multiple sources, which serves as the knowledge base for the RAG system.
- **Key Files**:
- `news_scraper.py`: The main scraping logic. It uses `requests` and `BeautifulSoup` for static scraping and includes logic for handling various financial news sites (Seeking Alpha, Reuters, Bloomberg, Yahoo, CNBC, MarketWatch). It also contains a `select_column_and_classify` function, suggesting an interactive or GUI-driven workflow for data labeling.
- `scrapers/`: Sub-directory containing site-specific scraping implementations (e.g., `scrape_yahoo.py`, `scrape_cnbc.py`).
- `external_LLMs/`: Utilities for tokenization and interaction with external LLMs (e.g., ChatGPT, g4f).
- **Implementation Details**: The scraping logic is highly decentralized, with a central dispatcher (`scraping_by_url` in `news_scraper.py`) delegating to site-specific scrapers. This design is necessary due to the varied HTML structures of different news sites but makes the system fragile to website changes. The use of `similarity_score` attempts to filter for relevance before extracting the full article text.
### Module 5: FinGPT_Sentiment_Analysis_v3
- **Core Responsibility**: Provides the latest, optimized training pipeline for sentiment analysis models, focusing on efficiency and parallel processing.
- **Key Files**:
- `training_parallel/train_lora.py`: A specialized LoRA training script, similar to the benchmark one but with custom `ModifiedTrainer` and `data_collator` classes. The `ModifiedTrainer` overrides `compute_loss` and `prediction_step` to handle the specific input/output format of the sentiment task, and customizes `save_model` to only save the LoRA adapter weights. It is configured for DeepSpeed and parallel training.
- **Implementation Details**: The custom `ModifiedTrainer` is a key feature, allowing the project to bypass the standard Hugging Face Trainer's assumptions about loss calculation and model saving, which is often necessary when working with specialized models like ChatGLM or when only saving adapter weights. The `data_collator` handles padding and label masking specific to the sentiment fine-tuning task.
### Module PlantUML Diagrams
@startuml FinGPT_Benchmark
title FinGPT_Benchmark Module Class Diagram
package "HuggingFace/PEFT" {
class AutoModelForCausalLM
class AutoTokenizer
class TrainingArguments
class Trainer
class LoraConfig
class get_peft_model
}
package "Datasets" {
class Dataset
class concatenate_datasets
}
package "Benchmark Utilities" {
class Utils {
+ template_dict: Dict
+ lora_module_dict: Dict
+ get_prompt(template, instruction, input_text)
+ tokenize(args, tokenizer, feature)
+ parse_model_name(name, from_remote)
+ load_dataset(names, from_remote)
}
class TrainLoRA {
- main(args)
}
}
TrainLoRA ..> Utils : uses
TrainLoRA ..> AutoModelForCausalLM : loads
TrainLoRA ..> AutoTokenizer : loads
TrainLoRA ..> TrainingArguments : configures
TrainLoRA ..> Trainer : initializes
TrainLoRA ..> LoraConfig : configures
TrainLoRA ..> get_peft_model : applies
TrainLoRA ..> concatenate_datasets : combines
Utils ..> Dataset : loads
Utils ..> AutoTokenizer : uses in tokenize
@enduml
@startuml FinGPT_FinancialReportAnalysis_RAG
title FinGPT_FinancialReportAnalysis RAG Module Class Diagram
package "LangChain/Utils" {
class ChatPromptTemplate
class StrOutputParser
class RecursiveCharacterTextSplitter
}
package "Clustering/Reduction" {
class UMAP
class GaussianMixture
}
class Raptor {
- model: LLM
- embd: Embeddings
+ global_cluster_embeddings(embeddings, dim)
+ local_cluster_embeddings(embeddings, dim)
+ get_optimal_clusters(embeddings) : int
+ GMM_cluster(embeddings, threshold) : Tuple[labels, n_clusters]
+ perform_clustering(embeddings, dim, threshold) : List[np.ndarray]
+ embed(texts) : np.ndarray
+ embed_cluster_texts(texts) : DataFrame
+ fmt_txt(df) : str
+ embed_cluster_summarize_texts(texts, level) : Tuple[DataFrame, DataFrame]
+ recursive_embed_cluster_summarize(texts, level, n_levels) : Dict
+ text_spliter(text, chunk_size_tok, level, n_levels) : List[str]
}
Raptor ..> UMAP : uses for reduction
Raptor ..> GaussianMixture : uses for clustering
Raptor ..> ChatPromptTemplate : uses for summarization prompt
Raptor ..> StrOutputParser : uses for summarization output
Raptor ..> RecursiveCharacterTextSplitter : uses for initial chunking
Raptor "1" *-- "1" UMAP
Raptor "1" *-- "1" GaussianMixture
Raptor "1" *-- "1" ChatPromptTemplate
Raptor "1" *-- "1" StrOutputParser
Raptor "1" *-- "1" RecursiveCharacterTextSplitter
@enduml
@startuml FinGPT_Forecaster
title FinGPT_Forecaster Module Class Diagram
package "Data Components" {
class Indices {
+ DOW_30: List[str]
+ EURO_STOXX_50: List[str]
+ CRYPTO: List[str]
}
class Data {
+ prepare_data_for_symbol(symbol, data_dir, start_date, end_date, with_basics)
+ query_gpt4(index, data_dir, start_date, end_date, min_past_weeks, max_past_weeks, with_basics)
+ create_dataset(index, data_dir, start_date, end_date, train_ratio, with_basics)
}
class Prompt {
+ get_all_prompts(index, data_dir, start_date, end_date, min_past_weeks, max_past_weeks, with_basics)
}
class DataInferenceFetch {
+ get_curday()
+ fetch_all_data()
+ get_all_prompts_online()
}
}
class DataPipeline {
+ main(args)
}
DataPipeline ..> Indices : uses
DataPipeline ..> Data : uses
DataPipeline ..> Prompt : uses
DataPipeline ..> DataInferenceFetch : uses
@enduml
@startuml FinGPT_RAG_MultisourceRetrieval
title FinGPT_RAG Multisource Retrieval Module Class Diagram
package "Web Scraping Tools" {
class BeautifulSoup
class requests_get
class split_sentence
class similarity_score
}
package "Site Specific Scrapers" {
class ScrapeYahoo
class ScrapeCNBC
class ScrapeMarketScreener
class ScrapeGoogle
}
class NewsScraper {
+ scraping_by_url(link, subject) : Tuple[url, subject]
+ scrape_bloomberg(subject) : List[str]
+ scrape_reuters(subject) : Tuple[url, subject]
+ scrape_market_watch_article_page(url, subject) : Tuple[url, subject]
+ select_column_and_classify() : void
}
NewsScraper ..> BeautifulSoup : uses
NewsScraper ..> requests_get : uses
NewsScraper ..> split_sentence : uses
NewsScraper ..> similarity_score : uses
NewsScraper ..> ScrapeYahoo : delegates
NewsScraper ..> ScrapeCNBC : delegates
NewsScraper ..> ScrapeMarketScreener : delegates
NewsScraper ..> ScrapeGoogle : delegates
@enduml
@startuml FinGPT_Sentiment_Analysis_v3
title FinGPT_Sentiment_Analysis_v3 Training Module Class Diagram
package "HuggingFace/PEFT" {
class AutoModel
class AutoTokenizer
class TrainingArguments
class Trainer
class LoraConfig
class get_peft_model
}
class ModifiedTrainer extends Trainer {
+ compute_loss(model, inputs, return_outputs=False)
+ prediction_step(model, inputs, prediction_loss_only, ignore_keys)
+ save_model(output_dir)
}
class CastOutputToFloat {
+ forward(x)
}
class TrainLoRA {
+ main()
}
class DataCollator {
+ data_collator(features: list) : dict
}
TrainLoRA ..> AutoModel : loads
TrainLoRA ..> AutoTokenizer : loads
TrainLoRA ..> TrainingArguments : configures
TrainLoRA ..> ModifiedTrainer : initializes
TrainLoRA ..> LoraConfig : configures
TrainLoRA ..> get_peft_model : applies
ModifiedTrainer ..> DataCollator : uses (via trainer init)
TrainLoRA ..> DataCollator : uses
@enduml
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
The FinGPT project is built upon a **modular, LLM-centric, and data-driven design philosophy**, aiming to provide an accessible, open-source framework for financial LLMs. The core abstractions are centered around three main pillars: **Parameter-Efficient Fine-Tuning (PEFT)**, **Hierarchical Retrieval-Augmented Generation (RAG)**, and **End-to-End Data Pipelines**.
The **LoRA Adapter** is the central abstraction for the model layer. Instead of fine-tuning the entire large language model, the project utilizes LoRA (Low-Rank Adaptation) to inject a small number of trainable parameters into the base LLM (e.g., Llama2, ChatGLM2). This abstraction allows for efficient domain adaptation with minimal computational resources, making the project highly accessible. The `lora_module_dict` in `FinGPT_Benchmark/utils.py` explicitly manages which modules of different base models are targeted for adaptation, demonstrating a flexible approach to model heterogeneity.
The **Raptor (Recursive Abstractive Processing for Tree-Organized Retrieval)** system, implemented in `FinGPT_FinancialReportAnalysis/utils/rag.py`, is the key abstraction for handling large, unstructured financial documents. It abstracts the complex process of document chunking, embedding, dimensionality reduction (UMAP), optimal clustering (GMM/BIC), and recursive summarization into a single, hierarchical RAG index. This allows the LLM to retrieve context from multiple levels of abstraction (raw text, cluster summaries, meta-summaries), significantly improving the quality of grounded responses.
The **Data Pipeline** abstraction, exemplified by `FinGPT_Forecaster/data_pipeline.py`, manages the entire lifecycle of creating a structured dataset. This pipeline abstracts data acquisition, prompt engineering, external LLM querying (e.g., GPT-4 for labeling/rationales), and final dataset transformation into a sequential, reproducible process.
The project’s **lifecycle management** follows a clear sequence:
1. **Data Acquisition**: Raw financial data (news, reports) is gathered via the `multisource_retrieval` layer.
2. **Data Preparation**: Data is cleaned, structured, and transformed into domain-specific datasets (Forecasting, Sentiment) or hierarchical RAG indices (Raptor).
3. **Model Adaptation**: Base LLMs are fine-tuned using the LoRA Adapter via the `train_lora.py` scripts.
4. **Application**: The adapted FinLLM is deployed within application agents (Forecaster, Sentiment Classifier, RAG Query Engine) to serve end-user tasks.
#### 3.1.2. Component Interactions
The FinGPT architecture is characterized by a unidirectional, layered data flow, starting from external sources and culminating in the application layer.
**Data Flow:**
1. **External Sources** (Websites, APIs, PDFs) feed into the **Data Acquisition Layer** (`multisource_retrieval`).
2. The **Scraper/Retriever** component extracts raw text and links.
3. Raw text is routed to two main paths:
* **Structured Dataset Path**: Text is processed by `data_pipeline.py` (Forecaster) or similar scripts (Sentiment) to generate `instruction` and `output` pairs, often involving an external LLM (GPT-4) for initial labeling or rationale generation. This results in a Hugging Face `Dataset` object.
* **RAG Index Path**: Large documents are processed by the **Raptor** component (`rag.py`), which generates a multi-level index of summaries and embeddings.
4. The **Fine-Tuning Layer** (`train_lora.py`) consumes the structured `Dataset` and applies the LoRA Adapter to the **Base LLM**.
5. The resulting **FinLLM Core** (Base LLM + LoRA Adapter) is used by the **Application Agents** (RAG Query Engine, Forecaster Agent, Sentiment Classifier) for inference.
**Communication Patterns:**
* **Hugging Face Ecosystem**: The primary communication pattern for model training is the Hugging Face `Trainer` class, which manages the entire training loop, including data loading, optimization, and checkpointing. This is heavily integrated with the **PEFT** library for LoRA.
* **LangChain-Style Chains**: The RAG component in `rag.py` uses a functional chain pattern (`prompt | self.model | StrOutputParser()`) for summarization, a pattern popularized by LangChain, demonstrating a clear separation of prompt, model, and output parsing.
* **Inter-Module Python Calls**: Data flow within the pipelines (e.g., `data_pipeline.py` calling `indices.py`, `data.py`, and `prompt.py`) relies on standard Python function and class imports, maintaining a tightly coupled but clear execution sequence.
* **External API Calls**: The system communicates with external services for two main purposes: web scraping (`requests`, `BeautifulSoup` in `news_scraper.py`) and external LLM querying (e.g., `query_gpt4` in `data.py`, which is assumed to make an API call).
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml FinGPT_Overall_Architecture
title FinGPT Overall Architecture
skinparam componentStyle rectangle
package "1. Data Acquisition Layer" as DataAcquisition {
[Multisource Retrieval] as Scraper
[Data Fetchers] as Fetchers
[Financial News Sources] as Sources
Sources --> Scraper : Scrapes raw data
Scraper --> Fetchers : Provides raw data
}
package "2. Data Processing & Preparation" as DataProcessing {
[Forecaster Data Pipeline] as ForecasterDP
[Sentiment Data Preparation] as SentimentDP
[Document Chunking & Clustering] as Raptor
[Financial Documents (PDFs)] as Docs
Fetchers --> ForecasterDP : Structured data
Fetchers --> SentimentDP : Labeled data
Docs --> Raptor : Unstructured text
}
package "3. Model Fine-Tuning Layer" as FineTuning {
[Base LLM (e.g., Llama2)] as BaseLLM
[LoRA Adapter] as Adapter
[Training Scripts (DeepSpeed)] as Trainer
ForecasterDP --> Trainer : Forecasting Dataset
SentimentDP --> Trainer : Sentiment Dataset
Trainer --> Adapter : Fine-tunes weights
BaseLLM <--> Adapter : Loads adapter
}
package "4. Application & Inference Layer" as Application {
[FinLLM Core] as FinLLM
[RAG Query Engine] as RAGEngine
[Forecasting Agent] as ForecasterAgent
[Sentiment Classifier] as SentimentAgent
BaseLLM -[hidden]right-> Adapter
BaseLLM --> FinLLM : Core Model
Adapter --> FinLLM : Domain Knowledge
Raptor --> RAGEngine : Hierarchical Index
FinLLM --> RAGEngine : Contextual Generation
FinLLM --> ForecasterAgent : Prediction
FinLLM --> SentimentAgent : Classification
}
' Interactions
DataAcquisition --> DataProcessing : Raw Data Flow
DataProcessing --> FineTuning : Structured Datasets
DataProcessing --> Application : Knowledge Base (Raptor Index)
RAGEngine .> FinLLM : Queries for grounded response
ForecasterAgent .> FinLLM : Queries for prediction
SentimentAgent .> FinLLM : Queries for classification
[User/API] --> ForecasterAgent
[User/API] --> SentimentAgent
[User/API] --> RAGEngine
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
The FinGPT codebase employs several established software design patterns to manage complexity and promote modularity:
1. **Adapter Pattern (LoRA)**:
* **Description**: The LoRA mechanism acts as an adapter, allowing a new interface (domain-specific fine-tuning) to be used with an existing class (the frozen base LLM).
* **Implementation**: In `FinGPT_Benchmark/train_lora.py`, the `LoraConfig` and `get_peft_model` functions wrap the `AutoModelForCausalLM` instance, effectively adapting its behavior for financial tasks without modifying its massive original weights.
* **Code Example**:
```python
# FinGPT_Benchmark/train_lora.py
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
r=8,
lora_alpha=32,
target_modules=lora_module_dict[args.base_model], # The adaptation logic
# ...
)
model = get_peft_model(model, peft_config) # The adapter application
```
2. **Pipeline Pattern (Data Flow)**:
* **Description**: A sequence of processing steps where the output of one step becomes the input of the next.
* **Implementation**: The `main` function in `FinGPT_Forecaster/data_pipeline.py` clearly defines the pipeline stages: Acquire Data -> Generate Prompt/Query GPT-4 -> Transform to Training Format.
* **Code Example**:
```python
# FinGPT_Forecaster/data_pipeline.py (Simplified)
# 1. Acquire data
for symbol in tqdm(index):
prepare_data_for_symbol(symbol, data_dir, start_date, end_date, with_basics=with_basics)
# 2. Generate prompt and query GPT-4
query_gpt4(index, data_dir, start_date, end_date, min_past_weeks, max_past_weeks, with_basics=with_basics)
# 3. Transform into training format
dataset = create_dataset(index, data_dir, start_date, end_date, train_ratio, with_basics=with_basics)
```
3. **Strategy Pattern (Model Configuration)**:
* **Description**: Defines a family of algorithms, encapsulates each one, and makes them interchangeable.
* **Implementation**: The `lora_module_dict` in `FinGPT_Benchmark/utils.py` holds different strategies (target modules) for applying LoRA based on the specific base model architecture (e.g., `chatglm2` uses `query_key_value`, while `llama2` uses `q_proj`, `k_proj`, `v_proj`).
* **Code Example**:
```python
# FinGPT_Benchmark/utils.py
lora_module_dict = {
'chatglm2': ['query_key_value'],
'llama2': ['q_proj', 'k_proj', 'v_proj'],
# ...
}
# ...
target_modules=lora_module_dict[args.base_model],
```
4. **Composite Pattern (Raptor RAG)**:
* **Description**: Composes objects into tree structures to represent part-whole hierarchies.
* **Implementation**: The `recursive_embed_cluster_summarize` function in `rag.py` recursively processes summaries from one level as the "documents" for the next level, creating a hierarchical index where a cluster summary is a composite of its underlying document chunks.
#### 3.3.2. Project Highlights
The FinGPT project demonstrates several innovative features that enhance its utility and flexibility in the financial domain:
* **Hierarchical RAG with Raptor**: The most innovative feature is the **Raptor** RAG system. By combining **UMAP** (dimensionality reduction) and **Gaussian Mixture Models (GMM)** for clustering, it creates a multi-level index of document summaries. This allows the RAG engine to retrieve not just granular text chunks but also high-level conceptual summaries, leading to more coherent and contextually rich answers from the LLM.
* **Accessibility through PEFT**: The core focus on **LoRA-based fine-tuning** significantly lowers the barrier to entry for financial LLM development. It allows researchers and developers to adapt massive models to financial tasks using consumer-grade GPUs, promoting the open-source spirit of the project.
* **End-to-End Financial Forecasting Pipeline**: The `FinGPT_Forecaster` module provides a complete, runnable example of how to convert raw market data into a structured, LLM-ready dataset, including the crucial step of using an external LLM for generating rationales and labels. This is a highly valuable, innovative feature for quantitative finance.
* **Robust Multisource Data Retrieval**: The dedicated `multisource_retrieval` component, with its site-specific scrapers (Yahoo, CNBC, Bloomberg), ensures the LLM can be grounded in up-to-date, real-world financial news, which is critical for time-sensitive financial applications.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
While the project is robust, several areas could be improved to enhance performance, maintainability, and architectural clarity:
* **Standardization and Code Consolidation**:
* **Suggestion**: Consolidate the redundant `train_lora.py` and `utils.py` files found in multiple sub-projects (`FinGPT_Benchmark`, `FinGPT_Forecaster`, `FinGPT_Sentiment_Analysis_v3`).
* **Benefit**: Reduces code duplication, simplifies maintenance, and ensures a single source of truth for core utilities like `tokenize` and `load_dataset`.
* **External Dependency Abstraction**:
* **Suggestion**: Abstract the external LLM calls (e.g., `query_gpt4` in `data.py`) into a dedicated, configurable service layer (e.g., an `ExternalLLMService` class).
* **Benefit**: Decouples the data pipeline from specific LLM providers, making it easier to switch between GPT-4, Claude, or other models, and simplifies API key management.
* **RAG System Optimization**:
* **Suggestion**: The Raptor RAG system is computationally intensive due to UMAP and GMM clustering. Implement caching for the clustered embeddings and summaries, especially for static documents like financial reports.
* **Benefit**: Reduces processing time and cost for repeated queries or application restarts.
* **Web Scraping Robustness**:
* **Suggestion**: The `news_scraper.py` is highly dependent on HTML structure. Implement more resilient scraping techniques (e.g., using a general-purpose content extraction library) and add robust retry logic with exponential backoff to handle transient network errors and rate limits.
#### 3.4.2. Secondary Development Guide
For developers looking to explore or extend the FinGPT codebase, the following path is recommended:
1. **Initial Exploration (Fine-Tuning)**:
* Start by examining the **FinGPT_Benchmark** module. The `utils.py` file is essential for understanding model-specific configurations (LoRA targets) and data handling.
* Review `train_lora.py` to grasp the standard fine-tuning workflow using Hugging Face and LoRA. This is the template for all model adaptation tasks.
2. **Understanding Data Flow (Forecasting)**:
* The **FinGPT_Forecaster** module provides the clearest example of an end-to-end pipeline. Analyze `data_pipeline.py` to see how raw data is transformed into a structured dataset suitable for LLM training.
3. **Secondary Development - New Application Agent**:
* To create a new financial application (e.g., a Merger & Acquisition Agent), the best approach is to reuse the existing components:
* **Data**: Use the `multisource_retrieval` scrapers to gather M&A news.
* **Model**: Use the `FinGPT_Benchmark/train_lora.py` script to fine-tune a base LLM on a new M&A-specific dataset.
* **RAG**: If the task involves large documents (e.g., SEC filings), integrate the **Raptor** system from `FinGPT_FinancialReportAnalysis/utils/rag.py` to build the knowledge base.
4. **Contribution Focus**:
* Focus contributions on developing new, robust scrapers in the `multisource_retrieval/scrapers` directory or creating new, standardized financial datasets for the community.
* When adding new models, ensure the `lora_module_dict` in the core `utils.py` is updated with the correct target modules.
================================================
FILE: thirdparty/FinGenius.md
================================================
# FinGenius - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
The FinGenius project exhibits a clean, modular structure typical of a well-organized Python application, with a clear separation of concerns between the core framework, agents, environments, and external capabilities.
```
/home/ubuntu/FinGenius
├── config/ # Configuration files for LLM settings and MCP server endpoints.
│ ├── config.example.toml # Primary configuration for LLM, logging, and general settings.
│ └── mcp.example.json # Configuration for Model Context Protocol (MCP) server addresses.
├── docs/ # Documentation and visual assets (architecture diagrams, flow charts).
├── main.py # The application's entry point and primary orchestration script.
├── requirements.txt # Lists all Python dependencies (e.g., pydantic, akshare, loguru).
└── src/ # The core source code directory.
    ├── agent/ # **Core Module 1: Agent Definitions**
    │ ├── base.py # Defines BaseAgent, the abstract foundation for all agents.
    │ ├── react.py # Implements the ReAct (Reasoning and Acting) pattern.
    │ ├── mcp.py # Defines MCPAgent, integrating the Model Context Protocol.
    │ └── [specialized].py# Contains the concrete, domain-specific agents (e.g., chip_analysis.py).
    ├── environment/ # **Core Module 2: Execution Contexts**
    │ ├── base.py # Defines BaseEnvironment and the EnvironmentFactory.
    │ ├── research.py # Implements the Research Phase (data collection and analysis).
    │ └── battle.py # Implements the Battle Phase (adversarial debate and voting).
    ├── tool/ # **Core Module 3: External Capabilities**
    │ ├── base.py # Defines BaseTool and ToolCollection, the tool interface.
    │ ├── battle.py # The tool agents use to interact within the BattleEnvironment.
    │ ├── search/ # Contains various web search tools (Baidu, Google, DuckDuckGo).
    │ └── [specialized].py# Contains tools for financial data fetching (e.g., big_deal_analysis.py).
    ├── mcp/ # **Core Module 4: MCP Server Stubs**
    │ └── [server].py # Contains stubs for the specialized financial data servers (e.g., sentiment_server.py).
    ├── prompt/ # **Core Module 5: Agent Prompts**
    │ └── [agent_name].py # Stores the extensive system and next-step prompts for each agent.
    ├── schema.py # Pydantic models for data structures (Message, Memory, AgentState).
    ├── llm.py # Wrapper for LLM API calls.
    └── logger.py # Configuration for the loguru logging system.
```
The structure clearly separates the core framework (`src/`), configuration (`config/`), and entry point (`main.py`). The `src/` directory is further divided into functional modules: `agent` for the actors, `environment` for the stages, `tool` for the capabilities, and `prompt` for the agent's "mindset." This organization adheres to the principles of modular design and separation of concerns, which is essential for a complex multi-agent system.
### 1.2. Core Folders for Analysis
* `/home/ubuntu/FinGenius/src/agent`: Contains the definitions for all specialized AI agents, including the base classes (`BaseAgent`, `ReActAgent`, `ToolCallAgent`, `MCPAgent`) and the domain-specific agents (e.g., `ChipAnalysisAgent`, `HotMoneyAgent`).
* `/home/ubuntu/FinGenius/src/environment`: Defines the two core operational environments (`ResearchEnvironment`, `BattleEnvironment`) and their base class (`BaseEnvironment`), which manage agent execution and interaction flow.
* `/home/ubuntu/FinGenius/src/tool`: Houses the definitions for all external capabilities and internal actions available to the agents, such as data fetching tools (`BigDealAnalysisTool`) and interaction tools (`Battle`, `Terminate`).
* `/home/ubuntu/FinGenius/src/mcp`: Contains the logic for the Model Context Protocol (MCP) integration, including the client-side logic used by `MCPAgent` and the server-side stubs for the specialized financial data services.
* `/home/ubuntu/FinGenius/src/prompt`: Stores the extensive system and next-step prompt templates (in Python string format) used to guide the behavior and reasoning of the various agents.
* `/home/ubuntu/FinGenius/src`: Contains core utility files and foundational classes like `llm.py`, `logger.py`, `schema.py`, and the main entry point logic.
## Phase 2: Module-by-Module Deep Analysis
The FinGenius project is structured around five core Python modules, each serving a distinct purpose in the multi-agent system.
### 1. `src/agent` Module (The Actors)
This module defines the entire agent hierarchy, from the abstract base to the specialized financial experts.
* **Files Enumerated:** `base.py`, `react.py`, `toolcall.py`, `mcp.py`, `chip_analysis.py`, `big_deal_analysis.py`, `hot_money.py`, `risk_control.py`, `sentiment.py`, `technical_analysis.py`, `report.py`.
* **Core Responsibility:** To provide the foundational logic for agent execution, memory management, LLM interaction, and to define the specific roles and capabilities of each financial expert agent.
* **Key Implementation Details:**
* **`BaseAgent` (`base.py`):** Implements the main `run()` loop, state transitions (`AgentState`), and memory updates. It includes logic to detect and handle a "stuck state" (duplicate responses) by modifying the `next_step_prompt`.
* **`ReActAgent` (`react.py`):** Overrides `step()` to implement the **ReAct pattern**, parsing the LLM's response to determine if the next action is a `thought` or a `tool_call`.
* **`MCPAgent` (`mcp.py`):** The final base class, which integrates the `MCPClient` for specialized tool access. All domain agents inherit from this, ensuring they are "MCP-enabled."
* **Specialized Agents:** Agents like `ChipAnalysisAgent` and `BigDealAnalysisAgent` are simple, highly-configured classes. Their primary implementation is setting their unique `name`, `description`, `system_prompt`, and the specific `ToolCollection` they are allowed to use. This adheres to the **Strategy Pattern**.
### 2. `src/environment` Module (The Stage)
This module defines the execution contexts that govern agent interaction and the overall workflow.
* **Files Enumerated:** `base.py`, `research.py`, `battle.py`.
* **Core Responsibility:** To manage the lifecycle of agents, define the rules of engagement, and orchestrate the two-phase analysis process (Research and Battle).
* **Key Implementation Details:**
* **`BaseEnvironment` (`base.py`):** Provides the abstract interface and a factory (`EnvironmentFactory`) for creating environments. It manages the registration and retrieval of agents.
* **`ResearchEnvironment` (`research.py`):** Manages the initial data collection. Its `run()` method executes all specialized agents, typically in parallel, and aggregates their final reports into a single `research_results` dictionary.
* **`BattleEnvironment` (`battle.py`):** Implements the core innovation: the adversarial debate. It uses the **`BattleState`** class to track the debate history, agent order, and voting results. The `run()` method manages the multi-round debate, constructing a **cumulative context** (research results + previous speeches) for each agent before its turn. It acts as a **Mediator** for agent communication via the `Battle` tool.
### 3. `src/tool` Module (The Capabilities)
This module provides the external and internal actions available to the agents, serving as the interface between the LLM-driven logic and the external world.
* **Files Enumerated:** `base.py`, `terminate.py`, `tool_collection.py`, `battle.py`, `big_deal_analysis.py`, `chip_analysis.py`, `search/` (various web search tools).
* **Core Responsibility:** To define a standard interface (`BaseTool`) for all capabilities and to implement the logic for data fetching, web searching, and inter-agent communication.
* **Key Implementation Details:**
* **`BaseTool` (`base.py`):** An abstract class that defines the `name`, `description`, `parameters` (for LLM function calling), and the `async execute()` method. It also includes utility classes like `ToolResult` and `ToolFailure`.
* **`ToolCollection` (`tool_collection.py`):** A container class that holds all available tools for an agent, mapping tool names to instances and providing the list of tool schemas to the LLM.
* **`BigDealAnalysisTool` (`big_deal_analysis.py`):** A specialized tool that wraps the `akshare` library to fetch and process big order fund flow data, including a simple retry mechanism for unstable API calls.
* **`Battle` (`battle.py`):** A unique tool that allows agents to `speak` and `vote` within the `BattleEnvironment`, acting as the communication channel for the debate.
### 4. `src/mcp` Module (The Protocol Integration)
This module handles the Model Context Protocol (MCP) integration, which is key to accessing specialized financial data.
* **Files Enumerated:** `__init__.py`, `battle_server.py`, `big_deal_analysis_server.py`, `server.py`, etc.
* **Core Responsibility:** To define the server-side stubs for the specialized financial data services. These stubs are likely used in a separate deployment environment but are included here to define the protocol endpoints that the `MCPAgent`s are designed to call.
* **Key Implementation Details:** The files primarily contain `MCPServer` implementations (or stubs) for services like `sentiment_server` and `chip_analysis_server`, defining the expected input and output schemas for the financial data APIs.
### 5. `src/prompt` Module (The Agent Mindset)
This module contains the extensive, Chinese-language prompt templates that define the personality, role, and instructions for each agent.
* **Files Enumerated:** `battle.py`, `big_deal_analysis.py`, `chip_analysis.py`, `hot_money.py`, `risk_control.py`, `sentiment.py`, `technical_analysis.py`, etc.
* **Core Responsibility:** To provide the system prompts (`SYSTEM_PROMPT`) and next-step prompts (`NEXT_STEP_PROMPT_ZN`) that guide the LLM's behavior within the ReAct loop, ensuring the agents adhere to their specialized financial roles and the rules of the environment. The prompts are critical for the project's A-share market specialization.
### Module PlantUML Diagrams
## Agent Module PlantUML Diagram
```plantuml
@startuml
skinparam classAttributeIconSize 0
skinparam defaultFontName Monospaced
skinparam defaultFontSize 12
package "src.agent" {
abstract class BaseAgent {
+ name: str
+ memory: Memory
+ state: AgentState
+ run(request)
+ {abstract} step()
+ is_stuck()
}
abstract class ReActAgent {
+ step()
- _parse_llm_response()
}
abstract class ToolCallAgent {
+ available_tools: ToolCollection
+ step()
- _execute_tool(tool_call)
}
class MCPAgent {
+ mcp_client: MCPClient
}
class ChipAnalysisAgent
class BigDealAnalysisAgent
class HotMoneyAgent
class RiskControlAgent
class SentimentAgent
class TechnicalAnalysisAgent
class ReportAgent
BaseAgent <|-- ReActAgent
ReActAgent <|-- ToolCallAgent
ToolCallAgent <|-- MCPAgent
MCPAgent <|-- ChipAnalysisAgent
MCPAgent <|-- BigDealAnalysisAgent
MCPAgent <|-- HotMoneyAgent
MCPAgent <|-- RiskControlAgent
MCPAgent <|-- SentimentAgent
MCPAgent <|-- TechnicalAnalysisAgent
MCPAgent <|-- ReportAgent
BaseAgent ..> [src.schema.Memory] : uses
ToolCallAgent ..> [src.tool.ToolCollection] : manages
MCPAgent ..> [src.mcp.MCPClient] : uses
}
@enduml
```
## Environment Module PlantUML Diagram
```plantuml
@startuml
skinparam classAttributeIconSize 0
skinparam defaultFontName Monospaced
skinparam defaultFontSize 12
package "src.environment" {
abstract class BaseEnvironment {
+ name: str
+ agents: Dict[str, BaseAgent]
+ register_agent(agent)
+ {abstract} run()
}
class ResearchEnvironment {
+ run()
- _create_agents()
- _aggregate_results()
}
class BattleEnvironment {
+ battle_state: BattleState
+ run()
+ handle_speak(agent_id, speak)
+ handle_vote(agent_id, vote)
- _get_cumulative_context()
}
class BattleState {
+ agent_order: List[str]
+ debate_history: List[Dict]
+ final_votes: Dict[str, str]
+ _recalculate_vote_results()
}
class EnvironmentFactory {
+ {static} create_environment(type, agents)
}
BaseEnvironment <|-- ResearchEnvironment
BaseEnvironment <|-- BattleEnvironment
BattleEnvironment o-- BattleState : manages
BaseEnvironment ..> [src.agent.BaseAgent] : contains
EnvironmentFactory ..> BaseEnvironment : creates
}
@enduml
```
## Tool Module PlantUML Diagram
```plantuml
@startuml
skinparam classAttributeIconSize 0
skinparam defaultFontName Monospaced
skinparam defaultFontSize 12
package "src.tool" {
abstract class BaseTool {
+ name: str
+ description: str
+ parameters: Dict
+ {abstract} execute(**kwargs)
+ to_param()
}
class ToolResult {
+ output: Any
+ error: Optional[str]
}
class ToolCollection {
+ tools: Dict[str, BaseTool]
+ get_tool_schemas()
+ execute_tool(name, **kwargs)
}
class Terminate
class Battle {
+ agent_id: str
+ controller: BattleEnvironment
+ execute(speak, vote)
}
class BigDealAnalysisTool {
+ execute(stock_code)
- _safe_fetch(akshare_func)
}
class ChipAnalysisTool
class CreateChatCompletion
class WebSearchTool
BaseTool <|-- Terminate
BaseTool <|-- Battle
BaseTool <|-- BigDealAnalysisTool
BaseTool <|-- ChipAnalysisTool
BaseTool <|-- CreateChatCompletion
BaseTool <|-- WebSearchTool
ToolCollection o-- BaseTool : aggregates
BaseTool ..> ToolResult : returns
Battle ..> [src.environment.BattleEnvironment] : interacts with (controller)
}
@enduml
```
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
The FinGenius architecture is built upon a set of well-defined core abstractions that facilitate the multi-agent, dual-environment design.
**1. Agent Hierarchy (The Actors):**
The agent system follows a clear inheritance chain, embodying the **Strategy Pattern** and **Template Method Pattern**.
* **`BaseAgent` (`src/agent/base.py`):** The foundational abstract class. It provides core agent capabilities: state management (`AgentState`), memory (`Memory`), logging, and the main execution loop (`run()`). It enforces the abstract method `step()`, which is the single unit of work for any agent.
* **`ReActAgent` (`src/agent/react.py`):** Implements the **ReAct (Reasoning and Acting) pattern**. It extends `BaseAgent` by structuring the `step()` method to alternate between internal thought (reasoning) and external action (tool use).
* **`ToolCallAgent` (`src/agent/toolcall.py`):** Extends `ReActAgent` to manage and execute tools. It handles the parsing of LLM responses for function calls and the execution of the tools contained within the `ToolCollection`.
* **`MCPAgent` (`src/agent/mcp.py`):** The final, specialized base class. It extends `ToolCallAgent` to integrate the **Model Context Protocol (MCP)**, allowing agents to access specialized financial data servers via `MCPClient`. All domain-specific agents (e.g., `ChipAnalysisAgent`) inherit from this class.
**2. Environment Hierarchy (The Stage):**
The environments define the context and rules of interaction for the agents.
* **`BaseEnvironment` (`src/environment/base.py`):** The abstract base class for all environments. It manages a collection of agents (`self.agents`) and defines the abstract `run()` method. It also includes an `EnvironmentFactory` for creating specific environment types.
* **`ResearchEnvironment` (`src/environment/research.py`):** Implements the data collection and initial analysis phase. It is responsible for initializing the specialized agents and running them to gather their individual reports.
* **`BattleEnvironment` (`src/environment/battle.py`):** Implements the adversarial validation phase. It manages the structured debate, tracks the debate history, and records agent votes using the **`BattleState`** class. This environment acts as a **Mediator**, controlling the flow of communication between agents.
**3. Data and Utility Abstractions:**
* **`Memory` and `Message` (`src/schema.py`):** These Pydantic models define the structure for agent memory and communication. `Memory` stores a list of `Message` objects, which adhere to the OpenAI chat format (system, user, assistant, tool roles).
* **`BaseTool` and `ToolCollection` (`src/tool/base.py`):** `BaseTool` is the abstract interface for all external capabilities, enforcing the `execute()` method. `ToolCollection` is a container that maps tool names to `BaseTool` instances, simplifying tool management for agents.
* **`LLM` (`src/llm.py`):** A wrapper class for interacting with the Large Language Model API, centralizing LLM configuration and call logic.
The design philosophy is a modular, layered approach, separating the core agent logic, the interaction protocols (environments), and the external capabilities (tools). This separation of concerns ensures high extensibility, allowing new agents, tools, or even new debate formats to be introduced with minimal impact on the core framework. The use of Pydantic for data models enforces strict data validation and structure across the system.
#### 3.1.2. Component Interactions
The FinGenius system operates on a two-stage, sequential pipeline: **Research** followed by **Battle**. The entire process is orchestrated by `main.py`.
**1. Initialization and Research Phase (Data Collection & Analysis):**
* **`main.py`** acts as the orchestrator. It initializes the `EnvironmentFactory` to create the `ResearchEnvironment` and a team of specialized `MCPAgent`s (e.g., `ChipAnalysisAgent`, `HotMoneyAgent`).
* **`ResearchEnvironment.run()`** executes the agents, typically in parallel or a defined sequence.
* **`MCPAgent.run()`** initiates the agent's ReAct loop, calling `step()` repeatedly.
* **`ToolCallAgent.step()`** (inherited by `MCPAgent`) is the core of the interaction. It sends the current memory and prompt to the `LLM` to decide on the next action.
* **LLM** responds with a `tool_call` (e.g., `big_deal_analysis_tool`).
* **`ToolCallAgent`** executes the tool via the **`ToolCollection`**.
* **`BigDealAnalysisTool.execute()`** (a specialized `BaseTool`) uses external libraries like `akshare` to fetch real-time financial data. This is the primary external data flow.
* The tool returns a `ToolResult` (structured data) to the agent.
* The agent incorporates the tool result into its memory and continues the ReAct loop until it decides to `Terminate`.
* The `ResearchEnvironment` collects the final output from all agents into a comprehensive `research_results` dictionary.
**2. Battle Phase (Adversarial Validation & Decision):**
* **`main.py`** then initializes the `BattleEnvironment`, passing the `research_results` as context.
* **`BattleEnvironment.run()`** starts the multi-round debate, managed by the `BattleState`.
* Agents are instructed to speak and vote using the **`Battle`** tool.
* **`MCPAgent`** receives the full research context and the debate history (cumulative context) and uses the `Battle` tool to submit its argument (`speak`) and final decision (`vote`).
* **`Battle.execute()`** is handled by the `BattleEnvironment`'s controller, which records the speech in the `debate_history` and updates the `BattleState`'s `final_votes`.
* After a set number of rounds, the `BattleEnvironment` synthesizes the final conclusion based on the vote results (`vote_results` in `BattleState`).
**3. Final Reporting:**
* The final decision and report are passed back to `main.py`, which uses the `ReportAgent` (or a similar mechanism) to format the output into a structured HTML or JSON report for the user.
The communication pattern is primarily **sequential orchestration** (`main.py` -> Research -> Battle) with **internal parallel execution** (agents running concurrently in the `ResearchEnvironment`) and a **Mediator pattern** (`BattleEnvironment` managing agent interactions via the `Battle` tool).
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml
skinparam classAttributeIconSize 0
skinparam defaultFontName Monospaced
skinparam defaultFontSize 12
package "FinGenius" {
package "src" {
package "agent" {
abstract class BaseAgent
abstract class ReActAgent
abstract class ToolCallAgent
class MCPAgent
class ChipAnalysisAgent
class BigDealAnalysisAgent
class HotMoneyAgent
class RiskControlAgent
class SentimentAgent
class TechnicalAnalysisAgent
class ReportAgent
}
package "environment" {
abstract class BaseEnvironment
class ResearchEnvironment
class BattleEnvironment
class EnvironmentFactory
class BattleState
}
package "tool" {
abstract class BaseTool
class ToolCollection
class Terminate
class Battle
class BigDealAnalysisTool
class ChipAnalysisTool
class CreateChatCompletion
class FinancialDeepSearchTool
class WebSearchTool
}
package "mcp" {
class MCPClient
class MCPServer
}
package "core" {
class LLM
class Memory
class Message
class AgentState
}
[main.py]
}
}
' Inheritance
BaseAgent <|-- ReActAgent
ReActAgent <|-- ToolCallAgent
ToolCallAgent <|-- MCPAgent
MCPAgent <|-- ChipAnalysisAgent
MCPAgent <|-- BigDealAnalysisAgent
MCPAgent <|-- HotMoneyAgent
MCPAgent <|-- RiskControlAgent
MCPAgent <|-- SentimentAgent
MCPAgent <|-- TechnicalAnalysisAgent
MCPAgent <|-- ReportAgent
BaseEnvironment <|-- ResearchEnvironment
BaseEnvironment <|-- BattleEnvironment
' Dependencies
BaseAgent ..> LLM : uses
BaseAgent ..> Memory : uses
BaseAgent ..> AgentState : manages
MCPAgent ..> MCPClient : uses
ToolCallAgent ..> ToolCollection : manages
ToolCollection o-- BaseTool : aggregates
ResearchEnvironment o-- MCPAgent : contains (Research Team)
BattleEnvironment o-- MCPAgent : contains (Battle Team)
BattleEnvironment ..> BattleState : manages
BattleEnvironment ..> Battle : uses (Tool)
[main.py] ..> EnvironmentFactory : creates
[main.py] ..> ResearchEnvironment : runs
[main.py] ..> BattleEnvironment : runs
BaseTool <|-- Battle
BaseTool <|-- BigDealAnalysisTool
BaseTool <|-- ChipAnalysisTool
BaseTool <|-- Terminate
' Data Flow / Interaction
[main.py] --> ResearchEnvironment : Start Analysis
ResearchEnvironment --> MCPAgent : Execute Step
MCPAgent --> ToolCollection : Call Tool
ToolCollection --> BaseTool : Execute
ResearchEnvironment --> BattleEnvironment : Pass Results
BattleEnvironment --> MCPAgent : Debate Round
MCPAgent --> Battle : Speak/Vote
BattleEnvironment --> [main.py] : Final Report
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
The FinGenius project effectively utilizes several key design patterns to manage complexity, promote modularity, and implement the multi-agent logic.
**1. Chain of Responsibility / Template Method Pattern (Agent Hierarchy):**
The agent structure is a classic example of the **Template Method Pattern** implemented via a **Chain of Responsibility**.
* **Implementation:** The inheritance chain `BaseAgent` -> `ReActAgent` -> `ToolCallAgent` -> `MCPAgent` defines a fixed sequence of responsibilities. `BaseAgent` handles the execution loop, `ReActAgent` injects the reasoning/acting logic, and `ToolCallAgent` adds tool execution. The abstract `step()` method in `BaseAgent` is the template method that is refined at each level.
* **Example:** `MCPAgent`'s `step()` method calls `ToolCallAgent`'s logic, which in turn relies on `ReActAgent`'s logic to decide whether to reason or call a tool.
**2. Strategy Pattern (Specialized Agents):**
The domain-specific agents (e.g., `ChipAnalysisAgent`, `HotMoneyAgent`) are concrete strategies that implement the agent interface defined by `MCPAgent`.
* **Implementation:** Each specialized agent is configured with a unique `system_prompt` and a specific `ToolCollection` containing only the tools relevant to its domain (e.g., `ChipAnalysisAgent` gets `ChipAnalysisTool`).
* **Example:** The difference between a `RiskControlAgent` and a `SentimentAgent` is primarily their system prompt (strategy) and the set of tools they are allowed to use (capabilities).
**3. Mediator Pattern (BattleEnvironment):**
The `BattleEnvironment` acts as a mediator, controlling the interactions between the agents during the debate phase.
* **Implementation:** Agents do not communicate directly. Instead, they use the **`Battle`** tool, which routes their `speak` and `vote` actions to the `BattleEnvironment`'s controller. The environment then updates the shared `BattleState` and broadcasts the new context to the next agent.
* **Example:** When an agent calls `battle(speak="...", vote="bullish")`, the `BattleEnvironment` processes this, records it in `debate_history`, and then constructs the cumulative context for the next agent, ensuring controlled, structured communication.
**4. Factory Method Pattern (EnvironmentFactory):**
The `EnvironmentFactory` is responsible for creating and initializing the correct environment type (`ResearchEnvironment` or `BattleEnvironment`) based on an input parameter.
* **Implementation:** The static method `EnvironmentFactory.create_environment(environment_type, ...)` encapsulates the logic for instantiating the correct environment class and registering the necessary agents. This decouples the client (`main.py`) from the concrete environment classes.
**5. Adapter Pattern (BaseTool and ToolCollection):**
The `BaseTool` and `ToolCollection` serve as an adapter layer to integrate external capabilities (like `akshare` or the `Battle` mechanism) into the LLM's function-calling interface.
* **Implementation:** `BaseTool.to_param()` converts the Python class definition into the required JSON schema for the LLM. The `execute()` method then adapts the LLM's call into the actual Python function logic.
| Pattern | Component | Role in FinGenius |
| :--- | :--- | :--- |
| **Template Method** | `BaseAgent` | Defines the skeleton of the agent's execution loop (`run`, `step`). |
| **Strategy** | Specialized Agents | Each agent is a strategy with a unique prompt and toolset for a specific financial domain. |
| **Mediator** | `BattleEnvironment` | Controls and structures the communication and debate flow between agents. |
| **Factory Method** | `EnvironmentFactory` | Centralizes the creation and initialization of `Research` and `Battle` environments. |
| **Adapter** | `BaseTool` / `ToolCollection` | Adapts external functions and internal logic for the LLM's function-calling interface. |
#### 3.3.2. Project Highlights
The FinGenius project stands out due to its innovative approach to financial analysis, leveraging a sophisticated multi-agent architecture tailored for the Chinese A-share market.
* **Research–Battle Dual-Environment Architecture:** This is the core innovation. The system separates the process into two distinct phases: the **Research Environment** for parallel, specialized data collection and analysis, and the **Battle Environment** for adversarial validation. This dual structure ensures that the final conclusion is not just a summary of individual findings but a synthesis derived from a structured, competitive debate, significantly reducing the risk of LLM "hallucination."
* **A-Share Market Specialization and Localization:** The project is explicitly designed to overcome the "acclimatization problem" (水土不服) that general-purpose AI exhibits in the Chinese financial context. This is achieved through:
* **Specialized Agents:** Agents like the **Hot Money Agent (游资agent)** and **Chip Agent (筹码agent)** are based on unique A-share market concepts (e.g., Dragon and Tiger Lists, chip distribution).
* **Localized Tools:** Integration with Chinese financial data APIs like `akshare` and localized search tools (Baidu search) ensures relevance and accuracy.
* **Chinese Prompts:** The use of extensive, high-quality Chinese system prompts in `src/prompt` ensures the LLM's reasoning is grounded in the correct market terminology and context.
* **Cumulative Debate Mechanism:** The `BattleEnvironment` implements a sophisticated debate structure where each agent's argument is informed by the full research context and the speeches of all preceding agents in the current round. This **cumulative context** fosters a deeper, more context-aware discussion, simulating a real-world, progressive analysis process.
* **Modular and Extensible Design:** The clear separation of concerns using the **Agent-Environment-Tool** architecture (Strategy and Factory patterns) makes the system highly extensible. Adding a new financial expert (Agent) or a new data source (Tool) requires minimal changes to the core framework, primarily involving configuration and inheritance.
* **Robust State and Memory Management:** The use of Pydantic models for `Message`, `Memory`, and `BattleState` enforces strict data structure and validation. The `BaseAgent`'s built-in logic to detect and handle "stuck states" (duplicate responses) enhances the robustness of the autonomous execution loop.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
The FinGenius project is architecturally sound, but several areas can be optimized for performance, robustness, and maintainability.
**1. Performance and Robustness:**
* **Asynchronous Data Fetching and Caching:** The current tool implementations, particularly those relying on external APIs like `akshare` (e.g., `BigDealAnalysisTool`), appear to use synchronous calls within an `async` framework. While the `execute` method is `async`, the internal `_with_retry` and `_safe_fetch` functions use `time.sleep()`, which blocks the event loop.
* **Suggestion:** Refactor all external API calls to use `aiohttp` or an asynchronous wrapper for `akshare` to prevent blocking the main event loop, significantly improving concurrency in the `ResearchEnvironment`. Implement a time-to-live (TTL) cache (e.g., using Redis) for frequently requested financial data to reduce redundant API calls and improve speed.
* **Tool Execution Timeout:** The `ToolCallAgent` should implement a strict timeout mechanism for tool execution to prevent a single unresponsive tool from stalling the entire agent's `run()` loop.
**2. Architecture Optimization:**
* **Dynamic Tool Registration:** The `ToolCollection` is currently a static container. For a highly extensible system, consider implementing a dynamic tool discovery mechanism (e.g., using Python entry points or a configuration file) that automatically loads tools into the `ToolCollection` based on the agent's configuration, rather than requiring manual import and instantiation in each agent file.
* **Environment State Management:** The `BattleState` is a large Pydantic model. While effective, for long-running debates, consider offloading the `battle_history` and `debate_history` to a persistent store (e.g., a database) to reduce memory footprint and enable recovery from crashes.
**3. Code Quality and Maintainability:**
* **Prompt Management Refinement:** The system prompts are stored as large Python string variables in `src/prompt/*.py`. This is difficult to manage and version control.
* **Suggestion:** Consolidate prompts into a structured format (e.g., YAML or JSON files) or use a dedicated prompt management library. This would allow for easier localization, versioning, and separation of prompt content from Python logic.
* **Type Hinting Consistency:** While Pydantic is used extensively, the use of `Any` in critical areas (e.g., `controller: Optional[Any]` in `Battle` tool) reduces type safety. Replace `Any` with specific protocol classes or forward references to improve static analysis and code clarity.
* **Error Handling in Tools:** The `_safe_fetch` function in `BigDealAnalysisTool` returns `None` on failure. While safe, this can lead to silent failures.
* **Suggestion:** Tools should return a `ToolFailure` object with a detailed error message, allowing the agent's ReAct loop to explicitly reason about the failure and attempt a recovery strategy, rather than simply receiving `None` data.
#### 3.4.2. Secondary Development Guide
The FinGenius project is highly modular, making secondary development straightforward by focusing on the three core components: **Agents**, **Tools**, and **Environments**.
### 1. Code Exploration Path
To understand the system flow, follow this path:
1. **Entry Point:** Start with `main.py` to see the high-level orchestration: environment creation, sequential execution of Research and Battle phases, and final report generation.
2. **Environment Flow:** Examine `src/environment/research.py` and `src/environment/battle.py` to understand the rules and data flow for each phase.
3. **Agent Logic:** Study the agent hierarchy in `src/agent/base.py` and `src/agent/toolcall.py` to grasp the ReAct loop and tool-calling mechanism.
4. **Capabilities:** Review `src/tool/base.py` and the specific tool implementations (e.g., `src/tool/big_deal_analysis.py`) to see how external data is fetched and processed.
### 2. Adding a New Specialized Agent
To introduce a new financial expert (e.g., a "Policy Agent"):
1. **Define the Agent:** Create a new file (e.g., `src/agent/policy.py`) inheriting from `MCPAgent`.
```python
class PolicyAgent(MCPAgent):
name: str = "policy_agent"
description: str = "分析宏观政策和行业监管变动。"
system_prompt: str = POLICY_SYSTEM_PROMPT # Define this prompt
available_tools: ToolCollection = Field(
default_factory=lambda: ToolCollection(PolicyTool(), Terminate())
)
```
2. **Create Necessary Tools:** If the agent needs new capabilities, create a `BaseTool` implementation (e.g., `PolicyTool`) in `src/tool/`.
3. **Register the Agent:** Modify `src/environment/research.py`'s `_create_agents` method to instantiate and include the new `PolicyAgent` in the research team.
### 3. Adding a New Tool (External Capability)
To integrate a new data source or function:
1. **Define the Tool:** Create a new file (e.g., `src/tool/new_data_source.py`) inheriting from `BaseTool`.
2. **Implement Execution:** Implement the `async def execute(...)` method, which contains the logic for interacting with the external service (e.g., a new financial API).
3. **Update Agent Toolset:** Add the new tool to the `ToolCollection` of the relevant specialized agent(s) in `src/agent/`.
### 4. Configuration
* **LLM Configuration:** Modify `config/config.example.toml` to change the LLM model, API key, and other parameters.
* **MCP Configuration:** Adjust `config/mcp.example.json` to configure the endpoints for the specialized financial data servers that the `MCPAgent`s connect to.
By adhering to the established agent hierarchy and the Tool/Environment separation, new features can be added with high confidence and minimal side effects.
================================================
FILE: thirdparty/FinRL-Meta.md
================================================
# FinRL-Meta - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
The project structure is highly modular, with the core logic encapsulated within the `meta/` directory. This design facilitates clear separation of concerns between data handling, environment simulation, and agent implementation.
```
FinRL-Meta/
├── meta/ # Core library source code for FinRL-Meta framework.
│ ├── agents/ # DRL Agent implementations and wrappers for various DRL libraries (ElegantRL, RLLib, Stable-Baselines3).
│ ├── config.py # Global configuration constants, including ticker lists, time zones, and API key placeholders.
│ ├── data_processor.py # The Facade class that orchestrates the entire data pipeline, selecting and running the appropriate data source processor.
│ ├── data_processors/ # Module containing concrete implementations for fetching and cleaning data from different financial APIs.
│ │ ├── _base.py # Abstract base class defining the common interface for all data processors (Strategy Pattern).
│ │ ├── yahoofinance.py # Implementation for fetching data from Yahoo Finance.
│ │ ├── binance.py # Implementation for fetching data from Binance.
│ │ └── ... # Other data source implementations (Alpaca, Tushare, etc.).
│ ├── env_crypto_trading/ # Module for cryptocurrency trading environments.
│ │ ├── env_multiple_crypto.py # Multi-asset cryptocurrency trading environment, adhering to the OpenAI Gym interface.
│ │ ├── env_btc_ccxt.py # Single-asset Bitcoin trading environment.
│ │ └── alpaca_paper_trade_multicrypto.py # Interface for live/paper trading execution using the Alpaca API.
│ └── env_execution_optimizing/ # Module for specialized execution optimization problems.
│ └── liquidation/ # Sub-module for the optimal liquidation problem.
│ ├── env_execution_optimizing.py # Market environment based on the Almgren-Chriss model.
│ ├── ddpg_agent.py # Implementation of the DDPG agent for continuous control.
│ └── model.py # Neural network definitions (Actor and Critic) for the DDPG agent.
├── README.md # Project documentation and usage examples.
├── setup.py # Python package setup file.
└── ... # Non-core files (e.g., examples, notebooks, docs).
```
The structure clearly delineates the **Data Layer** (`data_processors/`), the **Environment Layer** (`env_crypto_trading/`, `env_execution_optimizing/`), and the **Agent Layer** (`agents/`, `liquidation/ddpg_agent.py`), supporting the project's modular design philosophy. The use of a central `data_processor.py` and `config.py` provides global control and configuration points. The separation of environments into distinct domains (crypto trading vs. execution optimizing) allows for specialized modeling of market dynamics.
### 1.2. Core Folders for Analysis
* `/home/ubuntu/FinnewsHunter/thirdparty/FinRL-Meta/meta/data_processors`: Contains the core logic for fetching, cleaning, and transforming financial market data from various sources.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinRL-Meta/meta/env_crypto_trading`: Contains the reinforcement learning environments and live trading interfaces for cryptocurrency portfolio management.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinRL-Meta/meta/env_execution_optimizing/liquidation`: Contains the specialized environment and DRL agent implementation for the optimal trade execution (liquidation) problem.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinRL-Meta/meta/agents`: Contains the wrappers and base classes for integrating various external DRL libraries.
## Phase 2: Module-by-Module Deep Analysis
## Module Analysis
### 1. Module: `meta/agents`
* **Files Enumerated**: `elegantrl_models.py`, `rllib_models.py`, `stablebaselines3_models.py`.
* **Module Core Responsibility**: To provide a standardized interface and wrappers for integrating various external Deep Reinforcement Learning (DRL) libraries (ElegantRL, RLLib, Stable-Baselines3) with the FinRL-Meta environments. This module abstracts the library-specific agent creation and training logic.
* **Key File Identification**:
* `stablebaselines3_models.py`: Contains the `DRLAgent` class, which acts as a wrapper for Stable-Baselines3 algorithms (e.g., A2C, PPO, DDPG). It handles the creation of the agent, training, and testing, providing a unified API for the main workflow.
* `elegantrl_models.py`: Provides similar wrappers for ElegantRL agents.
* `rllib_models.py`: Provides wrappers for RLLib agents.
* **Core Implementation**: The `DRLAgent` classes typically take an environment, a model name, and hyperparameters. They encapsulate the boilerplate code for agent initialization, model saving/loading, and the training loop (`train_model`, `get_model`).
* **Dependencies**: Depends heavily on external DRL libraries (Stable-Baselines3, ElegantRL, RLLib) and the custom environments defined in the `meta/env_...` modules.
### 2. Module: `meta/data_processors`
* **Files Enumerated**: `_base.py`, `alpaca.py`, `binance.py`, `ccxt.py`, `tushare.py`, `yahoofinance.py`.
* **Module Core Responsibility**: To provide concrete implementations for fetching, cleaning, and transforming raw financial data from various sources into a standardized format (Pandas DataFrame) and ultimately into NumPy arrays for the RL environments.
* **Key File Identification**:
* `_base.py`: Defines the abstract base class `_Base`, which outlines the common interface (`download_data`, `clean_data`, `add_technical_indicator`, `df_to_array`) that all concrete processors must implement. This is the core of the Strategy Pattern.
* `yahoofinance.py`: Implements data fetching using the `yfinance` library, including specific logic for price adjustment and handling time intervals.
* `binance.py`: Implements data fetching from the Binance exchange, handling specific API calls and data aggregation logic.
* **Core Implementation**: The `download_data` methods handle API interaction. The `clean_data` methods are crucial for filling missing values and ensuring data integrity. The `df_to_array` method transforms the final DataFrame into the required NumPy arrays (`price_array`, `tech_array`, `turbulence_array`) for the RL environment.
* **Dependencies**: Depends on external data libraries (`yfinance`, `ccxt`, `tushare`, `alpaca_trade_api`) and common data science libraries (`pandas`, `numpy`).
### 3. Module: `meta/env_crypto_trading`
* **Files Enumerated**: `alpaca_paper_trade_multicrypto.py`, `create_crypto_env.py`, `env_btc_ccxt.py`, `env_multiple_crypto.py`.
* **Module Core Responsibility**: To define the simulation environments for cryptocurrency trading and provide an interface for live/paper trading execution.
* **Key File Identification**:
* `env_multiple_crypto.py`: Defines the `CryptoEnv` class, the primary multi-asset RL environment. It implements the core `reset()` and `step()` methods, managing the portfolio state (cash, stocks) and calculating the reward based on asset value change.
* `alpaca_paper_trade_multicrypto.py`: Defines `AlpacaPaperTradingMultiCrypto`, which acts as the execution layer. It loads a trained DRL policy, fetches real-time data, infers an action, and executes trades via the Alpaca API.
* **Core Implementation**: The `CryptoEnv.step()` method contains the critical trading logic: action normalization (to handle large price differences), transaction cost calculation, and portfolio update. The state is constructed by stacking normalized cash, stocks, and a lookback window of technical indicators.
* **Dependencies**: Depends on the `meta/data_processors` for data, and external libraries like `gym`, `numpy`, `pandas`, and `alpaca_trade_api`.
### 4. Module: `meta/env_execution_optimizing/liquidation`
* **Files Enumerated**: `ddpg_agent.py`, `env_execution_optimizing.py`, `model.py`, `utils.py`.
* **Module Core Responsibility**: To provide a specialized environment and DRL agent for the optimal trade execution problem, specifically the Almgren-Chriss liquidation model.
* **Key File Identification**:
* `env_execution_optimizing.py`: Defines `MarketEnvironment`, which models the stock price dynamics under market impact (permanent and temporary) and calculates the reward based on the Almgren-Chriss utility function.
* `ddpg_agent.py`: Defines the `Agent` class, a standard implementation of the DDPG algorithm, including `Actor` and `Critic` networks, `ReplayBuffer`, and `OU_Noise`.
* **Core Implementation**: The `MarketEnvironment.step()` method is the core, implementing the price evolution and market impact equations. The DDPG `Agent.learn()` method implements the standard DDPG update rules for the Actor and Critic networks.
* **Dependencies**: Depends on `numpy`, `torch`, and standard DRL components.
### Module PlantUML Diagrams
@startuml
title Agents Module (Stable-Baselines3)
abstract class BaseCallback {
+ _on_step()
}
class TensorboardCallback {
+ _on_step(): bool
}
class DRLAgent {
+ __init__(env)
+ get_model(model_name, policy, policy_kwargs, model_kwargs, verbose, seed)
+ train_model(model, tb_log_name, total_timesteps)
+ DRL_prediction(model, environment)
+ DRL_prediction_load_from_file(model_name, environment, cwd)
}
class DRLEnsembleAgent {
+ __init__(df, train_period, ...)
+ get_model(model_name, env, ...)
+ train_model(model, model_name, tb_log_name, iter_num, total_timesteps)
+ get_validation_sharpe(iteration, model_name)
+ DRL_validation(model, test_data, test_env, test_obs)
+ DRL_prediction(model, name, last_state, iter_num, ...)
+ run_ensemble_strategy(A2C_model_kwargs, PPO_model_kwargs, DDPG_model_kwargs, timesteps_dict)
}
TensorboardCallback --|> BaseCallback
DRLAgent ..> MODELS : uses
DRLEnsembleAgent ..> MODELS : uses
DRLEnsembleAgent ..> DRLAgent : uses methods
note right of DRLAgent::get_model
Initializes SB3 model (A2C, PPO, DDPG, SAC, TD3)
Handles action noise configuration
end note
note right of DRLEnsembleAgent::run_ensemble_strategy
Core logic for rolling-window training
and model selection based on Sharpe ratio
end note
@enduml
@startuml
skinparam classAttributeIconVisible true
package "Data Processors" {
enum DataSource {
akshare
alpaca
alphavantage
baostock
binance
ccxt
iexcloud
joinquant
quandl
quantconnect
ricequant
tushare
wrds
yahoofinance
}
abstract class _Base {
+ data_source: str
+ start_date: str
+ end_date: str
+ time_interval: str
+ dataframe: pd.DataFrame
--
+ download_data(ticker_list: List[str])
+ clean_data()
+ fillna()
+ add_technical_indicator(tech_indicator_list: List[str])
+ add_turbulence()
+ calculate_turbulence(): pd.DataFrame
+ add_vix()
+ df_to_array(tech_indicator_list: List[str], if_vix: bool)
+ calc_nonstandard_time_interval(): str
+ transfer_standard_ticker_to_nonstandard(ticker: str): str
+ save_data(path)
+ load_data(path)
}
class DataProcessor {
- processor: _Base
+ data_source: DataSource
+ start_date: str
+ end_date: str
+ time_interval: str
+ dataframe: pd.DataFrame
--
+ __init__(data_source: DataSource, ...)
+ download_data(ticker_list)
+ clean_data()
+ add_technical_indicator(tech_indicator_list: List[str])
+ add_turbulence()
+ add_vix()
+ df_to_array(if_vix: bool): np.array
+ data_split(df, start, end)
+ fillna()
+ run(ticker_list: str, technical_indicator_list: List[str], if_vix: bool)
}
class Yahoofinance {
+ download_data(ticker_list: List[str])
}
class Alpaca {
+ api: tradeapi.REST
+ download_data(ticker_list)
+ clean_data()
+ get_trading_days(start, end)
}
class Binance {
+ download_data(ticker_list: List[str])
+ dataframe_with_limit(symbol)
+ fetch_n_combine(startDate, endDate, tickers)
}
class Tushare {
+ token: str
+ adj: str
+ download_data(ticker_list: List[str])
}
DataProcessor o-- _Base : delegates
_Base <|-- Yahoofinance
_Base <|-- Alpaca
_Base <|-- Binance
_Base <|-- Tushare
DataProcessor o-- DataSource : uses
}
@enduml
@startuml
skinparam classAttributeIconVisible true
package "RL Environments (meta.envs)" {
package "Crypto Trading" {
class CryptoEnv {
+ lookback: int
+ initial_cash: float
+ buy_cost_pct: float
+ sell_cost_pct: float
+ price_array: np.ndarray
+ tech_array: np.ndarray
+ stocks: np.ndarray
--
+ __init__(config, lookback, initial_capital, ...)
+ reset(): np.ndarray
+ step(actions): (np.ndarray, float, bool, None)
+ get_state(): np.ndarray
- _generate_action_normalizer()
}
class BitcoinEnv {
+ stock_dim: int = 1
+ initial_account: float
+ transaction_fee_percent: float
--
+ __init__(...)
+ reset(): np.ndarray
+ step(action): (np.ndarray, float, bool, None)
+ draw_cumulative_return(...)
- load_data(...)
}
class AlpacaPaperTradingMultiCrypto {
- alpaca: tradeapi.REST
- act: AgentPPO.act
- CCTX_time_interval: str
- time_interval: int
- stocks: np.ndarray
- cash: float
--
+ __init__(...)
+ run()
+ trade()
+ get_state()
+ submitOrder(qty, stock, side, resp)
}
class create_crypto_env {
+ create_train_env(...)
+ create_test_env(...)
}
CryptoEnv <.. create_crypto_env : creates
BitcoinEnv .up.|> CryptoEnv : specialized single-asset env (conceptual)
AlpacaPaperTradingMultiCrypto ..> CryptoEnv : uses concepts (state/action space)
AlpacaPaperTradingMultiCrypto ..> meta.data_processors.Ccxt : data source
AlpacaPaperTradingMultiCrypto ..> elegantrl.agent.AgentPPO : loads agent
}
package "Execution Optimizing" {
class Agent << (A, #FF7700) DDPG Agent >> {
+ state_size: int
+ action_size: int
- actor_local: Actor
- critic_local: Critic
- noise: OUNoise
- memory: ReplayBuffer
--
+ __init__(state_size, action_size, random_seed)
+ step(state, action, reward, next_state, done)
+ act(state, add_noise=True)
+ learn(experiences, gamma)
+ soft_update(local_model, target_model, tau)
}
class OUNoise {
- mu: np.ndarray
- theta: float
- sigma: float
--
+ __init__(size, seed, mu, theta, sigma)
+ reset()
+ sample()
}
class ReplayBuffer {
- memory: deque
- experience: namedtuple
--
+ __init__(action_size, buffer_size, batch_size, seed)
+ add(state, action, reward, next_state, done)
+ sample()
}
Agent *-- OUNoise : uses
Agent *-- ReplayBuffer : uses
Agent ..> Actor : trains
Agent ..> Critic : trains
}
}
@enduml
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
## Core Abstractions, Design Philosophy, and Lifecycle Management
The FinRL-Meta project is built upon a highly modular and layered architecture, primarily following the **Facade** and **Strategy** design patterns to achieve flexibility and extensibility. The core abstractions revolve around three main components: Data, Environment, and Agent.
### 1. Data Abstraction
The data layer abstracts the complex process of connecting to various financial data sources (e.g., Yahoo Finance, Binance, Alpaca) into a unified interface.
* **`DataSource` Enum**: This is the fundamental abstraction, listing all supported data providers (`akshare`, `alpaca`, `yahoofinance`, etc.).
* **`_Base` Class**: An abstract base class (`meta/data_processors/_base.py`) that defines the common interface for all concrete data processors. It includes core methods like `download_data()`, `clean_data()`, `add_technical_indicator()`, and `df_to_array()`. This enforces a standard contract across all data sources.
* **`DataProcessor` Class**: This acts as a **Facade** (`meta/data_processor.py`). It takes a `DataSource` enum in its constructor and dynamically instantiates the corresponding concrete processor (e.g., `Yahoofinance`, `Binance`). Its `run()` method orchestrates the entire data pipeline: download, clean, add indicators, and transform the data into NumPy arrays (`price_array`, `tech_array`, `turbulence_array`) suitable for the RL environment.
### 2. Environment Abstraction
The environment layer provides a standard interface for the Deep Reinforcement Learning (DRL) agents, adhering to the OpenAI Gym standard (`reset`, `step`).
* **`CryptoEnv` / `BitcoinEnv`**: These classes (`meta/env_crypto_trading/env_multiple_crypto.py`, `meta/env_crypto_trading/env_btc_ccxt.py`) abstract the trading logic, portfolio management, and reward calculation. They manage the state space (cash, holdings, technical indicators) and the action space (buy/sell/hold).
* **State Representation**: The state is a flattened NumPy array, typically a concatenation of normalized cash, normalized stock holdings, and a lookback window of normalized technical indicators. This design choice simplifies the state space for DRL algorithms.
### 3. Agent and Execution Abstraction
The agent layer is designed to be decoupled from the core framework, allowing for easy integration of external DRL libraries (e.g., ElegantRL).
* **`DDPG_Agent`**: A concrete implementation of a DRL agent, demonstrating the use of the **Actor-Critic** architecture for continuous action spaces. It uses helper classes like `ReplayBuffer` and `OUNoise`.
* **`AlpacaPaperTradingMultiCrypto`**: This class in the execution layer acts as a bridge between the trained DRL policy and a live trading API (Alpaca). It handles the real-time data fetching, state construction, policy inference, and order submission, managing the entire **live trading lifecycle**.
### Lifecycle Management
The typical lifecycle involves:
1. **Initialization**: `DataProcessor` is initialized with a `DataSource` and time parameters.
2. **Data Preparation**: `DataProcessor.run()` fetches and processes historical data, outputting NumPy arrays.
3. **Environment Setup**: An environment (`CryptoEnv`) is instantiated with the processed data arrays.
4. **Training/Testing**: A DRL agent interacts with the environment using `reset()` and `step()` methods.
5. **Deployment (Live Trading)**: The trained agent's policy is loaded into an execution class (`AlpacaPaperTradingMultiCrypto`), which runs a continuous loop to fetch real-time data, generate actions, and execute trades. The `run()` method in this class manages the continuous trading loop.
#### 3.1.2. Component Interactions
## Component Interactions, Data Flow, and Communication Patterns
The FinRL-Meta architecture is characterized by a clear separation of concerns, with data flowing sequentially from the Data Layer to the Environment Layer, and control/action signals flowing between the Environment and the Agent Layer.
### 1. Data Flow (Offline/Training Phase)
The primary data flow during the offline training phase is a one-way pipeline from the data source to the reinforcement learning environment.
| Source Component | Target Component | Data Format | Communication Pattern | Description |
| :--- | :--- | :--- | :--- | :--- |
| **Data Processor** | **RL Environment** | NumPy Arrays | Synchronous Call | The `DataProcessor.run()` method orchestrates the data pipeline, culminating in the output of three key NumPy arrays: `price_array`, `tech_array`, and `turbulence_array`. These arrays, which represent the entire historical dataset, are passed directly to the `CryptoEnv` constructor. |
| **RL Environment** | **DRL Agent** | NumPy Array (State) | Synchronous Call | In each `step()` call, the `CryptoEnv` calculates the next state (`get_state()`) and returns it to the DRL agent. The state is a flattened, normalized vector of market data and portfolio information. |
The `DataProcessor` acts as a **Strategy Pattern** selector, dynamically choosing a concrete data source module (e.g., `Yahoofinance`, `Binance`) based on the `DataSource` enum provided by the user. This ensures that the downstream components (the RL environments) only interact with the standardized NumPy array format, completely decoupling them from the complexities of external APIs.
### 2. Control Flow (Training Phase)
The control flow adheres strictly to the standard **OpenAI Gym interface** for reinforcement learning.
1. **Initialization**: The DRL training loop calls `env.reset()`. The environment initializes the portfolio (cash, stocks) and returns the initial state vector.
2. **Action Selection**: The DRL agent receives the state and uses its neural network policy (`Actor.forward()`) to select an action (a continuous vector of target stock allocations).
3. **State Transition**: The DRL training loop calls `env.step(action)`.
4. **Environment Logic**: Inside `env.step()`, the environment:
* Applies the action (simulates trades, updating `cash` and `stocks`).
* Calculates the reward (change in total asset value).
* Advances the time step.
* Determines the next state (`get_state()`).
* Checks for termination (`done`).
5. **Feedback**: The environment returns `(next_state, reward, done, info)` to the agent, closing the loop.
### 3. Communication Patterns (Online/Live Trading Phase)
The `AlpacaPaperTradingMultiCrypto` class manages the real-time interaction with external services, introducing asynchronous and external API communication.
1. **Real-Time Data Fetch**: The `get_state()` method within the live trading class uses a data processor (specifically `Ccxt` in the example) to fetch the latest market data via HTTP requests to the exchange API (e.g., Binance). This is a synchronous, blocking call to retrieve the necessary historical lookback window.
2. **Policy Inference**: The fetched data is transformed into the state vector, which is then passed to the loaded DRL policy (`self.act(s_tensor)`). This is a local, synchronous operation.
3. **Trade Execution**: The resulting action is translated into market orders. The `submitOrder()` method uses the Alpaca API (`alpaca.submit_order()`) to send the order to the broker. This is typically an external, asynchronous HTTP call, although the provided code wraps it in a `threading.Thread` and uses `join()` to make it functionally synchronous within the main loop, ensuring one trade is processed before the next time step.
This layered design ensures that the core RL logic remains clean and platform-agnostic, while the complexity of external data fetching and live execution is encapsulated in dedicated modules.
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml
skinparam componentStyle rectangle
skinparam classAttributeIconVisible true
title FinRL-Meta High-Level Architecture
package "Data Layer" {
class DataSource << (E, #ADD8E6) Enum >>
abstract class _Base << (A, #ADD8E6) Base Processor >>
class DataProcessor << (F, #ADD8E6) Facade >>
class Yahoofinance << (C, #ADD8E6) Concrete Processor >>
class Binance << (C, #ADD8E6) Concrete Processor >>
' ... other concrete processors ...
}
package "Environment Layer" {
class CryptoEnv << (E, #90EE90) RL Environment >> {
+ price_array: np.ndarray
+ tech_array: np.ndarray
+ stocks: np.ndarray
--
+ reset()
+ step(actions)
+ get_state()
}
class MarketEnvironment << (E, #90EE90) Liquidation Env >> {
+ shares_remaining
+ timeHorizon
--
+ step(action)
}
}
package "Agent Layer" {
class DDPG_Agent << (A, #FFB6C1) Deep RL Agent >>
class Actor << (N, #FFB6C1) Neural Network >>
class Critic << (N, #FFB6C1) Neural Network >>
class OUNoise << (H, #FFB6C1) Helper >>
class ReplayBuffer << (H, #FFB6C1) Helper >>
}
package "Execution Layer" {
class AlpacaPaperTradingMultiCrypto << (T, #FFA07A) Trading Interface >> {
- alpaca: tradeapi.REST
- act: Agent.act
--
+ run()
+ trade()
+ get_state()
}
}
' Relationships
' Data Flow
DataProcessor .up.> DataSource : uses
DataProcessor .right.> _Base : delegates
Yahoofinance .up.|> _Base
Binance .up.|> _Base
DataProcessor --> CryptoEnv : feeds (price, tech, turbulence arrays)
DataProcessor --> MarketEnvironment : feeds (implicitly via parameters)
' Environment to Agent
CryptoEnv .left.> DDPG_Agent : state/reward/action space
' Agent Internals
DDPG_Agent *-- Actor : trains/uses
DDPG_Agent *-- Critic : trains/uses
DDPG_Agent *-- OUNoise
DDPG_Agent *-- ReplayBuffer
' Execution Flow
AlpacaPaperTradingMultiCrypto .up.> CryptoEnv : conceptual interface
AlpacaPaperTradingMultiCrypto .up.> DDPG_Agent : loads/uses policy (act)
AlpacaPaperTradingMultiCrypto .up.> Binance : data fetching (via Ccxt)
AlpacaPaperTradingMultiCrypto .up.> Alpaca : trade execution
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
## Design Patterns Used in the Codebase
The FinRL-Meta project effectively utilizes several software design patterns to achieve modularity, flexibility, and maintainability, particularly in handling diverse data sources and complex reinforcement learning components.
### 1. Facade Pattern (DataProcessor)
The `DataProcessor` class (`meta/data_processor.py`) serves as a **Facade** to the entire data processing subsystem. It provides a simple, unified interface (`run()`, `download_data()`, `clean_data()`) for the complex operations of fetching, cleaning, and transforming data from multiple sources.
* **Implementation**: The `DataProcessor.__init__` method takes a `DataSource` enum and dynamically instantiates the appropriate concrete processor (e.g., `Yahoofinance`, `Binance`). All subsequent method calls on `DataProcessor` are delegated to the internal concrete processor instance.
* **Code Example (meta/data_processor.py)**:
```python
class DataProcessor:
def __init__(self, data_source: DataSource, ...):
# ... dynamic instantiation logic ...
self.processor = processor_dict.get(self.data_source)(...)
def download_data(self, ticker_list):
self.processor.download_data(ticker_list=ticker_list)
self.dataframe = self.processor.dataframe
```
### 2. Strategy Pattern (Data Processors)
The various data source classes (e.g., `Yahoofinance`, `Alpaca`, `Binance`) implement the **Strategy Pattern**. They all inherit from the abstract base class `_Base` (`meta/data_processors/_base.py`), which defines the common interface (the "Strategy"). Each concrete class provides its own specific implementation (the "Concrete Strategy") for methods like `download_data()` and `clean_data()`, tailored to the requirements of its respective API.
* **Implementation**: The `_Base` class defines the contract, and classes like `Yahoofinance` and `Binance` provide the specific logic for their data fetching and cleaning. The `DataProcessor` (the "Context") selects and uses the appropriate strategy object.
* **Code Example (meta/data_processors/_base.py)**:
```python
class _Base:
def download_data(self, ticker_list: List[str]):
pass # Defined in concrete classes
```
### 3. Actor-Critic Pattern (DDPG Agent)
The Deep Deterministic Policy Gradient (DDPG) agent implementation (`meta/env_execution_optimizing/liquidation/ddpg_agent.py`) is a prime example of the **Actor-Critic** architecture, a fundamental pattern in Reinforcement Learning.
* **Implementation**: The agent consists of two main neural networks:
* **Actor (`Actor` class)**: The policy network that takes the state as input and outputs the action (the policy).
* **Critic (`Critic` class)**: The value network that takes the state and action as input and outputs the Q-value (the value function).
* **Code Example (meta/env_execution_optimizing/liquidation/ddpg_agent.py)**:
```python
# Actor Network (w/ Target Network)
self.actor_local = Actor(state_size, action_size, random_seed).to(device)
# Critic Network (w/ Target Network)
self.critic_local = Critic(state_size, action_size, random_seed).to(device)
# In learn method:
# Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
```
### 4. Template Method Pattern (RL Environment)
The base environment structure, particularly in `CryptoEnv` and `BitcoinEnv`, follows the **Template Method Pattern**. The base class defines the skeleton of the algorithm (`reset`, `step`) but defers the implementation of specific steps (like state normalization or action scaling) to helper methods or configuration parameters.
* **Implementation**: The `step()` method in `CryptoEnv` is the template, which calls the concrete implementation of action normalization via `_generate_action_normalizer()` and applies the core trading logic. The overall structure is inherited from the OpenAI Gym interface, which itself is a form of the Template Method.
* **Code Example (meta/env_crypto_trading/env_multiple_crypto.py)**:
```python
class CryptoEnv:
# ...
def step(self, actions) -> (np.ndarray, float, bool, None):
# Template step 1: Normalize action (deferred to helper)
for i in range(self.action_dim):
norm_vector_i = self.action_norm_vector[i]
actions[i] = actions[i] * norm_vector_i
# Template step 2: Execute trades (core logic)
# ... sell logic ...
# ... buy logic ...
# Template step 3: Update state and calculate reward (core logic)
# ...
```
#### 3.3.2. Project Highlights
## Project Highlights: Innovative Features, Extensibility, and Flexibility Design
The FinRL-Meta project exhibits several innovative features and strong design choices that contribute to its extensibility and flexibility, making it a robust platform for financial reinforcement learning research and application.
* **Unified Data Pipeline Abstraction**:
* **Innovation**: The use of the `DataProcessor` Facade over a set of concrete data source strategies (`Yahoofinance`, `Binance`, etc.) is a major highlight. This design abstracts away the heterogeneity of financial data APIs, which often have different data formats, time zone conventions, and rate limits.
* **Flexibility**: Researchers can easily add support for a new data source by simply creating a new class that inherits from `_Base` and implementing the required methods. The core RL environment remains completely unaware of the data source's origin, only consuming the standardized NumPy arrays.
* **Decoupled RL Environment and Agent**:
* **Extensibility**: The core RL environments (`CryptoEnv`, `MarketEnvironment`) are designed to be agnostic to the specific DRL algorithm used. They adhere to the standard OpenAI Gym interface (`reset`, `step`), which is the universal contract for RL. This allows the project to seamlessly integrate agents from different DRL libraries (e.g., ElegantRL, Stable-Baselines3, RLLib), as seen in the `AlpacaPaperTradingMultiCrypto` class which dynamically loads the policy.
* **Innovation**: The environment state space is carefully engineered to be a fixed-size, normalized vector, making it directly compatible with standard deep learning models (e.g., fully connected layers in the Actor/Critic networks). The normalization factors (e.g., `cash * 2**-18`) are hardcoded to scale the state variables into a manageable range for neural network training.
* **Real-Time Trading Integration**:
* **Innovation**: The inclusion of the `AlpacaPaperTradingMultiCrypto` module demonstrates a clear path from research to real-world application. This module encapsulates the complexity of live trading, including API communication, order submission, and real-time state construction. It bridges the gap between a simulated environment and a live paper trading account.
* **Flexibility**: By separating the trading logic from the core RL environment, the project allows for different execution strategies (e.g., market orders, limit orders, different brokers) to be implemented without modifying the core training environment.
* **Domain-Specific Environment Modeling**:
* **Innovation**: The `MarketEnvironment` for execution optimization, based on the Almgren-Chriss model, is a sophisticated, domain-specific environment. It models complex financial phenomena like **permanent and temporary market impact** and uses a reward function based on the change in the Almgren-Chriss utility function. This highlights the project's focus on advanced financial modeling beyond simple portfolio management.
* **Extensibility**: The environment is parameterized with financial constants (`ANNUAL_VOLAT`, `BID_ASK_SP`, `LLAMBDA1`), allowing researchers to easily modify the market dynamics to test the robustness of their agents under different simulated conditions.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
## Improvement Suggestions: Performance, Architecture, and Code Quality
Based on the comprehensive analysis of the FinRL-Meta codebase, the following suggestions are proposed to enhance performance, optimize the architecture, and improve overall code quality.
### 1. Performance Bottlenecks and Optimization
| Area | Bottleneck/Issue | Suggestion for Improvement |
| :--- | :--- | :--- |
| **Data Processing (Pandas)** | Excessive use of `pd.concat()` and `df.append()` in data processors (e.g., `Yahoofinance.download_data`). These operations create new DataFrames in memory, leading to significant performance degradation and memory overhead, especially with large datasets. | **Pre-allocate Lists and Concatenate Once**: Instead of appending to a DataFrame in a loop, collect the individual DataFrames into a Python list and perform a single `pd.concat(list_of_dfs)` operation outside the loop. |
| **State Normalization** | Hardcoded magic numbers for state normalization (e.g., `cash * 2**-18`, `stocks * 2**-3`) are used across multiple environment files (`CryptoEnv`, `BitcoinEnv`). This makes tuning and debugging difficult. | **Centralize Normalization Constants**: Define all normalization constants in a single configuration file (e.g., `meta/config.py`) and load them dynamically. This improves maintainability and allows for easier hyperparameter tuning of the state space. |
| **Live Trading Latency** | The `AlpacaPaperTradingMultiCrypto.trade()` method uses `threading.Thread` with `join()` for `submitOrder`. This effectively makes the order submission synchronous and blocks the main trading loop, increasing latency. | **Asynchronous Order Submission**: Implement true asynchronous order submission using `asyncio` and non-blocking API calls (if supported by the Alpaca SDK) or a dedicated, non-blocking worker queue/process for trade execution. |
### 2. Architecture Optimization
* **Formalize the Environment Base Class**: Currently, the RL environments (`CryptoEnv`, `BitcoinEnv`) do not explicitly inherit from a common abstract base class, other than the implicit contract of the OpenAI Gym interface.
* **Suggestion**: Introduce a formal `BaseEnv` class in `meta/envs/_base.py` that inherits from `gym.Env` (or a modern equivalent) and defines abstract methods for `_calculate_reward()`, `_update_portfolio()`, and `_get_state()`. This would enforce a stricter contract and improve the clarity of the environment's responsibilities.
* **Decouple DRL Library Loading**: The `AlpacaPaperTradingMultiCrypto` class contains hardcoded imports and logic for `elegantrl` (lines 7164-7175). This tightly couples the execution layer to a specific DRL framework.
* **Suggestion**: Use a **Factory Pattern** to load the agent. The execution class should only accept a path to a saved model and a configuration, and a separate utility function should handle the framework-specific loading and policy instantiation.
### 3. Code Quality and Maintainability
* **Consistent Type Hinting**: While some files use type hints, consistency is lacking across the entire codebase.
* **Suggestion**: Adopt comprehensive Python type hinting for all function signatures and class attributes. This significantly improves code readability, enables static analysis tools, and reduces runtime errors.
* **Magic Number Elimination**: The `MarketEnvironment` in the execution optimization module is heavily parameterized with financial constants (e.g., `LLAMBDA1 = 1e-6`, `NUM_N = 60`).
* **Suggestion**: Move all these constants to a dedicated configuration file or a class-level attribute with clear documentation, making the environment's parameters transparent and easily adjustable.
* **Refactor `AlpacaPaperTradingMultiCrypto` State Logic**: The state construction logic in `get_state()` is complex, involving multiple array stacking and normalization steps.
* **Suggestion**: Encapsulate the state construction into a dedicated `StateBuilder` class or a static method. This would isolate the complex logic and make the state representation easier to verify and modify.
#### 3.4.2. Secondary Development Guide
## Secondary Development Guide: Best Practices for Code Exploration and Extension
This guide provides a structured approach for developers looking to explore, modify, or extend the FinRL-Meta codebase.
### 1. Code Exploration Path
Start your exploration by focusing on the three core layers of the architecture:
1. **Configuration and Entry Point (`meta/config.py` and `meta/data_processor.py`)**:
* Examine `meta/config.py` to understand the global constants, default ticker lists, and time zone settings.
* Review `meta/data_processor.py` to grasp how data sources are selected and the standardized data arrays (`price_array`, `tech_array`, `turbulence_array`) are generated. This is the **input** to the entire RL system.
2. **Environment Layer (`meta/env_crypto_trading/`)**:
* Focus on `meta/env_crypto_trading/env_multiple_crypto.py` (`CryptoEnv`). This is the heart of the simulation.
* Analyze the `__init__`, `reset()`, and `step(actions)` methods to understand the state space definition, reward function, and transaction logic (cost calculation, portfolio update).
3. **Agent/Execution Layer (`meta/env_execution_optimizing/` and `meta/env_crypto_trading/`)**:
* For DRL implementation details, study `meta/env_execution_optimizing/liquidation/ddpg_agent.py` and `model.py` to see the Actor-Critic network structure and training loop.
* For real-world application, examine `meta/env_crypto_trading/alpaca_paper_trade_multicrypto.py` to understand how a trained policy is deployed for live trading.
### 2. Best Practices for Extension
* **Adding a New Data Source**:
1. Create a new file in `meta/data_processors/` (e.g., `new_source.py`).
2. Define a class that inherits from `meta/data_processors/_base._Base`.
3. Implement the required methods, especially `download_data()` and `clean_data()`, ensuring the final `self.dataframe` adheres to the expected format (columns: `time`, `open`, `high`, `low`, `close`, `volume`, `tic`).
4. Update the `DataSource` enum and the `processor_dict` mapping in `meta/data_processor.py` to include your new class.
* **Creating a New Trading Environment**:
1. Create a new file in `meta/envs/` (e.g., `env_forex_trading.py`).
2. Define a new environment class (e.g., `ForexEnv`) that mimics the structure of `CryptoEnv`, implementing `reset()` and `step()`.
3. Crucially, redefine the **state space** (`self.state_dim`) and **action space** (`self.action_dim`) to match the requirements of the new domain (e.g., different asset types, different technical indicators).
4. Adjust the reward function and transaction cost logic to reflect the new market's characteristics.
* **Integrating a New DRL Algorithm**:
1. Ensure your new algorithm's policy can be loaded and called with a NumPy state array to return a NumPy action array.
2. If integrating into the live trading module, modify the agent loading section in `AlpacaPaperTradingMultiCrypto.__init__` to correctly load your new model and expose the `self.act` function.
3. If the new algorithm requires a different environment interface (e.g., discrete action space), you will need to create a new environment wrapper that translates the continuous actions of the existing environments into the required format.
================================================
FILE: thirdparty/FinRL.md
================================================
# FinRL - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
The FinRL project structure is organized into a core Python package (`finrl`) and several supporting directories, following a clear separation of concerns for a machine learning framework.

```
FinRL/
├── .git/ # Git version control metadata (Excluded from analysis)
├── .github/ # GitHub configuration (e.g., issue templates, workflows) (Excluded)
├── docker/ # Docker setup for containerized environments (Excluded)
├── docs/ # Documentation source files (Excluded)
├── examples/ # Jupyter notebooks and scripts demonstrating usage (Excluded)
├── figs/ # Project figures and logos (Excluded)
├── finrl/ # **CORE SOURCE CODE PACKAGE** - The heart of the framework
│ ├── agents/ # **DRL Agents and Wrappers**: Integrates and adapts various DRL libraries (Stable-Baselines3, ElegantRL, RLlib) to the FinRL environment interface.
│ │ ├── elegantrl/ # Integration with ElegantRL DRL library
│ │ ├── portfolio_optimization/ # Specific agents for portfolio optimization tasks
│ │ ├── rllib/ # Integration with RLlib DRL library
│ │ └── stablebaselines3/ # Integration with Stable-Baselines3 DRL library
│ ├── applications/ # **Financial Application Templates**: Provides end-to-end examples and specific configurations for different financial tasks.
│ │ ├── cryptocurrency_trading/
│ │ ├── high_frequency_trading/
│ │ ├── portfolio_allocation/
│ │ └── stock_trading/ # Example implementations for stock trading, including ensemble methods
│ ├── meta/ # **Meta/Environment Components**: The infrastructure layer for data and environment modeling.
│ │ ├── data_processors/ # Data acquisition and feature engineering from various sources (Yahoo, Alpaca, etc.)
│ │ ├── env_*/ # Custom OpenAI Gym environments for different financial tasks (stock trading, crypto, portfolio)
│ │ ├── paper_trading/ # Real-time/paper trading integration (e.g., Alpaca)
│ │ └── preprocessor/ # Legacy/alternative data downloaders
│ ├── config.py # Global configuration constants (dates, indicators, model params)
│ ├── main.py # Main entry point for CLI (train, test, trade modes)
│ ├── train.py # Core DRL training workflow logic
│ ├── trade.py # Core trading workflow logic (backtesting/paper trading)
│ └── plot.py # Utility for plotting results and performance metrics
├── unit_tests/ # Unit tests (Excluded)
└── ... # Other configuration files (README, LICENSE, setup.py, etc.) (Excluded)
```
The structure is highly modular, with the `finrl` package acting as the primary container. The **`meta`** module handles the crucial task of transforming raw financial data into a standardized Reinforcement Learning problem (State, Action, Reward), while the **`agents`** module abstracts the complexity of different DRL algorithms. The top-level files (`main.py`, `train.py`, `trade.py`) serve as the **orchestration layer**, tying these components together to execute the full DRL pipeline. This design ensures that the core logic is separated from configuration, data handling, and algorithm implementation.
### 1.2. Core Folders for Analysis
* `/home/ubuntu/FinRL/finrl`: The root of the core package, containing entry points and global configurations.
* `/home/ubuntu/FinRL/finrl/agents`: The module responsible for integrating and wrapping various DRL libraries (Stable-Baselines3, ElegantRL, RLlib) into a unified `DRLAgent` interface.
* `/home/ubuntu/FinRL/finrl/meta`: The meta-module that provides the necessary infrastructure for DRL in finance, including data processing, custom Gym environments, and paper trading interfaces.
* `/home/ubuntu/FinRL/finrl/applications`: Contains application-specific, end-to-end examples and templates for different financial tasks.
## Phase 2: Module-by-Module Deep Analysis
## 1. Core/Entry Module (`finrl`)
**Module Core Responsibility**: This module serves as the **entry point** and **orchestrator** for the entire FinRL workflow. It defines global configurations and implements the high-level logic for the three main modes of operation: `train`, `test`, and `trade`.
**Key File Identification**:
* `config.py`: Defines all global constants, including data directories (`DATA_SAVE_DIR`), date ranges (`TRAIN_START_DATE`), technical indicators (`INDICATORS` - e.g., `macd`, `rsi_30`), and default DRL model hyperparameters (`A2C_PARAMS`, `PPO_PARAMS`, etc.). This file centralizes all experiment parameters.
* `main.py`: The command-line interface entry point. It parses the `--mode` argument (`train`, `test`, `trade`) and calls the corresponding function from `finrl.train`, `finrl.test`, or `finrl.trade`. It ensures necessary directories are created for saving data and models.
* `train.py`: Implements the DRL training pipeline. It orchestrates the data flow: `DataProcessor` -> `download_data` -> `clean_data` -> `add_technical_indicator` -> `df_to_array` -> `StockTradingEnv` configuration -> `DRLAgent` initialization and `train_model`. It supports conditional loading of agents from `elegantrl`, `rllib`, or `stable_baselines3`.
* `trade.py`: Implements the trading pipeline, supporting two sub-modes: `backtesting` (which delegates to `finrl.test`) and `paper_trading` (which uses the `AlpacaPaperTrading` class from the `meta` module).
## 2. Meta/Environment Module (`finrl/meta`)
**Module Core Responsibility**: This is the **infrastructure layer** that adapts financial data and tasks into the standard Reinforcement Learning paradigm (Gym environments). It handles data acquisition, feature engineering, and the definition of the trading environment's state, action, and reward space.
**Key File Identification**:
* `data_processor.py`: The main facade class, `DataProcessor`. It acts as a factory/wrapper for various data source-specific processors (e.g., `YahooFinanceProcessor`, `AlpacaProcessor`). It provides a unified interface for data downloading, cleaning, adding technical indicators, and converting the final DataFrame into the NumPy arrays (`price_array`, `tech_array`, `turbulence_array`) required by the Gym environments.
* `data_processors/processor_yahoofinance.py`: A concrete implementation of a data processor. It uses the `yfinance` library (and potentially Selenium for scraping) to fetch data and includes methods for data cleaning and feature engineering (e.g., adding the VIX index).
* `env_stock_trading/env_stocktrading.py`: The core custom Gym environment, `StockTradingEnv`.
* **State Space**: A 1D NumPy array representing `[cash, stock_price_1, ..., stock_price_N, stock_shares_1, ..., stock_shares_N, tech_indicator_1, ..., tech_indicator_M, turbulence]`.
* **Action Space**: A continuous `Box` space, where each element corresponds to the percentage of total assets to allocate to a stock (ranging from -1 to 1, representing sell/buy).
* **Reward Function**: The reward is the change in the total portfolio value (cash + stock holdings) between the current step and the previous step, scaled by `reward_scaling`.
* **Turbulence**: The environment incorporates a **turbulence index** (`risk_indicator_col`) to model market volatility. If turbulence exceeds a threshold, the agent is forced to liquidate all positions, a critical risk management mechanism.
## 3. Agent Module (`finrl/agents`)
**Module Core Responsibility**: This module provides the necessary **wrappers and interfaces** to seamlessly integrate popular DRL libraries (Stable-Baselines3, ElegantRL, RLlib) with the custom FinRL Gym environments. This abstracts the DRL implementation details from the main workflow.
**Key File Identification**:
* `stablebaselines3/models.py`: Defines the `DRLAgent` class, which wraps SB3 models (A2C, PPO, SAC, TD3, DDPG). It uses the **Adapter Pattern** to make SB3 algorithms conform to the FinRL training and prediction interface. The `DRL_prediction` method handles the testing/backtesting loop using the trained model on a vectorized environment (`DummyVecEnv`).
* `elegantrl/models.py`: Defines the `DRLAgent` class for ElegantRL integration. This wrapper is more tightly coupled with the environment's internal arrays (`price_array`, `tech_array`) as ElegantRL uses a custom `Config` object for environment and agent setup.
* `portfolio_optimization/algorithms.py`: Contains specific algorithms for portfolio optimization, demonstrating the framework's flexibility beyond standard stock trading.
## 4. Application Module (`finrl/applications`)
**Module Core Responsibility**: This module provides **ready-to-use, end-to-end examples** for various financial tasks. These files serve as templates and demonstrations, showing how to combine the `meta` (data/env) and `agents` (DRL models) modules to solve a specific problem.
**Key File Identification**:
* `stock_trading/ensemble_stock_trading.py`: A key example demonstrating the use of an **ensemble strategy** where multiple DRL agents (e.g., PPO, A2C, DDPG) are trained and their performance is validated to select the best one for trading. This highlights a key feature of the FinRL framework.
* Other files (e.g., `cryptocurrency_trading`, `portfolio_allocation`) provide specialized configurations and environment settings for those specific domains, showcasing the framework's adaptability.
### Module PlantUML Diagrams
@startuml Module_Meta
title FinRL Meta Module (Data and Environment)
package "finrl.meta" {
class DataProcessor {
- processor: AbstractProcessor
+ __init__(data_source, ...)
+ download_data(...)
+ clean_data(...)
+ add_technical_indicator(...)
+ add_turbulence(...)
+ add_vix(...)
+ df_to_array(...) : price_array, tech_array, turbulence_array
}
package "data_processors" {
interface AbstractProcessor {
+ download_data()
+ clean_data()
+ add_technical_indicator()
+ add_turbulence()
+ add_vix()
+ df_to_array()
}
class YahooFinanceProcessor
class AlpacaProcessor
class WrdsProcessor
}
package "env_stock_trading" {
class StockTradingEnv {
- df: DataFrame
- state: np.array
- day: int
- initial_amount: int
- asset_memory: list
+ __init__(...)
+ step(actions) : state, reward, done, info
+ reset() : state
+ _sell_stock(index, action)
+ _buy_stock(index, action)
+ get_sb_env() : DummyVecEnv
}
StockTradingEnv -up-|> gym.Env
}
package "paper_trading" {
class AlpacaPaperTrading {
- api_key
- api_secret
- model
+ run()
}
}
}
DataProcessor o-- AbstractProcessor : uses
YahooFinanceProcessor -up-|> AbstractProcessor
AlpacaProcessor -up-|> AbstractProcessor
WrdsProcessor -up-|> AbstractProcessor
StockTradingEnv ..> DataProcessor : receives arrays from df_to_array()
AlpacaPaperTrading ..> StockTradingEnv : uses for state/action logic
@enduml
@startuml Module_Agents
title FinRL Agents Module (DRL Wrappers)
package "finrl.agents" {
interface DRLAgentInterface {
+ get_model(model_name, ...)
+ train_model(model, ...)
+ DRL_prediction(model, environment)
}
package "stablebaselines3" {
class DRLAgent_SB3 {
- env: StockTradingEnv
+ get_model(model_name, ...)
+ train_model(model, ...)
+ DRL_prediction(model, environment)
}
class TensorboardCallback
}
package "elegantrl" {
class DRLAgent_ElegantRL {
- env_config
+ get_model(model_name, model_kwargs)
+ train_model(model, cwd, total_timesteps)
}
}
package "rllib" {
class DRLAgent_RLlib {
+ get_model(model_name)
+ train_model(model, ...)
}
}
}
DRLAgent_SB3 -up-|> DRLAgentInterface
DRLAgent_ElegantRL -up-|> DRLAgentInterface
DRLAgent_RLlib -up-|> DRLAgentInterface
DRLAgent_SB3 ..> TensorboardCallback : uses
DRLAgent_SB3 ..> StockTradingEnv : wraps/uses
DRLAgent_ElegantRL ..> StockTradingEnv : wraps/uses
@enduml
@startuml Module_Core
title FinRL Core Module (Orchestration)
package "finrl" {
class Config {
+ TRAIN_START_DATE
+ INDICATORS
+ PPO_PARAMS
+ ...
}
class Main {
+ main()
+ build_parser()
}
class Train {
+ train(...)
}
class Trade {
+ trade(...)
}
}
Main ..> Config : reads constants
Main ..> Train : calls train()
Main ..> Trade : calls trade()
Train ..> DataProcessor : uses for data prep
Train ..> StockTradingEnv : instantiates environment
Train ..> DRLAgentInterface : uses for model training
Trade ..> StockTradingEnv : instantiates environment
Trade ..> DRLAgentInterface : uses for prediction (backtesting)
Trade ..> AlpacaPaperTrading : uses for paper trading
@enduml
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
The FinRL framework is fundamentally built on the **Reinforcement Learning (RL) Paradigm** applied to quantitative finance, adhering closely to the **OpenAI Gym interface** for environment standardization.
**Core Abstractions**:
1. **Data Processor**: This serves as an abstraction layer over diverse financial data sources (Yahoo Finance, Alpaca, WRDS, etc.). It is responsible for standardizing raw data into a clean, feature-engineered format (DataFrame) suitable for the RL environment. This abstraction ensures the core DRL logic remains independent of the data source.
2. **Environment (`StockTradingEnv`)**: This is the central abstraction that models the financial market as a Markov Decision Process (MDP). It rigorously defines the three core components of the RL problem:
* **State**: The observation space, which includes cash, stock prices, stock shares, technical indicators, and the market turbulence index.
* **Action**: The action space, a continuous `Box` representing the normalized allocation of total assets to each stock (ranging from -1 for selling to 1 for buying).
* **Reward**: The immediate reward, calculated as the change in the total portfolio value (cash + stock holdings) between time steps.
3. **DRL Agent Wrapper (`DRLAgent`)**: This is a critical abstraction over different DRL libraries (Stable-Baselines3, ElegantRL, RLlib). It allows users to swap out the underlying DRL algorithm with minimal code changes, promoting modularity, experimentation, and comparison of different algorithms on the same financial task.
**Design Philosophy**:
* **Modularity and Extensibility**: The clear separation of concerns between Data (`DataProcessor`), Environment (`Env`), and Algorithm (`DRLAgent`) is the cornerstone of the design. This structure allows for easy extension: new data sources require only a new processor implementation, new financial tasks require a new Gym environment, and new DRL algorithms require a new `DRLAgent` wrapper.
* **Risk-Awareness**: The framework demonstrates a focus on real-world risk management by explicitly including a **turbulence index** in the state space. The environment's logic includes a mechanism for forced liquidation of all positions if market turbulence exceeds a predefined threshold, a crucial feature for financial stability.
* **Ensemble Learning Focus**: The design encourages the use of ensemble strategies, as evidenced by the application templates, to mitigate the high variance and improve the robustness of DRL models in volatile financial markets.
**Lifecycle Management**:
The lifecycle is managed by the core orchestration scripts (`main.py`, `train.py`, `trade.py`). The process flows from configuration (`config.py`) -> data preparation (`DataProcessor`) -> environment setup (`StockTradingEnv`) -> model training (`DRLAgent`) -> model persistence (saving trained models) -> and finally, deployment for backtesting or paper trading. This sequential, modular lifecycle ensures reproducibility and clear debugging paths.
#### 3.1.2. Component Interactions
The FinRL system follows a clear, sequential data flow, primarily orchestrated by the `train.py` and `trade.py` scripts, ensuring a structured pipeline from data to decision-making.
**1. Data Acquisition and Preprocessing**:
The process begins in `train.py` which calls the `DataProcessor` (from `finrl/meta/data_processor.py`). The `DataProcessor` acts as a facade, instantiating a source-specific processor (e.g., `YahooFinanceProcessor` in `finrl/meta/data_processors/processor_yahoofinance.py`). This processor fetches raw financial data, cleans it, adds technical indicators, and incorporates market volatility measures like the VIX index. The final output is a set of three NumPy arrays: `price_array`, `tech_array`, and `turbulence_array`, which are passed back to the core workflow.
**2. Environment Initialization**:
These NumPy arrays are used to configure and instantiate the custom Gym environment, typically `StockTradingEnv` (from `finrl/meta/env_stock_trading/env_stocktrading.py`). The environment uses these arrays to define its state space and to simulate the passage of time (days), making the financial market an accessible Markov Decision Process (MDP) for the DRL agent.
**3. Training Loop (Agent-Environment Interaction)**:
The `train.py` script initializes the appropriate `DRLAgent` wrapper (e.g., `DRLAgent_SB3` from `finrl/agents/stablebaselines3/models.py`) and calls its `train_model()` method.
* **Interaction**: The DRL model interacts with the `StockTradingEnv` by calling `env.step(action)`. The DRL model outputs an `action` (a normalized portfolio allocation vector).
* **Execution**: The `StockTradingEnv.step()` method executes the simulated trade, updates the portfolio state (`self.state`), calculates the `reward` (change in portfolio value), and returns the new state, reward, and terminal status to the DRL algorithm.
* Training results are logged via the `TensorboardCallback` for monitoring.
**4. Testing/Trading Loop**:
The `trade.py` or `test.py` scripts handle post-training execution.
* The trained model is loaded via `DRLAgent.DRL_prediction()`.
* The model predicts an action for each day in the test/trade period, and the environment is stepped through.
* For performance evaluation, the `asset_memory` and `actions_memory` are recorded.
* For **paper trading**, the `AlpacaPaperTrading` class in `finrl/meta/paper_trading/alpaca.py` continuously monitors the market and executes trades via the Alpaca API based on the DRL model's predictions, bridging the gap between simulation and real-world application.
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml FinRL_Architecture
title FinRL Overall Architecture
skinparam component {
BackgroundColor<<Core>> LightBlue
BorderColor<<Core>> Blue
BackgroundColor<<Meta>> LightGreen
BorderColor<<Meta>> Green
BackgroundColor<<Agent>> LightYellow
BorderColor<<Agent>> Orange
}
component [Config] <<Core>> as C
component [Main Entry Point] <<Core>> as M
component [Train Workflow] <<Core>> as T
component [Trade Workflow] <<Core>> as TR
package "finrl.meta" <<Meta>> {
component [DataProcessor] <<Meta>> as DP
component [Data Sources] <<Meta>> as DS
component [StockTradingEnv (Gym)] <<Meta>> as E
component [Paper Trading Interface] <<Meta>> as PT
}
package "finrl.agents" <<Agent>> {
component [DRLAgent (SB3, ERL, RLlib)] <<Agent>> as A
component [DRL Libraries] <<Agent>> as DRL
}
M --> C : Reads global parameters
M --> T : Calls train()
M --> TR : Calls trade()
T --> DP : 1. Initializes
DP --> DS : 2. Downloads & Preprocesses Data
DP --> T : 3. Returns price/tech/turbulence arrays
T --> E : 4. Instantiates Environment (with arrays)
T --> A : 5. Initializes DRL Agent (with Env)
A --> DRL : 6. Trains Model
TR --> E : Instantiates Environment
TR --> A : Loads Trained Model
TR --> PT : Executes Paper Trading (if mode=paper_trading)
E .right.> A : State/Action/Reward Loop (step())
A .left.> E : State/Action/Reward Loop (predict())
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
The FinRL codebase effectively utilizes several software design patterns to achieve its goals of modularity, extensibility, and separation of concerns.
1. **Adapter Pattern**
* **Description**: This pattern allows the interface of an existing class to be used as another interface. In FinRL, it is used to unify the interfaces of disparate DRL libraries.
* **Implementation**: The `DRLAgent` classes in `finrl/agents/stablebaselines3/models.py`, `finrl/agents/elegantrl/models.py`, and `finrl/agents/rllib/models.py` all conform to a common interface (`get_model`, `train_model`, `DRL_prediction`). Each class adapts the specific API calls of its underlying DRL library (SB3, ElegantRL, or RLlib) to this single, unified interface, allowing the core `train.py` script to treat them interchangeably.
2. **Factory Method Pattern (Implicit)**
* **Description**: This pattern provides an interface for creating objects in a superclass, but allows subclasses to alter the type of objects that will be created.
* **Implementation**: The `DataProcessor` class in `finrl/meta/data_processor.py` acts as a simple factory. Based on the `data_source` string passed to its constructor (e.g., `"alpaca"`, `"yahoofinance"`), it dynamically instantiates the correct concrete data processor object (e.g., `AlpacaProcessor`, `YahooFinanceProcessor`).
* **Code Example (from `data_processor.py`)**:
```python
class DataProcessor:
def __init__(self, data_source, ...):
if data_source == "alpaca":
self.processor = Alpaca(...)
elif data_source == "yahoofinance":
self.processor = YahooFinance()
# ... other data sources
```
3. **Strategy Pattern**
* **Description**: This pattern defines a family of algorithms, encapsulates each one, and makes them interchangeable. Strategy lets the algorithm vary independently from the clients that use it.
* **Implementation**: The overall training workflow in `train.py` allows the user to select a "strategy" (the DRL algorithm, e.g., PPO, SAC, DDPG) and the DRL library (e.g., `stable_baselines3`, `elegantrl`) at runtime. The `train` function then dynamically loads and uses the corresponding `DRLAgent` and DRL model based on these parameters, enabling easy comparison of different trading strategies.
#### 3.3.2. Project Highlights
The FinRL framework includes several innovative features and design choices that enhance its utility and flexibility for financial reinforcement learning:
* **Unified DRL Framework**: FinRL provides a single, consistent API that abstracts away the differences between multiple state-of-the-art DRL libraries, including Stable-Baselines3, ElegantRL, and RLlib. This allows researchers and practitioners to easily switch between and compare algorithms (e.g., PPO, SAC, DDPG) without modifying the core data or environment logic.
* **Financial Market Modeling with Risk Awareness**: The custom Gym environments, such as `StockTradingEnv`, are specifically tailored for finance. They incorporate essential real-world elements like **transaction costs** (`buy_cost_pct`, `sell_cost_pct`) and, critically, a **turbulence index**. This index is used to model market volatility, and the environment enforces a **risk-management mechanism** (forced liquidation) when turbulence exceeds a threshold, making the simulation more realistic and risk-aware.
* **Data Source Agnosticism**: Through the `DataProcessor` abstraction, the framework achieves a high degree of data source agnosticism. The same DRL pipeline can be run on data from various providers (Yahoo Finance, Alpaca, WRDS, etc.) by simply changing a configuration parameter, significantly reducing the effort required for data integration.
* **Real-World Readiness and Paper Trading**: The inclusion of a dedicated `trade.py` module with the `AlpacaPaperTrading` class provides a direct and seamless path from backtesting to live paper trading. This feature is a major highlight, enabling users to test their trained agents in a simulated live market environment before committing real capital.
* **Ensemble Learning Support**: The framework is explicitly designed to facilitate the training and validation of multiple agents, supporting robust **ensemble strategies** (as demonstrated in `ensemble_stock_trading.py`). This is a key feature for improving the stability and performance of DRL models in the highly stochastic financial domain.
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
The FinRL framework is robust, but several areas can be optimized to improve performance, maintainability, and flexibility:
1. **Environment Performance and Vectorization**:
* **Issue**: The core `StockTradingEnv` in `env_stocktrading.py` is implemented using standard Python/Pandas/NumPy logic, which can be slow for high-frequency or large-scale backtesting due to Python's overhead in the simulation loop.
* **Suggestion**: Implement a fully **vectorized environment** for training. This involves processing all time steps for all assets in parallel using NumPy or a library like JAX/PyTorch, drastically reducing the number of Python function calls and improving training speed. The current `DummyVecEnv` wrapper only vectorizes the environment interface, not the internal simulation logic.
2. **Data Acquisition Reliability and Brittle Code**:
* **Issue**: The `YahooFinanceProcessor` shows a mix of `yfinance` library usage and brittle web scraping techniques (Selenium/BeautifulSoup) for data acquisition. Web scraping is highly susceptible to breaking when the target website's structure changes.
* **Suggestion**: Standardize data acquisition to rely solely on stable, official APIs (like Alpaca, which is already integrated) or robust data providers. Remove the reliance on Selenium/scraping to ensure long-term stability and maintainability of the data pipeline.
3. **Configuration Management Modernization**:
* **Issue**: The use of global constants in `config.py` is simple but limits the flexibility required for complex, reproducible experiments. Modifying a global constant affects all parts of the code.
* **Suggestion**: Adopt a modern configuration management library like **Hydra** or use **Pydantic Settings**. This would allow for structured, hierarchical configuration files (YAML/JSON), easy command-line overrides, and better separation of configuration from the core codebase, making experiment tracking and parameter tuning more robust.
4. **Code Quality and Documentation**:
* **Issue**: While type hints are present, the documentation, particularly docstrings for the core `DRLAgent` methods and environment parameters, could be more comprehensive.
* **Suggestion**: Enforce a documentation standard (e.g., NumPy or Google style docstrings) for all public methods and classes. This will significantly improve code clarity and reduce the learning curve for secondary developers.
#### 3.4.2. Secondary Development Guide
The FinRL framework is designed for extensibility, making secondary development straightforward by focusing on the three core modular components: Data, Environment, and Agent.
1. **Start with `config.py`**:
* The first step for any new experiment is to define the scope by modifying the global constants in `finrl/config.py`. This includes setting the `TRAIN_START_DATE`, `TRAIN_END_DATE`, the list of `INDICATORS`, and the hyperparameters for the DRL models (e.g., `PPO_PARAMS`).
2. **Define the Task (Environment)**:
* For standard tasks (stock trading, crypto), use the existing environments in `finrl/meta/env_stock_trading`.
* To create a new financial task (e.g., options trading, futures), create a new custom Gym environment class that inherits from `gym.Env` and defines the unique state, action, and reward mechanisms specific to that task. Ensure the `step()` method correctly calculates the reward and updates the state based on the action.
3. **Prepare Data (DataProcessor)**:
* If your data source is supported (Yahoo, Alpaca, etc.), use the existing `DataProcessor` facade.
* To integrate a new data source, create a new `processor_yourname.py` file in `finrl/meta/data_processors`. This new class must implement the required methods: `download_data`, `clean_data`, `add_technical_indicator`, and crucially, `df_to_array` to convert the data into the NumPy arrays expected by the environment.
4. **Select/Implement Agent**:
* Choose a DRL library (Stable-Baselines3 is recommended for its comprehensive documentation). The `DRLAgent` wrappers handle the integration.
* To add a new DRL algorithm not currently supported, extend the appropriate `DRLAgent` class in `finrl/agents` and implement the `get_model`, `train_model`, and `DRL_prediction` methods to wrap the new algorithm's API.
5. **Execute via `main.py`**:
* Use the command-line interface (`python main.py --mode=train`) to execute the workflow. The orchestration logic in `main.py`, `train.py`, and `trade.py` will handle the rest, ensuring the data, environment, and agent are correctly linked.
This modular approach ensures that developers can focus on one component at a time without needing to rewrite the entire pipeline.
================================================
FILE: thirdparty/FinRobot.md
================================================
# FinRobot - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
```
```
### 1.2. Core Folders for Analysis
## Phase 2: Module-by-Module Deep Analysis
### Module PlantUML Diagrams
## Phase 3: Overall Architecture & Summary
### 3.1. Overall Architecture Analysis
#### 3.1.1. Core Abstractions
#### 3.1.2. Component Interactions
### 3.2. Overall Architecture PlantUML Diagram
```plantuml
@startuml
@enduml
```
### 3.3. Design Patterns & Highlights
#### 3.3.1. Design Patterns
#### 3.3.2. Project Highlights
### 3.4. Summary & Recommendations
#### 3.4.1. Potential Improvements
#### 3.4.2. Secondary Development Guide
================================================
FILE: thirdparty/FinceptTerminal.md
================================================
# FinceptTerminal - In-Depth Source Code Analysis
## Phase 1: Global Scan & Planning
### 1.1. Full Directory Structure
```
/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal (Root of the project)
|-- .github (Configuration for GitHub workflows and templates)
|-- docs (Project documentation, likely Docusaurus or similar)
|-- fincept-terminal-desktop (The main application source code)
| |-- public (Static assets for the frontend)
| |-- src-tauri (Rust backend code for Tauri)
| | |-- src (Core Rust source files)
| | | |-- commands (Tauri commands for data fetching and utilities, over 30 data sources)
| | | |-- data_sources (Rust-side data source implementations)
| | | |-- utils (Utility functions, notably the Python execution bridge)
| | | |-- lib.rs (Main Rust library, process management, IPC setup)
| | | |-- main.rs (Tauri entry point)
| |-- src (TypeScript/React frontend code)
| | |-- assets (Frontend static assets)
| | |-- components (Reusable UI components)
| | | |-- tabs (Major feature views like data-mapping, trading, portfolio, node-editor)
| | | |-- ui (Design system components)
| | |-- constants (Application-wide configuration values)
| | |-- contexts (React Context providers for global state)
| | |-- hooks (Custom React hooks for logic reuse)
| | |-- lib (Frontend utility functions)
| | |-- services (Core business logic and data orchestration)
| | | |-- backtesting (Logic for backtesting strategies)
| | | |-- websocket (Real-time data handling)
| | | |-- trading (Order management logic)
| | |-- stockBrokers (Brokerage API integration adapters, e.g., ZerodhaKite)
| | |-- types (TypeScript interfaces and type definitions)
| | |-- App.tsx (Main React application component)
|-- images (Marketing and documentation images)
```

The project structure clearly delineates the **Hybrid Architecture**. The `fincept-terminal-desktop` directory houses the core application, split into the `src-tauri` (Rust backend) and `src` (React/TypeScript frontend) folders. This separation of concerns is fundamental, with the Rust layer managing system-level tasks and the Python bridge, while the TypeScript layer handles the rich user interface and business logic via services. The extensive `commands` directory in the Rust backend highlights the project's focus on being a comprehensive financial data aggregator.
### 1.2. Core Folders for Analysis
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src-tauri/src`: The core Rust backend, handling IPC, process management, and data source delegation.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/components`: The React frontend's presentation layer, including all UI elements and feature views.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/services`: The frontend's business logic layer, containing core features like workflow management, backtesting, and trading.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/stockBrokers`: Brokerage integration adapters, implementing the Adapter Pattern for trading.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/types`: Shared TypeScript interfaces and type definitions for application-wide data structures.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/constants`: Application-wide configuration values and magic strings.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/contexts`: React Context providers for global state management.
* `/home/ubuntu/FinnewsHunter/thirdparty/FinceptTerminal/fincept-terminal-desktop/src/hooks`: Custom React hooks for logic reuse across components.
## Phase 2: Module-by-Module Deep Analysis
## Module 1: `src-tauri/src` (Rust Backend)
**Core Responsibility:** The Rust backend, built with Tauri, serves as the **core application logic and data gateway**. Its primary function is to manage system-level interactions, handle inter-process communication (IPC) with the frontend, and act as a secure, performant bridge to various external data sources and computational backends (like Python). It is responsible for managing the lifecycle of external processes, such as the MCP (Model Context Protocol) server.
**Key Files and Functions:**
* `lib.rs`: Defines the core state management (`MCPState` and `MCPProcess`) for external processes.
* `commands/mod.rs`: The central registry for all Tauri commands, revealing **extensive data source integration** (e.g., `yfinance`, `polygon`, `fred`, `worldbank`).
* `utils/python.rs`: A critical file that implements the logic to locate and execute the Python interpreter across different operating systems, confirming that the Rust backend delegates data fetching and heavy computation to Python scripts.
**Core Implementation & Dependencies:** The module uses Rust and Tauri, relying on `std::sync::{Arc, Mutex}` for safe, concurrent management of external processes. Tauri's `#[tauri::command]` macro is used extensively to expose Rust functions to the TypeScript/React frontend.
## Module 2: `src/components` (Frontend UI Components)
**Core Responsibility:** This module contains the React/TypeScript components that form the user interface, responsible for visual presentation and user interaction.
**Key Files and Functions (Inferred from Directory Structure):**
* `components/tabs/*`: Contains the main feature views, such as `data-mapping`, `equity-research`, `node-editor`, `portfolio`, and `trading`, indicating a highly modular, tab-based application structure.
* `components/charts`: Dedicated components for financial data visualization.
**Core Implementation & Dependencies:** Built with TypeScript and React, the components rely on the Tauri API (`@tauri-apps/api`) to call the Rust commands for data and system interaction.
## Module 3: `src/services` (Frontend Business Logic)
**Core Responsibility:** This module encapsulates the complex business logic and data orchestration for the frontend, separating it from the presentation layer.
**Key Files and Functions:**
* `workflowService.ts`: Manages the creation, storage, execution, and state of user-defined **workflows**, suggesting a core feature is a visual programming or automation tool.
* `services/backtesting`: Contains logic for financial backtesting, likely integrating with Python libraries like `vectorbt` or `lean`.
* `services/websocket`: Handles real-time data streaming, essential for a financial terminal.
**Core Implementation & Dependencies:** This module implements the **Service Layer** pattern and depends on the Tauri IPC layer to communicate with the Rust backend for data and process control.
## Module 4: `src/stockBrokers` (Brokerage Integration)
**Core Responsibility:** Provides a standardized interface for connecting to and interacting with various stock brokerage APIs.
**Key Files and Functions:**
* `stockBrokers/india/zerodhaKite`: A concrete implementation for a specific Indian brokerage, indicating a focus on the Indian market or a modular design for regional expansion.
**Core Implementation & Dependencies:** The module likely uses the **Adapter Pattern** to normalize the different brokerage APIs into a single interface used by the `trading` service.
## Module 5: `src/types` (Shared Data Structures)
**Core Responsibility:** Defines the core TypeScript data structures and interfaces used across the entire frontend application, ensuring type safety and consistency. This adheres to the **Single Source of Truth** principle for data types.
## Module 6: `src/lib`, `src/hooks`, `src/constants`, `src/contexts` (Utilities and State)
**Core Responsibility:** Contains common utilities, custom React hooks, application-wide constants, and React context providers for global state. This module uses the **Context Pattern** for dependency injection and state management throughout the frontend.
**Conclusion:** The project is a **hybrid desktop application** built with **Tauri (Rust) and React/TypeScript**. The Rust backend acts as a secure data API gateway, leveraging Python for data fetching, while the React frontend provides a rich, modular, tab-based user interface with core features like **workflow automation**, **backtesting**, and **brokerage integration**.
### Module PlantUML Diagrams
# Rust Backend Module (`src-tauri/src`)
@startuml
title Rust Backend Module (`src-tauri/src`)
package "Core Logic" {
class AppHandle
class MCPState {
- processes: Mutex<HashMap<String, MCPProcess>>
}
class MCPProcess {
- child: Child
- stdin: Arc<Mutex<ChildStdin>>
- response_rx: Receiver<String>
}
class SpawnResult
interface TauriCommand
}
package "Utilities" {
class PythonUtils {
+ get_python_path(app: &AppHandle)
+ execute_python_command(...)
}
}
package "Commands" {
class YFinanceCommand <<TauriCommand>>
class PolygonCommand <