Repository: AgentEra/Agently-Daily-News-Collector Branch: main Commit: 6812e1c78fb9 Files: 51 Total size: 146.8 KB Directory structure: gitextract__2ep8twt/ ├── .gitignore ├── .vscode/ │ └── settings.json ├── Dockerfile ├── LICENSE ├── README.md ├── README_CN.md ├── SETTINGS.yaml ├── app.py ├── logs/ │ └── .gitkeep ├── news_collector/ │ ├── __init__.py │ ├── cli.py │ ├── collector.py │ ├── config.py │ ├── logging_utils.py │ └── markdown.py ├── outputs/ │ └── .gitkeep ├── prompts/ │ ├── create_outline.yaml │ ├── pick_news.yaml │ ├── summarize_news.yaml │ └── write_column.yaml ├── requirements.txt ├── tools/ │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── builtin.py ├── v3/ │ ├── README.md │ ├── README_CN.md │ ├── SETTINGS.yaml │ ├── app.py │ ├── examples/ │ │ └── Latest Updates on AI Models2024-05-02.md │ ├── prompts/ │ │ ├── create_outline.yaml │ │ ├── pick_news.yaml │ │ ├── summarize.yaml │ │ └── write_column.yaml │ ├── requirements.txt │ ├── utils/ │ │ ├── __init__.py │ │ ├── logger.py │ │ ├── path.py │ │ └── yaml_reader.py │ └── workflows/ │ ├── __init__.py │ ├── column_workflow.py │ ├── main_workflow.py │ └── tools/ │ ├── __init__.py │ ├── browse.py │ └── search.py └── workflow/ ├── __init__.py ├── column_chunks.py ├── common.py ├── daily_news.py ├── report_chunks.py └── summary_chunks.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # MacOS .DS_Store # Agently .Agently # Project outputs logs/* !logs/.gitkeep outputs/* !outputs/.gitkeep ================================================ FILE: .vscode/settings.json ================================================ { "python-envs.defaultEnvManager": "ms-python.python:conda", "python-envs.defaultPackageManager": "ms-python.python:conda" } ================================================ FILE: Dockerfile ================================================ FROM python:3.10 WORKDIR /app COPY . . RUN pip install --no-cache-dir -r requirements.txt CMD ["python", "app.py"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Agently Daily News Collector v4 Agently Daily News Collector has been rewritten on top of **Agently v4** and now uses: - `TriggerFlow` for the end-to-end pipeline - Agently v4 built-in `Search` and `Browse` tools - structured output contracts instead of the old v3 workflow API > Version constraint: this project requires **Agently v4.0.8.3 or newer**. The current implementation uses `TriggerFlow sub flow` to organize per-column pipelines, so earlier v4 releases are not compatible with the workflow structure used here. The previous Agently v3 project has been archived under [`./v3`](./v3). 
## Features - Input a topic and generate a multi-column news briefing automatically - Search, shortlist, browse, summarize, and assemble stories in one flow - Save the final report as Markdown under `./outputs` - Keep prompt templates in `./prompts` for easy editing - Keep an independent `./tools` layer so search/browse can be replaced without touching the main workflow - Keep flow construction in `./workflow` so orchestration can evolve independently from collector logic ## Quick Start 1. Install dependencies: ```bash pip install -r requirements.txt ``` If you install Agently manually, make sure you use at least: ```bash pip install "agently>=4.0.8.3" ``` 2. Edit [`SETTINGS.yaml`](./SETTINGS.yaml): - Keep the model block as environment placeholders - Export the required environment variables: ```bash export AGENTLY_NEWS_BASE_URL="https://api.openai.com/v1" export AGENTLY_NEWS_MODEL="gpt-4.1-mini" export AGENTLY_NEWS_API_KEY="your_api_key" ``` - Or put them in a local `.env` file: ```dotenv AGENTLY_NEWS_BASE_URL=https://api.openai.com/v1 AGENTLY_NEWS_MODEL=gpt-4.1-mini AGENTLY_NEWS_API_KEY=your_api_key ``` - Adjust language / search / concurrency settings if needed - If your OpenAI-compatible endpoint does not require authentication, you can leave `AGENTLY_NEWS_API_KEY` unset and the project will skip `auth`. 3. Run: ```bash python app.py ``` Or pass a topic directly: ```bash python app.py "AI agents" ``` ## Project Structure ```text . ├── app.py ├── news_collector/ ├── tools/ ├── workflow/ ├── prompts/ ├── outputs/ ├── logs/ └── v3/ ``` ## Important v3 -> v4 Changes The business chain is still roughly: `outline -> search -> pick -> browse + summarize -> write column -> render markdown` What changed is the engineering shape around that chain. ### Project-level changes - The old v3 project used a main workflow plus a nested column workflow under `./workflows`, with custom `search.py` / `browse.py` helpers and storage-style state passing. 
- The v4 project separates responsibilities more clearly: - `news_collector/`: app/integration layer - `workflow/`: parent flow, column sub flow, and concrete chunk logic - `tools/`: search/browse adapter layer - `prompts/`: structured prompt contracts - Model configuration is no longer hardcoded in Python. It now uses `${ENV.xxx}` placeholders from `SETTINGS.yaml`, so deployment and local switching are simpler. - Tool wiring is no longer buried inside workflow code. Search, browse, and logger are injected as TriggerFlow runtime resources, which makes the workflow easier to replace or test. - The workflow plan is now closer to the business boundary: - parent flow: `prepare_request -> generate_outline -> for_each(column) -> render_report` - column sub flow: `search -> pick -> summarize -> write_column` - the `summarize` stage inside the column flow is further pushed down into a summary sub flow, where TriggerFlow handles fan-out and collection directly instead of leaving `asyncio.gather` in business code - this keeps the parent focused on report orchestration and the child focused on one column lifecycle - the immediate value of `sub flow` here is that the column pipeline becomes a reusable, independently evolvable workflow unit instead of staying buried inside one oversized parent chunk ### Agently v4 features used here - **TriggerFlow orchestration** - Replaces the old v3 workflow style with a more explicit flow graph (`to`, `for_each`, `sub flow`, branching-ready composition). - Unlike the old v3 Workflow chain, TriggerFlow here runs columns concurrently and also summarizes picked stories concurrently within each column. - Meaning for this project: the end-to-end news pipeline is easier to inspect, evolve, and split into chunks without mixing orchestration with business logic, while the parent report flow and the per-column pipeline can now be modeled directly as parent/child flows instead of one oversized chunk. 
- **Sub flow composition** - The project can now extract a naturally repeated business pipeline, “build one column”, into its own TriggerFlow and invoke it repeatedly from the parent flow inside `for_each(column)`. - Meaning for this project: - the parent flow stays focused on report-level orchestration - the column pipeline can be tested, visualized, and exported independently - future variants such as “briefing column”, “deep-dive column”, or “regional column” can reuse or derive from the child flow instead of cloning parent-flow nodes - `capture / write_back` makes the boundary between parent and child explicit for input, state, and resources - **Structured output contracts** - YAML prompts now define output schema directly for outline generation, news picking, summarizing, and column writing. - Meaning for this project: much less handwritten parsing glue, clearer interfaces between steps, and easier prompt iteration. - **Built-in Search / Browse tools** - The project now defaults to Agently v4 built-in tool implementations instead of the old project-local helpers. - Meaning for this project: less custom infrastructure code, and users can still swap implementations through `./tools` without rewriting the workflow. - **Runtime resources and state namespaces** - TriggerFlow runtime resources are used to inject logger/search/browse dependencies, while runtime state stores execution data such as request, outline, and intermediate results. - Meaning for this project: dependency wiring and execution state are separated cleanly, which keeps chunk code thinner and more maintainable. - **Environment-aware settings** - Agently v4 `set_settings(..., auto_load_env=True)` works directly with `${ENV.xxx}` placeholders. - Meaning for this project: model endpoint, model name, and API key can be switched by environment instead of editing code or committing secrets. 
### Overall effect on this project - The core product behavior remains familiar to v3 users, but the project now has a cleaner app/workflow/tools/prompts split. - More logic is expressed in Agently-native capabilities instead of project-specific glue code. - True concurrency is now part of the default execution model. The v3 version was effectively serial, while the v4 version can process columns and per-column summaries in parallel through TriggerFlow. - Replacing tools, adjusting prompts, or evolving workflow steps is now lower-risk than in the old v3 layout, and the overall orchestration shape is again aligned with the original “main flow + column flow” mental model. - It also means workflow evolution can happen by layer: report-level changes stay in the parent flow, while column-level changes stay in the sub flow instead of forcing both to change together. ## Notes - Python `>=3.10` is required because Agently v4 requires it. - This project requires Agently `>=4.0.8.3`. - Model settings now use Agently v4 `auto_load_env=True` with `${ENV.xxx}` placeholders. - `tools/` defaults to Agently v4 built-in implementations, but you can replace the factories there with your own tools. - `workflow/` is now split by business boundary into the parent flow, the column sub flow, report-level chunks, and column-level chunks. - `news_collector/` acts as the app/integration layer for configuration, model wiring, and CLI entry support. - The current sample [`SETTINGS.yaml`](./SETTINGS.yaml) enables `BROWSE.enable_playwright: true` by default because many news pages need a real browser to return usable content. - If you do not want to install Playwright, set `BROWSE.enable_playwright` to `false` manually, but expect weaker browse quality on dynamic or protected sites. - The settings loader keeps basic compatibility with the old v3 keys such as `MODEL_PROVIDER`, `MODEL_URL`, `MODEL_AUTH`, `MODEL_OPTIONS`, `MAX_COLUMN_NUM`, and `USE_CUSTOMIZE_OUTLINE`. 
================================================ FILE: README_CN.md ================================================ # Agently Daily News Collector v4 本项目已经基于 **Agently v4** 完整重写,核心实现改为: - 使用 `TriggerFlow` 编排整条新闻采集流程 - 使用 Agently v4 内置 `Search` / `Browse` 工具 - 使用结构化输出契约替代旧版 v3 `Workflow` API > 版本约束:本项目仅适用于 **Agently v4.0.8.3 及以上版本**。当前实现已经使用 `TriggerFlow sub flow` 组织栏目子流程;如果你安装的是更早的 v4 版本,主流程与子流程之间的组合能力将与当前代码不兼容。 原有 Agently v3 项目已经整体归档到 [`./v3`](./v3)。 ## 功能说明 - 输入一个主题,自动生成多栏目新闻汇总 - 自动完成搜索、筛选、浏览正文、总结和 Markdown 排版 - 最终报告输出到 `./outputs` - 提示词保存在 `./prompts`,便于继续调优 - 提供独立的 `./tools` 适配层,方便替换搜索和浏览实现 - 提供独立的 `./workflow` 目录,方便单独调整流程编排 ## 使用方式 1. 安装依赖: ```bash pip install -r requirements.txt ``` 如果你是手动安装 Agently,请确认版本至少为: ```bash pip install "agently>=4.0.8.3" ``` 2. 修改 [`SETTINGS.yaml`](./SETTINGS.yaml): - 保持模型配置为环境变量占位符 - 在环境变量中提供下面三个值: ```bash export AGENTLY_NEWS_BASE_URL="https://api.openai.com/v1" export AGENTLY_NEWS_MODEL="gpt-4.1-mini" export AGENTLY_NEWS_API_KEY="your_api_key" ``` - 或者写到本地 `.env` 文件中: ```dotenv AGENTLY_NEWS_BASE_URL=https://api.openai.com/v1 AGENTLY_NEWS_MODEL=gpt-4.1-mini AGENTLY_NEWS_API_KEY=your_api_key ``` - 按需调整输出语言、搜索参数和并发参数 - 如果你的 OpenAI-compatible 服务本身不需要鉴权,可以不设置 `AGENTLY_NEWS_API_KEY`,项目会自动跳过 `auth` 3. 启动: ```bash python app.py ``` 也可以直接把主题作为命令行参数传入: ```bash python app.py "AI Agents" ``` ## 目录结构 ```text . 
├── app.py ├── news_collector/ ├── tools/ ├── workflow/ ├── prompts/ ├── outputs/ ├── logs/ └── v3/ ``` ## 重要说明:v3 -> v4 的关键变化 业务主线其实没有变,仍然基本是: `outline -> search -> pick -> browse + summarize -> write column -> render markdown` 真正变化的是这条链路在工程上的组织方式。 ### 从本项目角度看,主要改了什么 - 旧版 v3 主要是 `./workflows` 里的主流程加栏目子流程,再配合项目内自定义的 `search.py` / `browse.py` 和 storage 传值。 - 新版 v4 把职责拆得更清楚: - `news_collector/`:app / integration 层 - `workflow/`:主 flow、栏目 sub flow 与各 chunk 的具体实现 - `tools/`:搜索与抓取适配层 - `prompts/`:结构化提示词契约 - 模型配置不再写死在 Python 代码里,而是统一通过 `SETTINGS.yaml` 里的 `${ENV.xxx}` 占位符注入,部署和切换环境更简单。 - 搜索、浏览、日志等依赖不再散落在工作流实现内部,而是通过 TriggerFlow runtime resources 注入,后续替换实现时不需要改业务流程本身。 - 现在工作流规划也更贴近业务边界: - 主 flow 负责 `prepare_request -> generate_outline -> for_each(column) -> render_report` - 栏目 sub flow 负责 `search -> pick -> summarize -> write_column` - 栏目内部的 `summarize` 又继续下沉为一个 summary sub flow,用 TriggerFlow 自己的 `for_each + collect` 做并发收拢,而不是在业务代码里手写 `asyncio.gather` - 这样主流程关注“整份日报如何生成”,子流程关注“单个栏目如何产出” - `sub flow` 的直接价值是:栏目链路现在可以被当成一个独立、可复用、可单独演进的流程单元来看待,而不是继续埋在父流程的某个大 chunk 里 ### 本项目实际用到了 Agently v4 的哪些关键能力 - **TriggerFlow 编排** - 用更显式的流程图式写法替代 v3 的旧 Workflow 风格,支持 `to`、`for_each`、`sub flow` 等组合方式。 - 和旧版 v3 基本串行执行不同,这个 v4 版本会并发处理多个栏目,并在栏目内部并发总结多条入选新闻。 - 对本项目的意义:新闻采集链路更容易拆 chunk、看依赖、调并发,也更适合后续继续演进;现在“主流程”和“栏目流程”可以直接用 sub flow 建模,而不是继续把整条栏目链路塞进一个大 chunk。 - **Sub Flow 组合能力** - 现在可以把“栏目生成”这种天然重复出现的业务子流程,抽成独立的 TriggerFlow,再由父 flow 在 `for_each(column)` 中重复调用。 - 对本项目的意义: - 父 flow 只保留日报级编排,职责更稳定 - 栏目流程可以单独测试、单独可视化、单独导出配置 - 后续如果新增“快讯栏目”“深度栏目”“地区栏目”,可以直接复用或派生子流程,而不是继续复制粘贴父流程节点 - 主流程和子流程之间通过 `capture / write_back` 显式传递输入、状态和资源,边界比闭包式调用清晰得多 - **结构化输出契约** - 现在 outline、pick、summarize、write column 都直接在 YAML prompt 里声明输出结构。 - 对本项目的意义:少写很多手工解析代码,步骤之间的接口更清晰,调 prompt 时更可控。 - **内置 Search / Browse 工具** - 默认直接使用 Agently v4 提供的 Search / Browse,而不是沿用 v3 里项目自带的工具实现。 - 对本项目的意义:减少项目自维护基础设施代码,同时又保留了 `./tools` 层,方便用户自己替换实现。 - **Runtime resources 与 state 命名空间** - 通过 TriggerFlow runtime resources 注入 
`logger`、`search_tool`、`browse_tool`,通过 runtime state 保存 `request`、`outline`、中间结果。 - 对本项目的意义:把“依赖注入”和“流程状态”拆开,chunk 代码更薄,也更容易维护。 - **环境变量感知的 settings** - 使用 Agently v4 的 `set_settings(..., auto_load_env=True)` 配合 `${ENV.xxx}` 占位符。 - 对本项目的意义:`base_url`、`model`、`api_key` 都可以按环境切换,不需要改代码,也更适合本地开发和部署。 ### 这些改动对项目整体的意义 - 对 v3 用户来说,产品级行为仍然熟悉,但项目结构已经从“单体 workflow 脚本”变成了更清晰的 app / workflow / tools / prompts 分层。 - 更多能力直接复用了 Agently v4 原生机制,而不是继续在项目里堆自定义胶水代码。 - 真正的并发执行现在成为默认能力。v3 版本整体上仍是串行 workflow,而 v4 可以通过 TriggerFlow 并发跑栏目和栏目内摘要,直接改善总耗时。 - 后续无论是替换工具、调整提示词,还是演进工作流步骤,风险都比 v3 结构更低;主流程和栏目流程也终于恢复成了清晰的父子结构。 - 这也意味着工作流演进可以按层进行:日报级逻辑改父 flow,栏目级逻辑改 sub flow,二者不必总是一起变动。 ## 说明 - Agently v4 要求 Python `>=3.10` - 本项目要求 Agently `>=4.0.8.3` - 模型配置现在使用 Agently v4 的 `auto_load_env=True` 和 `${ENV.xxx}` 占位符 - `tools/` 默认封装 Agently v4 内置工具;如果你要接自己的搜索或抓取方案,只需要替换这里的工厂函数 - `workflow/` 现在按业务边界拆成主 flow、栏目 sub flow、报告级 chunks、栏目级 chunks - `news_collector/` 现在承担 app/integration 层职责,负责配置、模型装配和 CLI 入口支持 - 当前仓库里的 [`SETTINGS.yaml`](./SETTINGS.yaml) 默认开启 `BROWSE.enable_playwright: true`,因为很多新闻页面只有在真实浏览器环境下才能抓到可用正文 - 如果你不想额外安装 Playwright,可以手动把 `BROWSE.enable_playwright` 改成 `false`,但动态站点、受保护页面和部分媒体站的抓取质量会明显下降 - 新版配置优先读取 `MODEL / SEARCH / BROWSE / WORKFLOW / OUTLINE / OUTPUT` 结构,同时兼容部分旧版 v3 配置键,例如 `MODEL_PROVIDER`、`MODEL_URL`、`MODEL_AUTH`、`MODEL_OPTIONS`、`MAX_COLUMN_NUM`、`USE_CUSTOMIZE_OUTLINE` ================================================ FILE: SETTINGS.yaml ================================================ # Debug DEBUG: false # Shared proxy for model/search/browse. Leave empty if not needed. PROXY: http://127.0.0.1:7890 # Agently v4 model configuration. # Use Agently v4 `${ENV.xxx}` placeholders and let Agently resolve them from # the current environment or `.env` file when applying model settings. 
MODEL: provider: OpenAICompatible base_url: ${ENV.DEEPSEEK_BASE_URL} model: ${ENV.DEEPSEEK_DEFAULT_MODEL} model_type: chat auth: api_key: ${ENV.DEEPSEEK_API_KEY} request_options: temperature: 0.2 SEARCH: max_results: 8 timelimit: d region: us-en backend: auto BROWSE: enable_playwright: true playwright_headless: true response_mode: markdown max_content_length: 12000 min_content_length: 80 WORKFLOW: max_column_num: 3 max_news_per_column: 3 output_language: Chinese column_concurrency: 3 summary_concurrency: 3 OUTLINE: use_customized: false customized: report_title: Today's News about Large Model Applications column_list: - column_title: Latest Launches column_requirement: Focus on newly announced or newly released large model applications in the last few days. search_keywords: large model application launch this week - column_title: Hot Topics column_requirement: Focus on the most discussed or fastest-growing large model applications. search_keywords: large model application trending news - column_title: Industry Moves column_requirement: Focus on enterprise adoption, funding, partnerships, or strategy shifts around large model applications. 
search_keywords: large model application enterprise partnership funding OUTPUT: directory: outputs ================================================ FILE: app.py ================================================ from news_collector.cli import main if __name__ == "__main__": raise SystemExit(main()) ================================================ FILE: logs/.gitkeep ================================================ ================================================ FILE: news_collector/__init__.py ================================================ def __getattr__(name: str): if name == "AppSettings": from .config import AppSettings return AppSettings if name == "DailyNewsCollector": from .collector import DailyNewsCollector return DailyNewsCollector if name == "main": from .cli import main return main raise AttributeError(name) ================================================ FILE: news_collector/cli.py ================================================ from __future__ import annotations import sys from pathlib import Path from .config import AppSettings from .collector import DailyNewsCollector from .logging_utils import configure_logging ROOT_DIR = Path(__file__).resolve().parent.parent SETTINGS_PATH = ROOT_DIR / "SETTINGS.yaml" def main() -> int: settings = AppSettings.load(SETTINGS_PATH) logger = configure_logging( debug=settings.debug, log_dir=ROOT_DIR / "logs", ) topic = " ".join(sys.argv[1:]).strip() if not topic: topic = input("请输入要生成新闻汇总的主题 / Please input the topic: ").strip() if not topic: print("Topic is required.") return 1 collector = DailyNewsCollector( settings=settings, root_dir=ROOT_DIR, logger=logger, ) try: result = collector.collect(topic) except Exception as exc: # pragma: no cover - CLI guard logger.exception("Daily news collection failed: %s", exc) return 1 print(result["markdown"]) print(f"\n[Saved to] {result['output_path']}") return 0 ================================================ FILE: news_collector/collector.py 
================================================ from __future__ import annotations import logging import os import re from pathlib import Path from typing import Any, cast from agently import Agently from tools import create_browse_tool, create_search_tool from workflow import build_daily_news_flow from .config import AppSettings class DailyNewsCollector: def __init__( self, *, settings: AppSettings, root_dir: str | Path, logger: logging.Logger, ): self.settings = settings self.root_dir = Path(root_dir).resolve() self.logger = logger self._configure_agently() search_tool = create_search_tool(self.settings) browse_tool = create_browse_tool(self.settings) self.flow = build_daily_news_flow( settings=self.settings, root_dir=self.root_dir, model_label=self.model_label, ) self.flow.update_runtime_resources( logger=self.logger, search_tool=search_tool, browse_tool=browse_tool, ) def collect(self, topic: str) -> dict[str, Any]: normalized_topic = topic.strip() if not normalized_topic: raise ValueError("Topic is required.") result = self.flow.start(normalized_topic) return result def _configure_agently(self) -> None: from dotenv import find_dotenv, load_dotenv load_dotenv(find_dotenv()) model_settings = self.settings.model.to_agently_settings(self.settings.proxy) self._ensure_required_model_env( { "base_url": model_settings.get("base_url"), "model": model_settings.get("model"), } ) if self._missing_env_names(model_settings.get("auth")): model_settings.pop("auth", None) resolved_model_name = self._resolve_env_value(model_settings.get("model")) self.model_label = f"{self.settings.model.provider} / {resolved_model_name}" Agently.set_settings("debug", self.settings.debug) Agently.set_settings( self.settings.model.provider, model_settings, auto_load_env=True, ) def _ensure_required_model_env(self, model_settings: dict[str, Any]) -> None: env_names = sorted(set(self._collect_env_names(model_settings))) if not env_names: return missing_env_names = [ name for name in env_names if 
os.getenv(name) in (None, "") ] if missing_env_names: raise EnvironmentError( "Missing required model environment variables: " + ", ".join(missing_env_names) ) @classmethod def _collect_env_names(cls, value: Any) -> list[str]: if isinstance(value, str): return re.findall(r"\$\{\s*ENV\.([^}]+?)\s*\}", value) if isinstance(value, dict): env_names: list[str] = [] for item in value.values(): env_names.extend(cls._collect_env_names(item)) return env_names if isinstance(value, list): env_names: list[str] = [] for item in value: env_names.extend(cls._collect_env_names(item)) return env_names return [] @classmethod def _missing_env_names(cls, value: Any) -> list[str]: env_names = sorted(set(cls._collect_env_names(value))) return [name for name in env_names if os.getenv(name) in (None, "")] @staticmethod def _resolve_env_value(value: Any) -> str: if not isinstance(value, str): return str(value) def replacer(match: re.Match[str]) -> str: env_name = match.group(1).strip() return os.getenv(env_name, match.group(0)) return re.sub(r"\$\{\s*ENV\.([^}]+?)\s*\}", replacer, value) ================================================ FILE: news_collector/config.py ================================================ from __future__ import annotations import os import re from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal, TypeAlias, TypeVar, cast import yaml ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}") LiteralStrT = TypeVar("LiteralStrT", bound=str) ModelProvider: TypeAlias = Literal["OpenAICompatible", "OpenAI", "OAIClient"] ModelType: TypeAlias = Literal["chat", "completions", "embeddings"] SearchBackend: TypeAlias = Literal[ "auto", "bing", "duckduckgo", "yahoo", "google", "mullvad_google", "yandex", "wikipedia", ] SearchNewsTimeLimit: TypeAlias = Literal["d", "w", "m"] SearchRegion: TypeAlias = Literal[ "xa-ar", "xa-en", "ar-es", "au-en", "at-de", "be-fr", "be-nl", "br-pt", "bg-bg", "ca-en", "ca-fr", "ct-ca", 
"cl-es", "cn-zh", "co-es", "hr-hr", "cz-cs", "dk-da", "ee-et", "fi-fi", "fr-fr", "de-de", "gr-el", "hk-tzh", "hu-hu", "in-en", "id-id", "id-en", "ie-en", "il-he", "it-it", "jp-jp", "kr-kr", "lv-lv", "lt-lt", "xl-es", "my-ms", "my-en", "mx-es", "nl-nl", "nz-en", "no-no", "pe-es", "ph-en", "ph-tl", "pl-pl", "pt-pt", "ro-ro", "ru-ru", "sg-en", "sk-sk", "sl-sl", "za-en", "es-es", "se-sv", "ch-de", "ch-fr", "ch-it", "tw-tzh", "th-th", "tr-tr", "ua-uk", "uk-en", "us-en", "ue-es", "ve-es", "vn-vi", ] BrowseResponseMode: TypeAlias = Literal["markdown", "text"] MODEL_PROVIDER_VALUES: tuple[ModelProvider, ...] = ("OpenAICompatible", "OpenAI", "OAIClient") MODEL_TYPE_VALUES: tuple[ModelType, ...] = ("chat", "completions", "embeddings") SEARCH_BACKEND_VALUES: tuple[SearchBackend, ...] = ( "auto", "bing", "duckduckgo", "yahoo", "google", "mullvad_google", "yandex", "wikipedia", ) SEARCH_TIMELIMIT_VALUES: tuple[SearchNewsTimeLimit, ...] = ("d", "w", "m") SEARCH_REGION_VALUES: tuple[SearchRegion, ...] = ( "xa-ar", "xa-en", "ar-es", "au-en", "at-de", "be-fr", "be-nl", "br-pt", "bg-bg", "ca-en", "ca-fr", "ct-ca", "cl-es", "cn-zh", "co-es", "hr-hr", "cz-cs", "dk-da", "ee-et", "fi-fi", "fr-fr", "de-de", "gr-el", "hk-tzh", "hu-hu", "in-en", "id-id", "id-en", "ie-en", "il-he", "it-it", "jp-jp", "kr-kr", "lv-lv", "lt-lt", "xl-es", "my-ms", "my-en", "mx-es", "nl-nl", "nz-en", "no-no", "pe-es", "ph-en", "ph-tl", "pl-pl", "pt-pt", "ro-ro", "ru-ru", "sg-en", "sk-sk", "sl-sl", "za-en", "es-es", "se-sv", "ch-de", "ch-fr", "ch-it", "tw-tzh", "th-th", "tr-tr", "ua-uk", "uk-en", "us-en", "ue-es", "ve-es", "vn-vi", ) BROWSE_RESPONSE_MODE_VALUES: tuple[BrowseResponseMode, ...] 
= ("markdown", "text") def _resolve_env_placeholders(value: Any) -> Any: if isinstance(value, str): def replace(match: re.Match[str]) -> str: env_name = match.group(1) default_value = match.group(2) or "" return os.getenv(env_name, default_value) return ENV_PATTERN.sub(replace, value) if isinstance(value, list): return [_resolve_env_placeholders(item) for item in value] if isinstance(value, dict): return { key: _resolve_env_placeholders(item) for key, item in value.items() } return value def _as_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} def _as_int(value: Any, default: int) -> int: try: return int(value) except (TypeError, ValueError): return default def _as_bool(value: Any, default: bool = False) -> bool: if value is None: return default if isinstance(value, bool): return value if isinstance(value, str): normalized = value.strip().lower() if normalized in {"1", "true", "yes", "on"}: return True if normalized in {"0", "false", "no", "off"}: return False return bool(value) def _as_optional_str(value: Any) -> str | None: if value is None: return None text = str(value).strip() if not text or text.lower() in {"none", "null"}: return None return text def _normalize_auth(value: Any) -> Any: if isinstance(value, str): normalized = _as_optional_str(value) if normalized and "input your api key" in normalized.lower(): return None return normalized if isinstance(value, dict): normalized = { str(key): item for key, item in value.items() if item not in (None, "", [], {}) } api_key = _as_optional_str(normalized.get("api_key")) if api_key is None: normalized.pop("api_key", None) else: normalized["api_key"] = api_key return normalized or None return value def _as_literal( value: Any, *, allowed: tuple[LiteralStrT, ...], default: LiteralStrT, ) -> LiteralStrT: if isinstance(value, str): candidate = value.strip() if candidate in allowed: return cast(LiteralStrT, candidate) lower_candidate = candidate.lower() for item in allowed: if 
lower_candidate == item.lower(): return item return default @dataclass(slots=True) class ModelConfig: provider: ModelProvider = "OpenAICompatible" base_url: str = "https://api.openai.com/v1" model: str = "gpt-4.1-mini" model_type: ModelType = "chat" auth: Any = None request_options: dict[str, Any] = field(default_factory=dict) proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "ModelConfig": block = _as_dict(raw.get("MODEL") or raw.get("model")) legacy_request_options = dict(_as_dict(raw.get("MODEL_OPTIONS"))) block_request_options = dict(_as_dict(block.get("request_options") or block.get("options"))) request_options = block_request_options or legacy_request_options model_name = block.get("model") or request_options.pop("model", None) or "gpt-4.1-mini" return cls( provider=_as_literal( block.get("provider") or raw.get("MODEL_PROVIDER"), allowed=MODEL_PROVIDER_VALUES, default="OpenAICompatible", ), base_url=str(block.get("base_url") or raw.get("MODEL_URL") or "https://api.openai.com/v1"), model=str(model_name), model_type=_as_literal( block.get("model_type"), allowed=MODEL_TYPE_VALUES, default="chat", ), auth=_normalize_auth(block.get("auth", raw.get("MODEL_AUTH"))), request_options=request_options, proxy=_as_optional_str(block.get("proxy")), ) def to_agently_settings(self, global_proxy: str | None = None) -> dict[str, Any]: settings: dict[str, Any] = { "base_url": self.base_url, "model": self.model, "model_type": self.model_type, "request_options": self.request_options, } proxy = self.proxy or global_proxy if proxy: settings["proxy"] = proxy if self.auth is not None: settings["auth"] = self.auth return settings @dataclass(slots=True) class SearchConfig: max_results: int = 8 timelimit: SearchNewsTimeLimit = "d" region: SearchRegion = "us-en" backend: SearchBackend = "auto" proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "SearchConfig": block = _as_dict(raw.get("SEARCH") or raw.get("search")) return cls( 
max_results=max(_as_int(block.get("max_results", raw.get("MAX_SEARCH_RESULTS")), 8), 1), timelimit=_as_literal( block.get("timelimit"), allowed=SEARCH_TIMELIMIT_VALUES, default="d", ), region=_as_literal( block.get("region"), allowed=SEARCH_REGION_VALUES, default="us-en", ), backend=_as_literal( block.get("backend"), allowed=SEARCH_BACKEND_VALUES, default="auto", ), proxy=_as_optional_str(block.get("proxy")), ) @dataclass(slots=True) class BrowseConfig: enable_playwright: bool = False playwright_headless: bool = True response_mode: BrowseResponseMode = "markdown" max_content_length: int = 12000 min_content_length: int = 80 proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "BrowseConfig": block = _as_dict(raw.get("BROWSE") or raw.get("browse")) return cls( enable_playwright=_as_bool(block.get("enable_playwright"), False), playwright_headless=_as_bool(block.get("playwright_headless"), True), response_mode=_as_literal( block.get("response_mode"), allowed=BROWSE_RESPONSE_MODE_VALUES, default="markdown", ), max_content_length=max(_as_int(block.get("max_content_length"), 12000), 2000), min_content_length=max(_as_int(block.get("min_content_length"), 80), 20), proxy=_as_optional_str(block.get("proxy")), ) @dataclass(slots=True) class WorkflowConfig: max_column_num: int = 3 max_news_per_column: int = 3 output_language: str = "Chinese" column_concurrency: int = 3 summary_concurrency: int = 3 @classmethod def from_raw(cls, raw: dict[str, Any]) -> "WorkflowConfig": block = _as_dict(raw.get("WORKFLOW") or raw.get("workflow")) return cls( max_column_num=max(_as_int(block.get("max_column_num", raw.get("MAX_COLUMN_NUM")), 3), 1), max_news_per_column=max(_as_int(block.get("max_news_per_column"), 3), 1), output_language=str(block.get("output_language") or raw.get("OUTPUT_LANGUAGE") or "Chinese"), column_concurrency=max(_as_int(block.get("column_concurrency"), 3), 1), summary_concurrency=max(_as_int(block.get("summary_concurrency"), 3), 1), ) 
@dataclass(slots=True) class OutlineConfig: use_customized: bool = False customized: dict[str, Any] = field(default_factory=dict) @classmethod def from_raw(cls, raw: dict[str, Any]) -> "OutlineConfig": block = _as_dict(raw.get("OUTLINE") or raw.get("outline")) customized = block.get("customized", raw.get("CUSTOMIZE_OUTLINE")) or {} return cls( use_customized=_as_bool(block.get("use_customized", raw.get("USE_CUSTOMIZE_OUTLINE", False))), customized=customized if isinstance(customized, dict) else {}, ) @dataclass(slots=True) class OutputConfig: directory: str = "outputs" @classmethod def from_raw(cls, raw: dict[str, Any]) -> "OutputConfig": block = _as_dict(raw.get("OUTPUT") or raw.get("output")) return cls(directory=str(block.get("directory") or "outputs")) @dataclass(slots=True) class AppSettings: debug: bool = False proxy: str | None = None model: ModelConfig = field(default_factory=ModelConfig) search: SearchConfig = field(default_factory=SearchConfig) browse: BrowseConfig = field(default_factory=BrowseConfig) workflow: WorkflowConfig = field(default_factory=WorkflowConfig) outline: OutlineConfig = field(default_factory=OutlineConfig) output: OutputConfig = field(default_factory=OutputConfig) @classmethod def load(cls, path: str | Path) -> "AppSettings": config_path = Path(path) raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} if not isinstance(raw, dict): raise TypeError(f"Settings file must contain a dictionary, got: {type(raw)}") resolved = _resolve_env_placeholders(raw) return cls( debug=_as_bool(resolved.get("DEBUG", resolved.get("debug", False))), proxy=_as_optional_str(resolved.get("PROXY", resolved.get("proxy"))), model=ModelConfig.from_raw(resolved), search=SearchConfig.from_raw(resolved), browse=BrowseConfig.from_raw(resolved), workflow=WorkflowConfig.from_raw(resolved), outline=OutlineConfig.from_raw(resolved), output=OutputConfig.from_raw(resolved), ) ================================================ FILE: 
news_collector/logging_utils.py ================================================ from __future__ import annotations import logging from pathlib import Path def configure_logging(*, debug: bool, log_dir: str | Path) -> logging.Logger: target_dir = Path(log_dir) target_dir.mkdir(parents=True, exist_ok=True) logger = logging.getLogger("agently_daily_news_collector") logger.setLevel(logging.DEBUG if debug else logging.INFO) logger.propagate = False if logger.handlers: return logger formatter = logging.Formatter( "%(asctime)s [%(levelname)s] %(message)s", ) console_handler = logging.StreamHandler() console_handler.setLevel(logging.DEBUG if debug else logging.INFO) console_handler.setFormatter(formatter) file_handler = logging.FileHandler( target_dir / "collector.log", encoding="utf-8", ) file_handler.setLevel(logging.DEBUG if debug else logging.INFO) file_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.addHandler(file_handler) return logger ================================================ FILE: news_collector/markdown.py ================================================ from __future__ import annotations from typing import Any def _labels_for_language(language: str) -> dict[str, str]: normalized = language.lower() if "chinese" in normalized or normalized.startswith("zh"): return { "generated_at": "生成时间", "topic": "主题", "prologue": "导语", "news_list": "新闻列表", "source": "来源", "date": "日期", "summary": "摘要", "comment": "推荐理由", "model": "模型", } return { "generated_at": "Generated At", "topic": "Topic", "prologue": "Prologue", "news_list": "News List", "source": "Source", "date": "Date", "summary": "Summary", "comment": "Why It Matters", "model": "Model", } def render_markdown( *, report_title: str, generated_at: str, topic: str, language: str, columns: list[dict[str, Any]], model_label: str, ) -> str: labels = _labels_for_language(language) lines = [ f"# {report_title}", "", f"> {labels['generated_at']}: {generated_at}", f"> {labels['topic']}: 
{topic}", "", ] for column in columns: lines.extend( [ f"## {column['title']}", "", f"### {labels['prologue']}", "", column["prologue"], "", f"### {labels['news_list']}", "", ] ) for news in column["news_list"]: lines.append(f"- [{news['title']}]({news['url']})") meta_parts = [] if news.get("source"): meta_parts.append(f"{labels['source']}: {news['source']}") if news.get("date"): meta_parts.append(f"{labels['date']}: {news['date']}") if meta_parts: lines.append(f" - {' | '.join(meta_parts)}") lines.append(f" - {labels['summary']}: {news['summary']}") lines.append(f" - {labels['comment']}: {news['recommend_comment']}") lines.append("") lines.extend( [ "---", "", "Powered by [Agently 4](https://github.com/AgentEra/Agently)", "", f"{labels['model']}: {model_label}", ] ) return "\n".join(lines).strip() + "\n" ================================================ FILE: outputs/.gitkeep ================================================ ================================================ FILE: prompts/create_outline.yaml ================================================ input: topic: ${topic} today: ${today} info: output_language: ${language} max_column_num: ${max_column_num} instruct: - Design a daily news report outline about {input.topic}. - Focus on recent and genuinely newsworthy developments, not timeless background material. - Create between 2 and {info.max_column_num} distinct columns. - Each column should have a clear editorial angle and usable search keywords. - column_requirement should describe a practical coverage direction, not an overly narrow fact checklist. - Avoid making column_requirement depend on very specific version numbers, exact financing stages, exact pull requests, or unverified assumptions unless the topic itself clearly requires that specificity. - Prefer column angles that can still include adjacent and strongly related developments if they help readers understand the topic. 
- search_keywords must be concise and search-engine friendly, ideally 4-10 terms instead of a long sentence. - Prefer entity names and English or mixed-language keywords when the topic includes product names, project names, or proper nouns. - Avoid stuffing search_keywords with excessive years, clauses, or explanatory text. - report_title should look like a publishable briefing headline in {info.output_language}. output: report_title: $type: str $desc: natural title for the final news report column_list: $type: - column_title: $type: str $desc: concise column heading column_requirement: $type: str $desc: practical coverage direction for stories in this column; keep it specific but not overly narrow search_keywords: $type: str $desc: practical search query that includes the topic and matches this column angle $desc: list of columns for the report ================================================ FILE: prompts/pick_news.yaml ================================================ input: ${column_news} info: column_title: ${column_title} column_requirement: ${column_requirement} max_news_per_column: ${max_news_per_column} instruct: - Review the candidate news list for the current column. - This stage is a shortlist stage, so prioritize recall over precision. - Prefer concrete, recent, source-backed stories that plausibly satisfy or strongly relate to {info.column_requirement}. - Keep clearly relevant or potentially relevant stories for the next summarizing step, even if the snippet is incomplete. - Reject only obvious duplicates, clearly unrelated stories, or items that are too thin to justify browsing. - Keep at most {info.max_news_per_column} items with can_use equal to true. - Use relevance_score from 0 to 10. 
output: - id: $type: int $desc: value from {input.[].id} can_use: $type: bool $desc: whether this item should enter the shortlist for later webpage reading relevance_score: $type: int $desc: ranking score from 0 to 10 for shortlisted items recommend_comment: $type: str $desc: short why-it-matters note if can_use is true, otherwise empty string ================================================ FILE: prompts/summarize_news.yaml ================================================ input: ${news_content} info: news_title: ${news_title} column_requirement: ${column_requirement} language: ${language} instruct: - Read the browsed page content and extract the core facts related to {info.news_title}. - This stage is the final relevance check because you can read the full page content here. - If the page content is too thin, inaccessible, or clearly irrelevant to {info.column_requirement}, set can_summarize to false. - If the page contains enough concrete facts that are meaningfully related to the column, set can_summarize to true even when the fit is not perfect. - summary must be one compact paragraph in {info.language}. - Focus on facts that explain why the story belongs in this column. - When the article is only partially related, summarize only the relevant part instead of rejecting it outright. output: can_summarize: $type: bool $desc: whether the page contains enough useful information to summarize summary: $type: str $desc: one-paragraph summary in {info.language}, or empty string if can_summarize is false ================================================ FILE: prompts/write_column.yaml ================================================ input: ${news_list} info: column_title: ${column_title} column_requirement: ${column_requirement} language: ${language} instruct: - Build the final column using the shortlisted stories. - Keep the strongest non-duplicated stories only. - prologue should be 2 to 3 sentences in {info.language}. 
- When useful, cite key stories with Markdown links like [title](url). output: prologue: $type: str $desc: opening paragraph for this column in {info.language} news_list: $type: - id: $type: int $desc: value from {input.[].id} recommend_comment: $type: str $desc: refined why-it-matters note for the selected story $desc: ordered list of stories to keep in the final column ================================================ FILE: requirements.txt ================================================ agently>=4.0.8.3 PyYAML>=6.0.2 ddgs>=9.10.0 beautifulsoup4>=4.12.3 python-dotenv>=1.0.1 ================================================ FILE: tools/README.md ================================================ # Tools Layer `tools/` 是项目的可替换工具适配层。 默认实现: - `tools/builtin.py` - 直接封装 Agently v4 内置 `Search` / `Browse` 当前入口: - `tools/__init__.py` 如果你想替换为自己的搜索或网页抓取实现,只需要: 1. 新建一个模块,例如 `tools/custom.py` 2. 实现 `SearchToolProtocol` 和 `BrowseToolProtocol` 对应的方法 3. 在 `tools/__init__.py` 中把 `create_search_tool` / `create_browse_tool` 改为导出你的工厂函数 最小接口约束: ```python class SearchToolProtocol(Protocol): async def search_news( self, *, query: str, timelimit: SearchNewsTimeLimit, max_results: int, ) -> list[dict[str, Any]]: ... class BrowseToolProtocol(Protocol): async def browse(self, url: str) -> str: ... 
```
================================================
FILE: tools/__init__.py
================================================
# Public entry point of the replaceable tools layer: re-exports the tool
# protocols and the factory functions for the built-in implementations.
from .base import BrowseToolProtocol, SearchToolProtocol
from .builtin import create_browse_tool, create_search_tool

__all__ = [
    "BrowseToolProtocol",
    "SearchToolProtocol",
    "create_browse_tool",
    "create_search_tool",
]

================================================
FILE: tools/base.py
================================================
from __future__ import annotations

from typing import Any, Protocol

from news_collector.config import SearchNewsTimeLimit


class SearchToolProtocol(Protocol):
    """Structural (duck-typed) interface for a news-search tool."""

    async def search_news(
        self,
        *,
        query: str,
        timelimit: SearchNewsTimeLimit,
        max_results: int,
    ) -> list[dict[str, Any]]: ...


class BrowseToolProtocol(Protocol):
    """Structural (duck-typed) interface for a webpage-fetching tool."""

    async def browse(self, url: str) -> str: ...

================================================
FILE: tools/builtin.py
================================================
from __future__ import annotations

from typing import Any

from agently.builtins.tools import Browse, Search
from ddgs.exceptions import DDGSException

from news_collector.config import AppSettings, SearchNewsTimeLimit

from .base import BrowseToolProtocol, SearchToolProtocol


class AgentlyBuiltinSearchTool(SearchToolProtocol):
    """Search tool wrapping Agently v4's built-in `Search` (ddgs-backed)."""

    def __init__(self, settings: AppSettings):
        # The search-specific proxy takes precedence over the global proxy.
        self._tool = Search(
            proxy=settings.search.proxy or settings.proxy,
            region=settings.search.region,
            backend=settings.search.backend,
        )

    async def search_news(
        self,
        *,
        query: str,
        timelimit: SearchNewsTimeLimit,
        max_results: int,
    ) -> list[dict[str, Any]]:
        """Return news search results for `query`; [] when nothing is found."""
        try:
            results = await self._tool.search_news(
                query=query,
                timelimit=timelimit,
                max_results=max_results,
            )
        except DDGSException as exc:
            # ddgs signals an empty result set by raising; treat that one
            # case as a normal empty answer and re-raise everything else.
            if "No results found" in str(exc):
                return []
            raise
        # Defensive: normalize any non-list payload to an empty result set.
        return results if isinstance(results, list) else []


class AgentlyBuiltinBrowseTool(BrowseToolProtocol):
    """Browse tool wrapping Agently v4's built-in `Browse`."""

    def __init__(self, settings: AppSettings):
        # The browse-specific proxy takes precedence over the global proxy.
        self._tool = Browse(
            proxy=settings.browse.proxy or settings.proxy,
            enable_pyautogui=False,
            enable_playwright=settings.browse.enable_playwright,
            enable_bs4=True,
            response_mode=settings.browse.response_mode,
            max_content_length=settings.browse.max_content_length,
            min_content_length=settings.browse.min_content_length,
            playwright_headless=settings.browse.playwright_headless,
        )

    async def browse(self, url: str) -> str:
        """Fetch `url` and return the extracted page content ("" when empty)."""
        result = await self._tool.browse(url)
        return str(result or "")


def create_search_tool(settings: AppSettings) -> SearchToolProtocol:
    # Factory hook: replace the returned implementation to plug in a
    # custom search tool (see tools/README.md).
    return AgentlyBuiltinSearchTool(settings)


def create_browse_tool(settings: AppSettings) -> BrowseToolProtocol:
    # Factory hook: replace the returned implementation to plug in a
    # custom browse tool (see tools/README.md).
    return AgentlyBuiltinBrowseTool(settings)

================================================
FILE: v3/README.md
================================================

Agently-Daily-News-Collector

English | 中文说明
**Agently Daily News Collector** is an open-source, LLM-based, automated news-collecting workflow showcase project powered by the [**_Agently_** AI application development framework](https://github.com/Maplemx/Agently). You can use this project to generate a news collection on almost any topic. All you need to do is simply input the field topic of your news collection. Then you wait, and the AI agents will do their jobs automatically until a high-quality news collection is generated and saved into a markdown file.

News collection file examples:

`MarkDown File` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models2024-05-02.md)

`PDF File` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models%202024-05-02.pdf)

> **ℹ️ Notice:**
>
> Visit https://github.com/Maplemx/Agently if you want to learn more about the **_Agently_** AI application development framework.

## How to Use

### Step 1: Clone this repo

Run this command in shell:

```shell
git clone git@github.com:AgentEra/Agently-Daily-News-Collector.git
```

### Step 2: Edit settings YAML file

You can find the [`SETTINGS.yaml`](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/SETTINGS.yaml) file in the project dir. Input your model's API key and change other settings as you wish.

If you want to use another model, you can read [this document](https://github.com/Maplemx/Agently/blob/main/docs/guidebook/application_development_handbook.ipynb) or [this Agently official website page](http://agently.tech/features/model_request.html) to see how to set the settings.

### Step 3: Start

Because this project is a Python project, you need to install Python first. You can find installation instructions on the [Python official website](https://www.python.org/).
At the first time to run this project, you should use this command in shell to download and install dependency packages: ```shell pip install -r path/to/project/requirements.txt ``` Wait until the dependency packages are installed then use this command in shell to start the generation process. ```shell python path/to/project/app.py ``` You will see a tip `[Please input the topic of your daily news collection]:`. Input your topic idea about the field of news that you want to collect, then you're good to go. During the process, there'll be some logs printed to shell to present what tasks are done like this: ```shell 2024-05-02 22:44:27,347 [INFO] [Outline Generated] {'report_title': "Today's news about AI Models Appliaction", 'column_list': [{'column_title': 'Latest News', 'column_requirement': 'The content is related to AI Models Appliaction, and the time is within 24 hours', 'search_keywords': 'AI Models Appliaction news latest'}, {'column_title': 'Hot News', 'column_requirement': 'The content is related to AI Models Appliaction, and the interaction is high', 'search_keywords': 'AI Models Appliaction news hot'}, {'column_title': 'Related News', 'column_requirement': 'The content is related to AI Models Appliaction, but not news', 'search_keywords': 'AI Models Appliaction report'}]} 2024-05-02 22:44:32,352 [INFO] [Start Generate Column] Latest News 2024-05-02 22:44:34,132 [INFO] [Search News Count] 8 2024-05-02 22:44:46,062 [INFO] [Picked News Count] 2 2024-05-02 22:44:46,062 [INFO] [Summarzing] With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma 2024-05-02 22:44:52,579 [INFO] [Summarzing] Success 2024-05-02 22:44:57,580 [INFO] [Summarzing] Over 500 AI models are now optimised for Core Ultra processors, says Intel 2024-05-02 22:45:02,130 [INFO] [Summarzing] Success 2024-05-02 22:45:19,475 [INFO] [Column Data Prepared] {'title': 'Latest News', 'prologue': 'Stay up-to-date with the latest 
advancements in AI technology with these news updates: [Yseop Partners with AWS to Develop Generative AI for BioPharma](https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html) and [Intel Optimizes Over 500 AI Models for Core Ultra Processors](https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html).', 'news_list': [{'url': 'https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html', 'title': 'With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma', 'summary': "Yseop utilizes AWS to create a new Generative AI application for the Biopharma sector. This application leverages AWS for its scalability and security, and it allows Biopharma companies to bring pharmaceuticals and vaccines to the market more quickly. Yseop's platform integrates LLM models for generating scientific content while meeting the security standards of the pharmaceutical industry.", 'recommend_comment': 'AWS partnership helps Yseop develop an innovative Generative AI application for the BioPharma industry, enabling companies to expedite the delivery of pharmaceuticals and vaccines to market. The integration of LLM models and compliance with stringent pharmaceutical industry security standards make this a valuable solution for BioPharma companies.'}, {'url': 'https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html', 'title': 'Over 500 AI models are now optimised for Core Ultra processors, says Intel', 'summary': 'Intel stated over 500 AI models are optimized for Core Ultra processors. 
These models are accessible from well-known sources like OpenVINO Model Zoo, Hugging Face, ONNX Model Zoo, and PyTorch.', 'recommend_comment': "Intel's optimization of over 500 AI models for Core Ultra processors provides access to a vast selection of pre-trained models from reputable sources. This optimization enhances the performance and efficiency of AI applications, making it easier for developers to deploy AI solutions on Intel-based hardware."}]}
```

The whole process will take some time, so just relax and have some rest☕️.

### Step 4: Get your news collection markdown file!

When the process finally finishes, you will see a tip like this, with the generated markdown text printed on screen:

```shell
2024-05-02 21:57:20,521 [INFO] [Markdown Generated]
```

Then you can find a markdown file named after the report title (ending in `.md`) in your project dir. Enjoy it! 😄

---

## Main Dependencies

- **Agently AI Development Framework**: https://github.com/Maplemx/Agently | https://pypi.org/project/Agently/
- **duckduckgo-search**: https://pypi.org/project/duckduckgo-search/
- **BeautifulSoup4**: https://pypi.org/project/beautifulsoup4/
- **PyYAML**: https://pypi.org/project/pyyaml/

---

Please ⭐️ this repo and the [Agently](https://github.com/Maplemx/Agently) main repo if you like it! Thank you very much!

> 💡 Ideas / Bug Report: [Report Issues Here](https://github.com/AgentEra/Agently-Daily-News-Collector/issues)
>
> 📧 Email Us: [developer@agently.cn](mailto:developer@agently.cn)
>
> 👾 Discord Group:
>
> [Click Here to Join](https://discord.gg/4HnarMBpYT) or Scan the QR Code Down Below
>
> image
>
> 💬 WeChat Group(加入微信群):
>
> [Click Here to Apply](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan the QR Code Down Below
>
> image

================================================ FILE: v3/README_CN.md ================================================

Agently-Daily-News-Collector

Agently 新闻汇总报告生成器

English Introduction | 中文说明
**Agently新闻汇总报告生成器**是一个基于[**_Agently_** AI应用开发框架](https://github.com/Maplemx/Agently)开发的应用项目。本项目构建了**基于大语言模型驱动的全自动工作流**,能够根据用户输入的主题关键词,自动完成新闻汇总报告的结构设计、栏目组稿(含新闻检索、筛查、总结、栏目信息撰写)及报告MarkDown格式文件的输出全过程。同时,本项目**完全开源**,欢迎开发者们通过Fork->PR的方式共同优化。 新闻汇总报告的样例可参考: `MarkDown文件` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models2024-05-02.md) `PDF文件` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models%202024-05-02.pdf) > 如果您希望进一步了解[**_Agently_** AI应用开发框架](https://github.com/Maplemx/Agently),您可以访问框架的[主仓库地址](https://github.com/Maplemx/Agently)或是[中文官网](http://Agently.cn)阅读更多相关信息,框架提供了丰富的教程和案例,帮助您逐步上手。 ## 如何使用 ### 第一步:将本仓库Clone到本地 在您的开发目录中使用以下Shell脚本指令: ```shell git clone git@github.com:AgentEra/Agently-Daily-News-Collector.git ``` ### 第二步:修改SETTINGS.yaml设置文件 您可以在Clone到本地的项目文件夹中找到[`SETTINGS.yaml`](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/SETTINGS.yaml)这个文件,再根据您的需要修改其中的设置项即可。 下面是具体的设置项说明: ```yaml # Debug Settings IS_DEBUG: false # 如果此项为true,将会输出更多执行过程信息,包括搜索和模型请求的明细信息 # Proxy Settings PROXY: http://127.0.0.1:7890 # 项目中的搜索和模型请求可能会需要使用前向代理,可以通过此项设置代理信息 # Model Settings MODEL_PROVIDER: OAIClient #默认使用OpenAI格式的兼容客户端,此客户端能够适配OpenAI以及各类兼容OpenAI格式的本地模型 MODEL_URL: http://base_url_path # 如果您需要修改Base URL,使用此项进行设置 MODEL_AUTH: api_key: "" # 在这里输入鉴权用的API-Key信息 MODEL_OPTIONS: # 在这里指定模型需要的其他参数,如指定具体的模型,或是调整temperature model: gpt-3.5-turbo temperature: 0.8 # Application Settings MAX_COLUMN_NUM: 3 # 在这里设置汇总报告结构中的专栏数量 OUTPUT_LANGUAGE: Chinese # 在这里设置汇总报告的输出语种,默认为英语,您可能需要手动改成中文 MAX_SEARCH_RESULTS: 8 # 在这里设置每个栏目搜索的最大结果数量 # 注意,如果数量设置过大,可能会导致超出模型的处理窗口大小,请根据模型具体情况设置 SLEEP_TIME: 5 # 在这里设置每次模型请求后的等待时间,以防止频繁请求导致模型拒绝访问 ``` 如果您想要了解切换其他模型的更多细节,可以阅读Agently官方网站关于[模型设置的说明页面](http://agently.tech/features/model_request.html)。 ### 第三步:启动任务
因为本项目为Python项目,您需要在本地安装Python环境。您可以在[Python官方网站](https://www.python.org/)找到适合您的安装方法。 然后,在您的项目目录下使用以下Shell脚本指令更新项目依赖包: ```shell pip install -r requirements.txt ``` 依赖包安装完毕后,通过以下Shell脚本指令即可启动: ```shell python app.py ``` 随后您会看到一个提示:`[Please input the topic of your daily news collection]:`。 根据提示输入您想要汇总的新闻领域主题关键词,或是用一句话描述您想要生成什么样的新闻汇总报告,然后任务就会开始自动运行了。在这里,您可以输入任何语种的内容,但生成内容的语种会和您在第二步中的设置的语种要求相同。 接下来您就可以等待运行的结果了,整个过程大约需要5-8分钟。 在运行的过程中,您会看到类似下面展示的输出日志,这些日志将帮助您了解当前在处理的任务,以及运行的关键进展情况: ```shell 2024-05-02 22:44:27,347 [INFO] [Outline Generated] {'report_title': "Today's news about AI Models Appliaction", 'column_list': [{'column_title': 'Latest News', 'column_requirement': 'The content is related to AI Models Appliaction, and the time is within 24 hours', 'search_keywords': 'AI Models Appliaction news latest'}, {'column_title': 'Hot News', 'column_requirement': 'The content is related to AI Models Appliaction, and the interaction is high', 'search_keywords': 'AI Models Appliaction news hot'}, {'column_title': 'Related News', 'column_requirement': 'The content is related to AI Models Appliaction, but not news', 'search_keywords': 'AI Models Appliaction report'}]} 2024-05-02 22:44:32,352 [INFO] [Start Generate Column] Latest News 2024-05-02 22:44:34,132 [INFO] [Search News Count] 8 2024-05-02 22:44:46,062 [INFO] [Picked News Count] 2 2024-05-02 22:44:46,062 [INFO] [Summarzing] With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma 2024-05-02 22:44:52,579 [INFO] [Summarzing] Success 2024-05-02 22:44:57,580 [INFO] [Summarzing] Over 500 AI models are now optimised for Core Ultra processors, says Intel 2024-05-02 22:45:02,130 [INFO] [Summarzing] Success 2024-05-02 22:45:19,475 [INFO] [Column Data Prepared] {'title': 'Latest News', 'prologue': 'Stay up-to-date with the latest advancements in AI technology with these news updates: [Yseop Partners with AWS to Develop Generative AI for 
BioPharma](https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html) and [Intel Optimizes Over 500 AI Models for Core Ultra Processors](https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html).', 'news_list': [{'url': 'https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html', 'title': 'With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma', 'summary': "Yseop utilizes AWS to create a new Generative AI application for the Biopharma sector. This application leverages AWS for its scalability and security, and it allows Biopharma companies to bring pharmaceuticals and vaccines to the market more quickly. Yseop's platform integrates LLM models for generating scientific content while meeting the security standards of the pharmaceutical industry.", 'recommend_comment': 'AWS partnership helps Yseop develop an innovative Generative AI application for the BioPharma industry, enabling companies to expedite the delivery of pharmaceuticals and vaccines to market. The integration of LLM models and compliance with stringent pharmaceutical industry security standards make this a valuable solution for BioPharma companies.'}, {'url': 'https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html', 'title': 'Over 500 AI models are now optimised for Core Ultra processors, says Intel', 'summary': 'Intel stated over 500 AI models are optimized for Core Ultra processors. These models are accessible from well-known sources like OpenVINO Model Zoo, Hugging Face, ONNX Model Zoo, and PyTorch.', 'recommend_comment': "Intel's optimization of over 500 AI models for Core Ultra processors provides access to a vast selection of pre-trained models from reputable sources. 
This optimization enhances the performance and efficiency of AI applications, making it easier for developers to deploy AI solutions on Intel-based hardware."}]} ``` ### 第四步:得到一份新鲜出炉的新闻汇总报告📰! 在整个处理过程结束时,您将会看到类似下方的提示,并可以看到完整的报告MarkDown格式结果被输出到屏幕上: ```shell 2024-05-02 21:57:20,521 [INFO] [Markdown Generated] ``` 同时,您也可以在您的项目文件夹中找到一份命名格式为`<汇总报告名称> <生成日期>.md`的文件。 大功告成!🎉 --- ## 主要依赖说明 - Agently AI应用开发框架:https://github.com/Maplemx/Agently | https://pypi.org/project/Agently/ | http://Agently.cn - duckduckgo-search: https://pypi.org/project/duckduckgo-search/ - BeautifulSoup4: https://pypi.org/project/beautifulsoup4/ - PyYAML: https://pypi.org/project/pyyaml/ --- 如果您喜欢这个项目,请为本项目以及[Agently框架主仓库](https://github.com/Maplemx/Agently)点亮⭐️。 如果您希望了解更多关于本项目的线上产品化版本信息,欢迎通过下面的方式加入我们的讨论群,我们将在近期组织线上产品化版本的测试。 > 💡 意见反馈/Bug提交: [Report Issues Here](https://github.com/AgentEra/Agently-Daily-News-Collector/issues) > > 📧 联系我们: [developer@agently.cn](mailto:developer@agently.cn) > > 💬 加入微信讨论群: > > [点击这里填写申请表](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf)或扫描下方二维码申请入群 > > image ================================================ FILE: v3/SETTINGS.yaml ================================================ # Debug Settings IS_DEBUG: false # Proxy Settings #PROXY: http://127.0.0.1:7890 # Model Settings MODEL_PROVIDER: OAIClient #MODEL_URL: MODEL_AUTH: api_key: "" MODEL_OPTIONS: model: gpt-3.5-turbo # Application Settings MAX_COLUMN_NUM: 3 OUTPUT_LANGUAGE: English MAX_SEARCH_RESULTS: 8 SLEEP_TIME: 5 # Outline Settings USE_CUSTOMIZE_OUTLINE: false CUSTOMIZE_OUTLINE: report_title: "Today's News about Large Model Applications" column_list: - column_title: New Apps column_requirement: Looking for those applications powered by large models which announced recently search_keywords: large model application announce this week - column_title: Hot Apps column_requirement: Looking for those applications powered by large models which are most popular or are discussed most search_keywords: large 
model application popular hot - column_title: Fun Apps column_requirement: Looking for those applications powered by large models which are funny or inspirational search_keywords: large model application cool fun inspire ================================================ FILE: v3/app.py ================================================ import Agently import utils.yaml_reader as yaml from utils.logger import Logger from workflows import main_workflow from utils.path import root_path # Settings and Logger SETTINGS = yaml.read("./SETTINGS.yaml") logger = Logger(console_level = "DEBUG" if SETTINGS.IS_DEBUG else "INFO") # Agent Factory agent_factory = ( Agently.AgentFactory(is_debug=SETTINGS.IS_DEBUG) .set_settings("current_model", SETTINGS.MODEL_PROVIDER) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.auth", SETTINGS.MODEL_AUTH) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.url", SETTINGS.MODEL_URL if hasattr(SETTINGS, "MODEL_URL") else None) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.options", SETTINGS.MODEL_OPTIONS if hasattr(SETTINGS, "MODEL_OPTIONS") else {}) ) # Start Workflow main_workflow.start( agent_factory=agent_factory, SETTINGS=SETTINGS, root_path=root_path, logger=logger, ) ================================================ FILE: v3/examples/Latest Updates on AI Models2024-05-02.md ================================================ # Latest Updates on AI Models > 2024-05-02 Thursday ## Industry Trends ### PROLOGUE > The selected news articles are related to current trends and developments in the field of AI models. They cover various aspects of AI implementation across industries, from real-time pharma news delivery and control room optimization to accelerated adoption of AI and competitive dynamics in the AI sector. The articles provide insights into how AI is shaping operational efficiency, innovation, decision-making, and industry-specific tasks, reflecting the increasing utilization of AI technology for growth and productivity enhancement. 
### NEWS LIST - [AppliedXL Collaborates with Bloomberg to Provide AI-Powered, Real-Time Pharma News on the Bloomberg Terminal](https://www.lelezard.com/en/news-21360719.html) - `[summray]` AppliedXL collaborates with Bloomberg to provide AI-powered, real-time pharma news on the Bloomberg Terminal. The collaboration aims to deliver key insights to help users stay ahead of catalyst events in the pharmaceutical industry. AppliedXL's AI technology analyzes live public data to uncover signals and trends, which are then distilled into early news stories included in real-time news feeds for early signal detection and market analysis. The collaboration focuses on the life sciences and biopharma space, alerting users to irregularities in clinical trial progressions and other market-moving events. AppliedXL's AI technology combines machine learning and human expertise to provide precise and contextualized information efficiently. - `[comment]` This news article discusses how AppliedXL collaborates with Bloomberg to provide AI-powered, real-time pharma news on the Bloomberg Terminal. The collaboration aims to deliver key insights to help users stay ahead of catalyst events in the pharmaceutical industry, showcasing the use of AI in delivering real-time industry updates. - [AI for control rooms](https://www.symmetrymagazine.org/article/ai-for-control-rooms?language_content_entity=und) - `[summray]` AI is being utilized in control rooms within the fields of particle physics and astrophysics to assist with complex tasks. From machine learning algorithms helping to keep particle beams flowing in accelerators to optimizing telescope scheduling for studying galaxies, AI is proving to be a valuable tool for scientists. Additionally, AI is being developed to aid electric grid operators in managing the increasing number of energy resources connecting to the grid. 
The goal is not to replace human operators but to enhance decision-making by presenting them with the best tool options immediately and learning from human feedback. - `[comment]` The article highlights the use of AI in control rooms within particle physics and astrophysics, assisting with complex tasks. It showcases how AI is enhancing decision-making and presenting the best tool options immediately to human operators, aligning with the current trend of utilizing AI to optimize processes. - [Six AI industry trends we're tracking in 2024 (and beyond)](https://diginomica.com/six-ai-industry-trends-were-tracking-2024-and-beyond) - `[summray]` In 2024, the adoption of AI across industries has accelerated significantly, with projections indicating that by 2040, 1.3 million businesses will be utilizing AI to drive innovation. Various sectors such as the telecom industry, manufacturing, energy, utilities, construction, asset-centric service providers, and defense companies are leveraging AI and automation to enhance operational efficiency, drive performance, accelerate evolution, alleviate challenges, transform fleet management, and strengthen cybersecurity. As organizations invest in advanced technology like AI to optimize processes and automate industry-specific tasks, the potential for growth and productivity enhancement is vast, signaling a shift towards more resilient and digitally transformed operations. - `[comment]` The content discusses the accelerated adoption of AI across various industries, enhancing operational efficiency and driving innovation. It reflects the trend of organizations investing in advanced technology like AI to optimize processes and automate industry-specific tasks for enhanced growth and productivity. 
- [Microsoft's Fear Of Google's AI Dominance Led To OpenAI Investment, Internal Email Reveals: 'We're Multiple Years Behind The Competition'](https://www.benzinga.com/news/24/05/38582364/microsofts-fear-of-googles-ai-dominance-led-to-openai-investment-internal-email-reveals-were-multipl) - `[summray]` An internal email from Microsoft Corp. revealed that the company's investment in OpenAI was motivated by the fear of falling behind Google in AI capabilities. Microsoft's chief technology officer Kevin Scott expressed concerns about the lack of machine learning scale, infrastructure, and development speed compared to Google and OpenAI. The email highlighted the intense competition in the AI space, with Microsoft investing over $13 billion in OpenAI to enhance various services. The email sheds light on the rivalry between Microsoft and Google in the AI sector, with Google introducing Bard (now Gemini) to compete with OpenAI's ChatGPT, facing some challenges during the launch. This news article reflects the current trends and developments in AI models and the competitive landscape in the industry. - `[comment]` The news reveals Microsoft's investment in OpenAI motivated by the fear of falling behind Google in AI capabilities. It sheds light on the intense competition in the AI space, showcasing the current trends and developments in AI models and the competitive landscape in the industry. - [Q1 2024 Cognizant Technology Solutions Corp Earnings Call](https://finance.yahoo.com/news/q1-2024-cognizant-technology-solutions-123608449.html) - `[summray]` Cognizant Technology Solutions reported on their Q1 2024 Earnings Call, highlighting progress against strategic priorities in a challenging demand environment. They delivered revenue growth exceeding guidance, expanded adjusted operating margin, and noted improvements in voluntary attrition. The company saw sequential growth in Health Sciences and Communications, Media and Technology, with declines in Financial Services. 
The demand environment remains uncertain, shifting client spending to cost-saving projects. Cognizant focuses on innovation, including AI, cloud, and digital technologies. They mentioned partnerships with Microsoft, Google Cloud, and NVIDIA for AI initiatives. The company emphasized the importance of collaboration, cited recognition for innovation, and highlighted their Bluebolt grassroots initiative. Overall, they aim to increase revenue growth, become an employer of choice, and simplify operations. - `[comment]` Cognizant Technology Solutions' Q1 2024 Earnings Call highlights their focus on innovation, including AI, cloud, and digital technologies. The partnerships with Microsoft, Google Cloud, and NVIDIA for AI initiatives showcase the ongoing trend of companies leveraging AI for growth and becoming employers of choice. ## Innovations and Research ### PROLOGUE > Recent innovations and breakthroughs in the AI models domain are highlighted in the selected news articles. China's advancements in AI technologies, including the SenseNova 5.0 large language model and Vidu text-to-video AI tool, demonstrate the country's commitment to cutting-edge AI developments. Additionally, the rise of generative AI is emphasized as a key trend for driving innovation and organizational growth. Furthermore, a team of researchers has outlined guidelines for the responsible use of machine learning in science, aiming to enhance credibility and reproducibility in research. 
Explore more about these advancements and guidelines in the following articles: [China's AI Advances](https://swarajyamag.com/technology/chinas-ai-advances-that-are-flying-under-the-radar), [Generative AI's Exponential Potential](https://www.forbes.com/sites/forbestechcouncil/2024/05/02/innovators-should-seize-on-generative-ais-exponential-potential/), [Science's AI Problem](https://www.sciencedaily.com/releases/2024/05/240501153055.htm) ### NEWS LIST - [China's AI Advances That Are Flying Under The Radar](https://swarajyamag.com/technology/chinas-ai-advances-that-are-flying-under-the-radar) - `[summray]` China is making significant advancements in Artificial Intelligence (AI), with recent releases rivalling those in the United States. SenseTime unveiled the SenseNova 5.0 large language model (LLM) with impressive capabilities in knowledge, mathematics, reasoning, and coding. The model surpasses OpenAI's GPT-4 Turbo and tops various multimodal benchmarks. Another innovation is Vidu, a text-to-video AI tool that can generate 16-second videos based on simple text prompts. Additionally, Stardust Intelligence introduced the Astribot S1 humanoid robot, capable of performing household chores and imitating human movements. China is demonstrating seriousness in its AI ambitions, with over 40 approved AI models for public use and a vision to empower billions of people with AI robot assistants. - `[comment]` China's advancements in AI, such as the SenseNova 5.0 large language model and Vidu text-to-video AI tool, showcase the country's commitment to innovative technologies in the AI domain. - [Innovators Should Seize On Generative AI's Exponential Potential](https://www.forbes.com/sites/forbestechcouncil/2024/05/02/innovators-should-seize-on-generative-ais-exponential-potential/) - `[summray]` Generative AI is identified as a significant trend in the tech industry that necessitates rapid adaptation. 
The market for generative AI is projected to grow rapidly, with organizations investing in the technology to drive innovation. McKinsey details how generative AI can accelerate organizational growth by rapidly processing information, writing code for self-improvement, and enhancing competitive edge. By utilizing generative AI tools tailored for each phase of innovation, organizations can revamp their innovation processes to tap into the technology's potential. The importance of experimentation, prototyping, and scaling is emphasized, with generative AI offering various tools to aid in these processes. The democratization of innovation across employees and the augmentation of emerging technologies hold promise for accelerating organization's adaptability and competitiveness in leveraging generative AI for innovation. - `[comment]` Generative AI is a crucial trend in tech, with potential to drive rapid innovation and organizational growth. Organizations should leverage generative AI tools for revolutionizing their innovation processes. - [Science has an AI problem: This group says they can fix it](https://www.sciencedaily.com/releases/2024/05/240501153055.htm) - `[summray]` An interdisciplinary team of 19 researchers, led by Princeton University computer scientists Arvind Narayanan and Sayash Kapoor, has published guidelines for the responsible use of machine learning in science to address the credibility crisis in research caused by deep flaws in machine learning methods. The guidelines focus on transparency and integrity, calling for detailed descriptions of machine learning models, code, data, hardware specifications, experimental design, and project goals. The aim is to ensure reproducibility of results, validate claims, and accelerate scientific progress by improving the quality of published papers. - `[comment]` A team of researchers has provided guidelines for responsible use of machine learning in science to address credibility issues. 
Transparency and integrity in machine learning models are crucial for reproducibility of results and scientific progress. ## Future Outlook ### PROLOGUE > The following article delves into the future prospects, challenges, and potential advancements of AI models in the context of business operations and employee dynamics. It explores the impact of technological advancements, particularly Artificial Intelligence (AI), on businesses and employees, focusing on layoffs resulting from automation. For more information, you can visit the article [Layoffs in the wake of technological advancements: The inherent benefits for businesses and employees](https://www.ghanaweb.com/GhanaHomePage/business/Layoffs-in-the-wake-of-technological-advancements-The-inherent-benefits-for-businesses-and-employees-1928854). ### NEWS LIST - [Layoffs in the wake of technological advancements: The inherent benefits for businesses and employees](https://www.ghanaweb.com/GhanaHomePage/business/Layoffs-in-the-wake-of-technological-advancements-The-inherent-benefits-for-businesses-and-employees-1928854) - `[summray]` The article discusses the impact of technological advancements, particularly Artificial Intelligence (AI), on businesses and employees, focusing on layoffs as a result of automation. It highlights the benefits and challenges of AI in the workplace, such as increased productivity, job displacement, and layoff exercises. The causes of layoffs, including economic downturns, technological advancements, restructuring, shifting consumer preferences, and cost-saving measures, are explored, along with the opportunities they bring for businesses. Additionally, the article outlines the benefits of layoffs for employees, such as severance packages, career reevaluation, increased market value, networking opportunities, personal growth, and entrepreneurial opportunities. 
- `[comment]` The article provides insights into the impact of technological advancements, specifically AI, on businesses and employees, highlighting the challenges and benefits associated with layoffs. It is a relevant read for understanding the future prospects of AI models in the workplace. --- Powered by [Agently AI Application Development Framework & Agently Workflow](https://github.com/Maplemx/Agently) Model Information:OAIClient - {'model': 'gpt-3.5-turbo'} **_Agently_** [Guidebook](https://github.com/Maplemx/Agently/blob/main/docs/guidebook) [Apply Developers WeChat Group](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan QR Code to Apply. image ================================================ FILE: v3/prompts/create_outline.yaml ================================================ input: ${topic} instruct: task: prepare news collection outline according {input}'s topic output language: ${language} output: report_title: $type: str $desc: generate a title for this news collection like "daily news about sports", "today's news about finance" column_list: $type: - column_title: $type: str column_requirement: $type: str $desc: describe recheck standard about the contents in this column to make sure all contents are aimed at the requirement of {input}'s topic search_keywords: $type: str $desc: search keywords for this column splited by space. make sure the filed keyword about {input} is included in keywords. 
$desc: the number of columns <= ${max_column_num} ================================================ FILE: v3/prompts/pick_news.yaml ================================================ input: ${column_news} instruct: news select rules: - ${column_requirement} - if several news are similar, just retain the one with most famous source and output {can_use} as false for others output: - id: $type: int $desc: value from {input.[].id} can_use: $type: bool $desc: judge if {input.brief} can be used according {instruct} recommend_comment: $type: str $desc: provide your recommend comment if {can_use} == true, or just output null ================================================ FILE: v3/prompts/summarize.yaml ================================================ input: ${news_content} info: column requirement: ${column_requirement} news title: ${news_title} instruct: output language: ${language} summary rule: - find and summarize the main content part of the news content which is collected from webpage - summary focus on relative content to {column requirement} and {news title} - summary in one paragraph without linebreak output: can_summarize: $type: bool $desc: judge if {input} has enough relative content to be summarized translated_title: $type: str $desc: translate {input.news title} into ${language} summary: $type: str $desc: summarize {input} according {info} and {instruct} if {can_summarize} == true, or output null ================================================ FILE: v3/prompts/write_column.yaml ================================================ input: ${slimmed_news} info: column requirement: ${column_requirement} instruct: news select rules: - if there're serveral similar content news, only select one of them into {news_list} - all news selected must follow or be relative to {column requirement} output language: ${language} output: news_list: $type: - id: $type: int $desc: value from {input.[].id} recommend_comment: $type: str $desc: provide your recommend comment of this news 
according your role and {column requirement} $desc: select news into column list according {news select rules} from {input} prologue: $type: str $desc: write a prologue for readers according {news_list} and {news select rules}, you can use [](news url) to mark key information ================================================ FILE: v3/requirements.txt ================================================ Agently>=3.2.2.8 PyYAML==6.0.1 duckduckgo_search>=5.3.0 beautifulsoup4>=4.12.3 ================================================ FILE: v3/utils/__init__.py ================================================ ================================================ FILE: v3/utils/logger.py ================================================ import os import logging logging.getLogger().setLevel(logging.NOTSET) class Logger(object): def __init__(self, **kwargs): name = kwargs.get("name", "Agently-Daily-News-Collector") log_level = kwargs.get("log_level", "ERROR") console_level = kwargs.get("console_level", "INFO") log_format = kwargs.get("format", "%(asctime)s\t[%(levelname)s]\t%(message)s") log_path = kwargs.get("path", "./logs/Agently_daily_news_collector.log") handlers = kwargs.get("handlers", []) self.logger = logging.getLogger(name) if self.logger.hasHandlers(): self.logger.handlers.clear() stream_handler = logging.StreamHandler() stream_handler.setLevel(getattr(logging, console_level)) stream_handler.setFormatter(logging.Formatter(log_format)) self.logger.addHandler(stream_handler) file_handler = logging.FileHandler(log_path) file_handler.setLevel(getattr(logging, log_level)) file_handler.setFormatter(logging.Formatter(log_format)) self.logger.addHandler(file_handler) for handler in handlers: self.logger.addHandler(handler) def __transform(self, *args, **kwargs): message = "" for arg in args: message += f"{ arg }\t" message = message[:-1] kwargs_to_list = [] kwargs_message = "" for key, value in kwargs.items(): kwargs_to_list.append(f"{ key }: { str(value) }") kwargs_message += 
"\t".join(kwargs_to_list) if kwargs_message != "": message += f"\t{ kwargs_message }" return message def debug(self, *args, **kwargs): return self.logger.debug(self.__transform(*args, **kwargs)) def info(self, *args, **kwargs): return self.logger.info(self.__transform(*args, **kwargs)) def warning(self, *args, **kwargs): return self.logger.warning(self.__transform(*args, **kwargs)) def error(self, *args, **kwargs): return self.logger.error(self.__transform(*args, **kwargs)) def critical(self, *args, **kwargs): return self.logger.critical(self.__transform(*args, **kwargs)) logger = Logger() ================================================ FILE: v3/utils/path.py ================================================ import os root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ================================================ FILE: v3/utils/yaml_reader.py ================================================ import yaml from types import SimpleNamespace class YAMLResult(SimpleNamespace): pass def read(yaml_path:str): try: with open(yaml_path, "r") as yaml_file: yaml_dict = yaml.safe_load(yaml_file) return YAMLResult(**yaml_dict) except Exception as e: raise Exception(f"[YAML Reader] Error occured when read YAML from path '{ yaml_path }'.\nError: { str(e) }") ================================================ FILE: v3/workflows/__init__.py ================================================ ================================================ FILE: v3/workflows/column_workflow.py ================================================ import time import Agently from .tools.search import search from .tools.browse import browse def start(column_outline, *, agent_factory, SETTINGS, root_path, logger): tool_proxy = ( SETTINGS.TOOL_PROXY if hasattr(SETTINGS, "TOOL_PROXY") else ( SETTINGS.PROXY if hasattr(SETTINGS, "PROXY") else None ) ) logger.info("[Start Generate Column]", column_outline["column_title"]) column_workflow = Agently.Workflow() column_editor_agent = 
agent_factory.create_agent() # You can set column editor agent here, read https://github.com/Maplemx/Agently/tree/main/docs/guidebook to explore """ ( column_editor_agent .set_role("...") .set_user_info("...") ) """ # Define Workflow Chunks @column_workflow.chunk("start", type="Start") @column_workflow.chunk("search") def search_executor(inputs, storage): storage.set( "searched_news", search( column_outline["search_keywords"], timelimit=SETTINGS.NEWS_TIME_LIMIT if hasattr(SETTINGS, "NEWS_TIME_LIMIT") else "d", proxy=tool_proxy, logger=logger, ) ) @column_workflow.chunk("pick_news") def pick_news_executor(inputs, storage): searched_news = storage.get("searched_news", []) logger.info("[Search News Count]", len(searched_news)) if len(searched_news) > 0: pick_results = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/pick_news.yaml", variables={ "column_news": searched_news, "column_requirement": column_outline["column_requirement"], } ) .start() ) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) picked_news = [] for pick_result in pick_results: if pick_result["can_use"]: news = searched_news[int(pick_result["id"])].copy() news.update({ "recommend_comment": pick_result["recommend_comment"] }) picked_news.append(news) storage.set("picked_news", picked_news) logger.info("[Picked News Count]", len(picked_news)) else: storage.set("picked_news", []) logger.info("[Picked News Count]", 0) @column_workflow.chunk("read_and_summarize") def read_and_summarize_executor(inputs, storage): picked_news = storage.get("picked_news", []) readed_news = [] if picked_news and len(picked_news) > 0: for news in picked_news: logger.info("[Summarzing]", news["title"]) news_content = browse( news["url"], proxy=tool_proxy, logger=logger, ) if news_content and news_content != "": try: summary_result = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/summarize.yaml", variables={ "news_content": news_content, "column_requirement": 
column_outline["column_requirement"], "news_title": news["title"], "language": SETTINGS.OUTPUT_LANGUAGE, } ) .start() ) if summary_result["can_summarize"]: readed_news_info = news.copy() readed_news_info.update({ "title": summary_result["translated_title"], "summary": summary_result["summary"] }) readed_news.append(readed_news_info) logger.info("[Summarzing]", "Success") else: logger.info("[Summarzing]", "Failed") # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) except Exception as e: logger.error(f"[Summarzie]: Can not summarize '{ news['title'] }'.\tError: { str(e) }") storage.set("readed_news", readed_news) @column_workflow.chunk("write_column") def write_column_executor(inputs, storage): readed_news = storage.get("readed_news", []) if readed_news and len(readed_news) > 0: slimmed_news = [] for index, news in enumerate(readed_news): slimmed_news.append({ "id": index, "title": news["title"], "summary": news["summary"], "url": news["url"], }) column_result = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/write_column.yaml", variables={ "slimmed_news": slimmed_news, "column_requirement": column_outline["column_requirement"], "language": SETTINGS.OUTPUT_LANGUAGE, } ) .start() ) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) final_news_list = [] for news in column_result["news_list"]: id = news["id"] final_news_list.append({ "url": readed_news[id]["url"], "title": readed_news[id]["title"], "summary": readed_news[id]["summary"], "recommend_comment": news["recommend_comment"], }) storage.set("final_result", { "title": column_outline["column_title"], "prologue": column_result["prologue"], "news_list": final_news_list, }) else: storage.set("final_result", None) # Connect Chunks ( column_workflow.chunks["start"] .connect_to(column_workflow.chunks["search"]) .connect_to(column_workflow.chunks["pick_news"]) .connect_to(column_workflow.chunks["read_and_summarize"]) 
.connect_to(column_workflow.chunks["write_column"]) ) # Start Workflow column_workflow.start() return column_workflow.executor.store.get("final_result") ================================================ FILE: v3/workflows/main_workflow.py ================================================ import time import Agently from datetime import datetime from .column_workflow import start as start_column_workflow def start(*, agent_factory, SETTINGS, root_path, logger): main_workflow = Agently.Workflow() chief_editor_agent = agent_factory.create_agent() # You can set chief editor agent here, read https://github.com/Maplemx/Agently/tree/main/docs/guidebook to explore """ ( chief_editor_agent .set_role("...") .set_user_info("...") ) """ # Define Workflow Chunks @main_workflow.chunk("start", type="Start") @main_workflow.chunk("input_topic") def input_topic_executor(inputs, storage): if not SETTINGS.USE_CUSTOMIZE_OUTLINE: storage.set( "topic", input("[Please input the topic of your news collection]: ") ) @main_workflow.chunk("generate_outline") def generate_outline_executor(inputs, storage): if SETTINGS.USE_CUSTOMIZE_OUTLINE: storage.set("outline", SETTINGS.CUSTOMIZE_OUTLINE) logger.info("[Use Customize Outline]", SETTINGS.CUSTOMIZE_OUTLINE) else: # Load prompt from /prompts/create_outline.yaml outline = ( chief_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/create_outline.yaml", variables={ "topic": storage.get("topic"), "news_time_limit": SETTINGS.NEWS_TIME_LIMIT if hasattr(SETTINGS, "NEWS_TIME_LIMIT") else "d", "language": SETTINGS.OUTPUT_LANGUAGE, "max_column_num": SETTINGS.MAX_COLUMN_NUM, } ) .start() ) storage.set("outline", outline) logger.info("[Outline Generated]", outline) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) @main_workflow.chunk("generate_columns") def generate_columns_executor(inputs, storage): columns_data = [] outline = storage.get("outline") for column_outline in outline["column_list"]: column_data = 
start_column_workflow( column_outline=column_outline, agent_factory=agent_factory, SETTINGS=SETTINGS, root_path=root_path, logger=logger, ) if column_data: columns_data.append(column_data) logger.info("[Column Data Prepared]", column_data) storage.set("columns_data", columns_data) @main_workflow.chunk("generate_markdown") def generate_markdown_executor(inputs, storage): outline = storage.get("outline") columns_data = storage.get("columns_data") if columns_data and len(columns_data) > 0: # Main Title md_doc_text = f'# { outline["report_title"] }\n\n' md_doc_text += f'> { datetime.now().strftime("%Y-%m-%d %A") }\n\n' # Columns if SETTINGS.IS_DEBUG: logger.debug("[Columns Data]", columns_data) for column_data in columns_data: md_doc_text += f'## { column_data["title"] }\n\n### PROLOGUE\n\n' md_doc_text += f'> { column_data["prologue"] }\n\n' md_doc_text += f"### NEWS LIST\n\n" for single_news in column_data["news_list"]: md_doc_text += f'- [{ single_news["title"] }]({ single_news["url"] })\n\n' md_doc_text += f' - `[summray]` { single_news["summary"] }\n' md_doc_text += f' - `[comment]` { single_news["recommend_comment"] }\n\n' # Tailer md_doc_text +="\n\n---\n\nPowered by [Agently AI Application Development Framework & Agently Workflow](https://github.com/Maplemx/Agently)\n\n" md_doc_text += f"Model Information:{ SETTINGS.MODEL_PROVIDER if hasattr(SETTINGS, 'MODEL_PROVIDER') else 'OpenAI' } - { str(SETTINGS.MODEL_OPTIONS) if hasattr(SETTINGS, 'MODEL_OPTIONS') else 'Default Options' }\n\n" md_doc_text += '**_Agently_** [Guidebook](https://github.com/Maplemx/Agently/blob/main/docs/guidebook)\n\n[Apply Developers WeChat Group](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan QR Code to Apply.\n\nimage' logger.info("[Markdown Generated]", md_doc_text) with open(f'{ root_path }/{ outline["report_title"] }_{ datetime.now().strftime("%Y-%m-%d") }.md', 'w', encoding='utf-8') as f: f.write(md_doc_text) else: logger.info("[Markdown Generation Failed] 
def browse(url, *, logger=None, proxy=None):
    """Fetch *url* and extract readable text (headings rendered as markdown).

    Fix: ``requests.get`` previously had no timeout, so a single dead host
    could hang the whole pipeline indefinitely.

    Returns "" on any failure (logged when a logger is provided).
    """
    content = ""
    try:
        request_options = {
            "headers": {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            },
            # Bound both connect and read so a stalled server cannot block us.
            "timeout": 30,
        }
        if proxy:
            # NOTE(review): only the matching scheme is proxied — requests to
            # the other scheme go direct. Confirm this is intentional.
            if proxy.startswith("http:"):
                request_options.update({"proxies": {"http": proxy}})
            elif proxy.startswith("https:"):
                request_options.update({"proxies": {"https": proxy}})
        page = requests.get(
            url,
            **request_options
        )
        soup = BeautifulSoup(page.content, "html.parser")
        # find text in headings, p, pre (github code), td
        chunks = soup.find_all(["h1", "h2", "h3", "h4", "h5", "p", "pre", "td"])
        for chunk in chunks:
            if chunk.name.startswith("h"):
                content += "#" * int(chunk.name[-1]) + " " + chunk.get_text() + "\n"
            else:
                text = chunk.get_text()
                if text and text != "":
                    content += text + "\n"
        # find text in div that class=content
        divs = soup.find("div", class_="content")
        if divs:
            chunks_with_text = divs.find_all(text=True)
            for chunk in chunks_with_text:
                if isinstance(chunk, str) and chunk.strip():
                    content += chunk.strip() + "\n"
        content = re.sub(r"\n+", "\n", content)
        return content
    except Exception as e:
        if logger:
            logger.error(f"[Browse]: Can not browse '{ url }'.\tError: { str(e) }")
        return ""
def search(keywords, **kwargs):
    """Run a DuckDuckGo news search and return normalized result dicts.

    Optional keyword arguments: ``proxy``, ``max_results`` (default 8),
    ``timelimit`` (default "d"), ``logger``. Returns [] on any failure,
    logging the error when a logger was supplied.
    """
    collected = []
    try:
        with DDGS(proxy=kwargs.get("proxy", None)) as ddgs:
            raw_items = ddgs.news(
                keywords,
                max_results=kwargs.get("max_results", 8),
                timelimit=kwargs.get("timelimit", "d"),
            )
            for index, item in enumerate(raw_items):
                collected.append({
                    "id": index,
                    "title": item["title"],
                    "brief": item["body"],
                    "url": item["url"],
                    "source": item["source"],
                    "date": item["date"],
                })
        return collected
    except Exception as e:
        if "logger" in kwargs:
            kwargs["logger"].error(f"[Search]: Can not search '{ keywords }'.\tError: { str(e) }")
        return []
def create_pick_column_news_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that filters searched news down to usable picks."""

    async def pick_column_news(data: TriggerFlowRuntimeData) -> dict[str, Any] | None:
        context = _coerce_column_context(data.value)
        if context is None:
            return None
        outline = context["column_outline"]
        candidates = context["searched_news"]
        column_title = str(outline.get("column_title") or "").strip()
        logger = require_logger(data)
        try:
            selected = await pick_news(
                config,
                outline,
                candidates,
            )
        except Exception as exc:
            logger.exception("[Column Pick Failed] %s: %s", column_title, exc)
            return None
        logger.info("[Picked News Count] %s => %s", column_title, len(selected))
        # An empty pick aborts the column; downstream chunks expect real picks.
        if not selected:
            return None
        return {**context, "picked_news": selected}

    return pick_column_news


def create_write_column_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that turns summarized news into the final column dict."""

    async def write_column(data: TriggerFlowRuntimeData) -> dict[str, Any] | None:
        context = _coerce_column_context(
            data.value,
            require_picked=True,
            require_summarized=True,
        )
        if context is None:
            return None
        outline = context["column_outline"]
        column_title = str(outline.get("column_title") or "").strip()
        logger = require_logger(data)
        try:
            column_result = await _write_column(
                config,
                outline,
                context["summarized_news"],
            )
        except Exception as exc:
            logger.exception("[Column Write Failed] %s: %s", column_title, exc)
            return None
        logger.info("[Column Ready] %s", column_title)
        return column_result

    return write_column
return request if isinstance(request, dict) else {} def _coerce_column_context( value: Any, *, require_picked: bool = False, require_summarized: bool = False, ) -> dict[str, Any] | None: if not isinstance(value, dict): return None column_outline = value.get("column_outline") searched_news = value.get("searched_news") if not isinstance(column_outline, dict) or not isinstance(searched_news, list): return None context: dict[str, Any] = { "column_outline": column_outline, "searched_news": searched_news, } picked_news = value.get("picked_news") if picked_news is not None: if not isinstance(picked_news, list): return None context["picked_news"] = picked_news elif require_picked: return None summarized_news = value.get("summarized_news") if summarized_news is not None: if not isinstance(summarized_news, list): return None context["summarized_news"] = summarized_news elif require_summarized: return None return context async def search_news( config: DailyNewsChunkConfig, logger, search_tool, column_outline: dict[str, Any], *, topic: str, ) -> list[dict[str, Any]]: query = str(column_outline.get("search_keywords") or "").strip() if not query: return [] queries = build_search_queries( search_keywords=query, topic=topic, ) normalized_results = [] seen_urls: set[str] = set() for candidate in queries: try: raw_results = await search_tool.search_news( query=candidate, timelimit=config.settings.search.timelimit, max_results=config.settings.search.max_results, ) except Exception as exc: logger.warning("[Search Failed] %s => %s", candidate, exc) continue added_count = 0 for raw in raw_results or []: if not isinstance(raw, dict): continue title = str(raw.get("title") or "").strip() url = str(raw.get("url") or raw.get("href") or "").strip() if not title or not url or url in seen_urls: continue seen_urls.add(url) normalized_results.append( { "id": len(normalized_results), "title": title, "brief": str(raw.get("body") or raw.get("snippet") or "").strip(), "url": url, "source": 
str(raw.get("source") or "").strip(), "date": str(raw.get("date") or "").strip(), } ) added_count += 1 if len(normalized_results) >= config.settings.search.max_results: break logger.info("[Search Attempt] %s => %s", candidate, added_count) if len(normalized_results) >= config.settings.search.max_results: break return normalized_results def build_search_queries( *, search_keywords: str, topic: str, ) -> list[str]: queries: list[str] = [] seen: set[str] = set() def add(query: str) -> None: normalized = re.sub(r"\s+", " ", query).strip() if not normalized or normalized in seen: return seen.add(normalized) queries.append(normalized) add(search_keywords) keyword_tokens = _extract_search_tokens(search_keywords) topic_tokens = _extract_search_tokens(topic) if keyword_tokens: add(" ".join(keyword_tokens)) non_year_keyword_tokens = [token for token in keyword_tokens if not re.fullmatch(r"\d{4}", token)] if non_year_keyword_tokens: add(" ".join(non_year_keyword_tokens)) if topic_tokens: add(" ".join(topic_tokens)) add(" ".join([*topic_tokens, "news"])) merged_tokens = _dedupe_tokens([*topic_tokens, *keyword_tokens]) if merged_tokens: add(" ".join(merged_tokens)) add(" ".join([*merged_tokens, "news"])) return queries def _extract_search_tokens(text: str) -> list[str]: tokens = re.findall(r"[A-Za-z0-9][A-Za-z0-9._+-]*", text) return _dedupe_tokens(tokens)[:8] def _dedupe_tokens(tokens: list[str]) -> list[str]: result: list[str] = [] seen: set[str] = set() for token in tokens: normalized = token.strip() lower_token = normalized.lower() if not normalized or lower_token in seen: continue seen.add(lower_token) result.append(normalized) return result async def _write_column( config: DailyNewsChunkConfig, column_outline: dict[str, Any], summarized_news: list[dict[str, Any]], ) -> dict[str, Any]: slimmed_news = [] for index, news in enumerate(summarized_news): slimmed_news.append( { "id": index, "title": news["title"], "summary": news["summary"], "url": news["url"], "source": 
news.get("source", ""), "date": news.get("date", ""), "recommend_comment": news.get("recommend_comment", ""), } ) column_result = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "write_column.yaml", { "news_list": slimmed_news, "column_title": column_outline["column_title"], "column_requirement": column_outline["column_requirement"], "language": config.settings.workflow.output_language, }, ) .async_start( ensure_keys=[ "prologue", "news_list[*].id", "news_list[*].recommend_comment", ] ) ) if not isinstance(column_result, dict): return _build_fallback_column(config, column_outline, summarized_news) final_news_list = [] used_ids: set[int] = set() for item in column_result.get("news_list", []): if not isinstance(item, dict): continue news_id = safe_int(item.get("id"), -1) if news_id < 0 or news_id >= len(summarized_news) or news_id in used_ids: continue used_ids.add(news_id) final_item = copy.deepcopy(summarized_news[news_id]) refined_comment = str(item.get("recommend_comment") or "").strip() if refined_comment: final_item["recommend_comment"] = refined_comment final_news_list.append(final_item) if not final_news_list: final_news_list = summarized_news[: config.settings.workflow.max_news_per_column] prologue = str(column_result.get("prologue") or "").strip() if not prologue: prologue = _build_fallback_prologue(config, column_outline, final_news_list) return { "title": column_outline["column_title"], "prologue": prologue, "news_list": final_news_list, } def _build_fallback_column( config: DailyNewsChunkConfig, column_outline: dict[str, Any], summarized_news: list[dict[str, Any]], ) -> dict[str, Any]: return { "title": column_outline["column_title"], "prologue": _build_fallback_prologue(config, column_outline, summarized_news), "news_list": summarized_news[: config.settings.workflow.max_news_per_column], } def _build_fallback_prologue( config: DailyNewsChunkConfig, column_outline: dict[str, Any], news_list: list[dict[str, Any]], ) -> 
str: if not news_list: return str(column_outline.get("column_requirement") or "") if is_chinese_language(config.settings.workflow.output_language): lead_titles = ",".join(f"《{news['title']}》" for news in news_list[:3]) return f"本栏目围绕“{column_outline['column_title']}”整理了以下重点内容:{lead_titles}。" lead_titles = ", ".join(news["title"] for news in news_list[:3]) return f"This section highlights the most relevant stories for {column_outline['column_title']}: {lead_titles}." __all__ = [ "create_search_column_news_chunk", "create_pick_column_news_chunk", "create_write_column_chunk", ] ================================================ FILE: workflow/common.py ================================================ from __future__ import annotations import logging import re from dataclasses import dataclass from pathlib import Path from typing import Any, cast from agently import Agently, TriggerFlowRuntimeData from news_collector.config import AppSettings from tools.base import BrowseToolProtocol, SearchToolProtocol @dataclass(frozen=True, slots=True) class DailyNewsChunkConfig: settings: AppSettings prompt_dir: Path output_dir: Path model_label: str def create_editor_agent(*, kind: str): agent = Agently.create_agent(name=f"{kind}_editor") if kind == "chief": agent.set_agent_prompt( "system", "You are a veteran newsroom chief editor who designs reliable daily news briefings.", ) agent.set_agent_prompt( "instruct", [ "Prefer recent, factual, non-duplicated stories.", "Keep structures stable and concise.", ], ) else: agent.set_agent_prompt( "system", "You are a meticulous news editor who selects and rewrites high-signal stories.", ) agent.set_agent_prompt( "instruct", [ "Reject irrelevant or thin content.", "Keep comments practical and publication-ready.", ], ) return agent def is_chinese_language(language: str) -> bool: normalized = language.lower() return "chinese" in normalized or normalized.startswith("zh") def safe_filename(name: str) -> str: cleaned = re.sub(r"[\\/:*?\"<>|]+", 
def safe_int(value: Any, default: int) -> int:
    """Best-effort int() conversion; returns *default* when value is not numeric."""
    try:
        coerced = int(value)
    except (TypeError, ValueError):
        return default
    return coerced


def require_logger(data: TriggerFlowRuntimeData) -> logging.Logger:
    """Fetch the shared logger resource from the flow runtime."""
    return cast(logging.Logger, data.require_resource("logger"))


def require_search_tool(data: TriggerFlowRuntimeData) -> SearchToolProtocol:
    """Fetch the shared search tool resource from the flow runtime."""
    return cast(SearchToolProtocol, data.require_resource("search_tool"))


def require_browse_tool(data: TriggerFlowRuntimeData) -> BrowseToolProtocol:
    """Fetch the shared browse tool resource from the flow runtime."""
    return cast(BrowseToolProtocol, data.require_resource("browse_tool"))
def build_column_sub_flow(
    *,
    chunk_config: DailyNewsChunkConfig,
) -> TriggerFlow:
    """Assemble the per-column sub-flow: search -> pick -> summarize -> write."""
    flow = TriggerFlow(name="daily-news-column-sub-flow")
    summary_flow = build_summary_sub_flow(chunk_config=chunk_config)

    search_chunk = flow.chunk("search_column_news")(create_search_column_news_chunk(chunk_config))
    pick_chunk = flow.chunk("pick_column_news")(create_pick_column_news_chunk(chunk_config))
    write_chunk = flow.chunk("write_column")(create_write_column_chunk(chunk_config))

    (
        flow.to(search_chunk)
        .to(pick_chunk)
        # Summarization runs in a nested flow; it gets the column context as
        # input plus the logger/browse resources, and writes its result back.
        .to_sub_flow(
            summary_flow,
            capture={
                "input": "value",
                "resources": {
                    "logger": "resources.logger",
                    "browse_tool": "resources.browse_tool",
                },
            },
            write_back={
                "value": "result",
            },
        )
        .to(write_chunk)
        .end()
    )
    return flow
def create_prepare_request_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that normalizes the raw topic into a request record."""

    async def prepare_request(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        topic = str(data.value).strip()
        timestamp = datetime.now()
        request: dict[str, Any] = {
            "topic": topic,
            "today": timestamp.strftime("%Y-%m-%d"),
            "generated_at": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "language": config.settings.workflow.output_language,
        }
        # Stash the request in flow state so downstream chunks can read it.
        data.state.set("request", request)
        require_logger(data).info("[Topic] %s", topic)
        return request

    return prepare_request
def create_render_report_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that renders all column results into a Markdown file."""

    async def render_report(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        request = data.state.get("request") or {}
        outline = data.state.get("outline") or {}
        # Keep only well-formed column payloads.
        column_payloads = [item for item in data.value if isinstance(item, dict)]
        report_title = str(
            outline.get("report_title")
            or f"Daily News about {request.get('topic', 'the topic')}"
        )
        markdown = render_markdown(
            report_title=report_title,
            generated_at=str(request.get("generated_at") or ""),
            topic=str(request.get("topic") or ""),
            language=config.settings.workflow.output_language,
            columns=column_payloads,
            model_label=config.model_label,
        )
        output_path = _write_markdown(
            config=config,
            report_title=report_title,
            report_date=str(request.get("today") or ""),
            markdown=markdown,
        )
        require_logger(data).info("[Markdown Saved] %s", output_path)
        return {
            "report_title": report_title,
            "output_path": str(output_path),
            "markdown": markdown,
            "columns": column_payloads,
        }

    return render_report


async def _generate_outline(
    config: DailyNewsChunkConfig,
    request: dict[str, Any],
) -> dict[str, Any]:
    """Ask the chief-editor agent for a report outline and validate its shape."""
    prepared_prompt = create_editor_agent(kind="chief").load_yaml_prompt(
        config.prompt_dir / "create_outline.yaml",
        {
            "topic": request["topic"],
            "today": request["today"],
            "language": config.settings.workflow.output_language,
            "max_column_num": config.settings.workflow.max_column_num,
        },
    )
    outline = await prepared_prompt.async_start(
        ensure_keys=[
            "report_title",
            "column_list[*].column_title",
            "column_list[*].column_requirement",
            "column_list[*].search_keywords",
        ]
    )
    if not isinstance(outline, dict):
        raise TypeError(f"Invalid outline result: {outline}")
    column_list = outline.get("column_list", [])
    if not isinstance(column_list, list):
        raise TypeError("Outline column_list must be a list.")
    # Enforce the configured maximum number of columns.
    outline["column_list"] = column_list[: config.settings.workflow.max_column_num]
    return outline
_get_customized_outline(config: DailyNewsChunkConfig) -> dict[str, Any]: outline = copy.deepcopy(config.settings.outline.customized) column_list = outline.get("column_list", []) if not isinstance(column_list, list) or not column_list: raise ValueError("Customized outline must provide a non-empty column_list.") outline["column_list"] = column_list[: config.settings.workflow.max_column_num] outline.setdefault("report_title", "Daily News Briefing") return outline def _write_markdown( *, config: DailyNewsChunkConfig, report_title: str, report_date: str, markdown: str, ) -> Path: config.output_dir.mkdir(parents=True, exist_ok=True) file_name = f"{safe_filename(report_title)}_{report_date or datetime.now().strftime('%Y-%m-%d')}.md" output_path = config.output_dir / file_name output_path.write_text(markdown, encoding="utf-8") return output_path __all__ = [ "create_prepare_request_chunk", "create_generate_outline_chunk", "create_render_report_chunk", ] ================================================ FILE: workflow/summary_chunks.py ================================================ from __future__ import annotations import copy from typing import Any, Callable from agently import TriggerFlowRuntimeData from .common import ( DailyNewsChunkConfig, create_editor_agent, is_chinese_language, require_browse_tool, require_logger, safe_int, ) def create_prepare_summary_candidates_chunk( config: DailyNewsChunkConfig, ) -> Callable[[TriggerFlowRuntimeData], Any]: async def prepare_summary_candidates(data: TriggerFlowRuntimeData): context = _coerce_summary_context(data.value) if context is None: data.state.set("summary_context", None, emit=False) data.state.set("summary_candidates", [], emit=False) data.state.set("summary_cursor", 0, emit=False) data.state.set("summary_results", [], emit=False) data.state.set("summary_target_count", 0, emit=False) await data.async_emit("Summary.Done", None) return candidates = build_summary_candidates( config, context["column_outline"], 
def create_dispatch_summary_batch_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that slices the next batch of summary candidates."""

    async def dispatch_summary_batch(data: TriggerFlowRuntimeData) -> list[dict[str, Any]]:
        candidates = data.state.get("summary_candidates") or []
        cursor = safe_int(data.state.get("summary_cursor"), 0)
        target_count = safe_int(data.state.get("summary_target_count"), 0)
        summary_results = data.state.get("summary_results") or []
        if not isinstance(candidates, list) or not isinstance(summary_results, list):
            raise RuntimeError("Invalid summary flow state.")
        # Batch size is bounded by the concurrency limit, the remaining
        # quota of needed summaries, and the candidates actually left.
        still_needed = target_count - len(summary_results)
        batch_size = min(
            max(config.settings.workflow.summary_concurrency, 1),
            max(still_needed, 0),
            len(candidates) - cursor,
        )
        if batch_size <= 0:
            raise RuntimeError("Summary dispatch received no work. Summary.Done should have been emitted first.")
        batch = candidates[cursor : cursor + batch_size]
        data.state.set("summary_cursor", cursor + batch_size, emit=False)
        return batch

    return dispatch_summary_batch


def create_summarize_candidate_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that browses and summarizes one candidate news item."""

    async def summarize_candidate(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        candidate = data.value if isinstance(data.value, dict) else {}
        is_backup = bool(candidate.get("is_backup"))
        news = candidate.get("news")
        if not isinstance(news, dict):
            # Malformed candidate: report an empty result for the merge step to skip.
            return {
                "news": {},
                "is_backup": is_backup,
                "summarized": None,
            }
        logger = require_logger(data)
        outline = _get_summary_column_outline(data)
        summarized = await summarize_single_news(
            config,
            logger,
            require_browse_tool(data),
            outline,
            news,
        )
        return {
            "news": copy.deepcopy(news),
            "is_backup": is_backup,
            "summarized": summarized,
        }

    return summarize_candidate
logger.info("[Backup News Activated] %s", title) data.state.set("summary_results", summary_results, emit=False) if len(summary_results) >= target_count or cursor >= len(candidates): await data.async_emit("Summary.Done", None) else: await data.async_emit("Summary.Dispatch", None) return merge_summary_batch def create_finalize_summary_chunk( config: DailyNewsChunkConfig, ) -> Callable[[TriggerFlowRuntimeData], Any]: async def finalize_summary(data: TriggerFlowRuntimeData) -> dict[str, Any]: context = data.state.get("summary_context") if not isinstance(context, dict): return { "column_outline": {}, "searched_news": [], "picked_news": [], "summarized_news": [], } result = copy.deepcopy(context) summarized_news = data.state.get("summary_results") or [] result["summarized_news"] = summarized_news if isinstance(summarized_news, list) else [] logger = require_logger(data) title = str(result.get("column_outline", {}).get("column_title") or "").strip() logger.info("[Summarized News Count] %s => %s", title, len(result["summarized_news"])) return result return finalize_summary def _coerce_summary_context(value: Any) -> dict[str, Any] | None: if not isinstance(value, dict): return None column_outline = value.get("column_outline") searched_news = value.get("searched_news") picked_news = value.get("picked_news") if not isinstance(column_outline, dict) or not isinstance(searched_news, list) or not isinstance(picked_news, list): return None return { "column_outline": copy.deepcopy(column_outline), "searched_news": copy.deepcopy(searched_news), "picked_news": copy.deepcopy(picked_news), } def _get_summary_column_outline(data: TriggerFlowRuntimeData) -> dict[str, Any]: context = data.state.get("summary_context") if isinstance(context, dict) and isinstance(context.get("column_outline"), dict): return context["column_outline"] return {} def build_summary_candidates( config: DailyNewsChunkConfig, column_outline: dict[str, Any], searched_news: list[dict[str, Any]], picked_news: 
list[dict[str, Any]], ) -> list[dict[str, Any]]: candidates: list[dict[str, Any]] = [] picked_urls = { str(news.get("url") or "").strip() for news in picked_news if str(news.get("url") or "").strip() } seen_urls: set[str] = set() for news in picked_news: url = str(news.get("url") or "").strip() if not url or url in seen_urls: continue seen_urls.add(url) candidates.append( { "news": copy.deepcopy(news), "is_backup": False, } ) for news in searched_news: url = str(news.get("url") or "").strip() if not url or url in seen_urls or url in picked_urls: continue seen_urls.add(url) backup_news = copy.deepcopy(news) if not str(backup_news.get("recommend_comment") or "").strip(): backup_news["recommend_comment"] = build_backup_recommend_comment( config, column_outline, backup_news, ) candidates.append( { "news": backup_news, "is_backup": True, } ) return candidates async def pick_news( config: DailyNewsChunkConfig, column_outline: dict[str, Any], searched_news: list[dict[str, Any]], ) -> list[dict[str, Any]]: pick_results = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "pick_news.yaml", { "column_news": searched_news, "column_title": column_outline["column_title"], "column_requirement": column_outline["column_requirement"], "max_news_per_column": config.settings.workflow.max_news_per_column, }, ) .async_start( ensure_keys=[ "[*].id", "[*].can_use", "[*].relevance_score", "[*].recommend_comment", ] ) ) if not isinstance(pick_results, list): return [] picked_news = [] seen_ids: set[int] = set() sorted_results = sorted( [item for item in pick_results if isinstance(item, dict)], key=lambda item: safe_int(item.get("relevance_score"), 0), reverse=True, ) for item in sorted_results: if item.get("can_use") is not True: continue news_id = safe_int(item.get("id"), -1) if news_id < 0 or news_id >= len(searched_news) or news_id in seen_ids: continue seen_ids.add(news_id) picked_item = copy.deepcopy(searched_news[news_id]) 
picked_item["recommend_comment"] = str(item.get("recommend_comment") or "").strip() picked_item["relevance_score"] = safe_int(item.get("relevance_score"), 0) picked_news.append(picked_item) if len(picked_news) >= config.settings.workflow.max_news_per_column: break return picked_news async def summarize_single_news( config: DailyNewsChunkConfig, logger, browse_tool, column_outline: dict[str, Any], news: dict[str, Any], ) -> dict[str, Any] | None: logger.info("[Summarizing] %s", news["title"]) content = await browse_tool.browse(news["url"]) content = str(content or "").strip() if len(content) < config.settings.browse.min_content_length: logger.info("[Summarizing] Failed - content too short") return None if is_invalid_browse_content(content): logger.info("[Summarizing] Failed - invalid browsed content") return None summary_result = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "summarize_news.yaml", { "news_content": content, "news_title": news["title"], "column_requirement": column_outline["column_requirement"], "language": config.settings.workflow.output_language, }, ) .async_start( ensure_keys=[ "can_summarize", "summary", ] ) ) if not isinstance(summary_result, dict): logger.info("[Summarizing] Failed - invalid summary output") return None if summary_result.get("can_summarize") is not True: logger.info("[Summarizing] Failed - model rejected content") return None summary = str(summary_result.get("summary") or "").strip() if not summary: logger.info("[Summarizing] Failed - empty summary") return None summarized_news = copy.deepcopy(news) summarized_news["summary"] = summary logger.info("[Summarizing] Success") return summarized_news def build_backup_recommend_comment( config: DailyNewsChunkConfig, column_outline: dict[str, Any], news: dict[str, Any], ) -> str: title = str(column_outline.get("column_title") or "this section") news_title = str(news.get("title") or "").strip() if 
is_chinese_language(config.settings.workflow.output_language): if news_title: return f"该报道与“{title}”存在明确关联,可作为备用候选:{news_title}。" return f"该报道与“{title}”存在明确关联,可作为备用候选。" if news_title: return f"This story is meaningfully related to {title} and is kept as a backup candidate: {news_title}." return f"This story is meaningfully related to {title} and is kept as a backup candidate." def is_invalid_browse_content(content: str) -> bool: normalized = content.strip() lowered = normalized.lower() invalid_markers = ( "can not browse '", "fallback failed:", "content_empty_or_too_short", "we've detected unusual activity", "not a robot", "captcha", "access denied", "subscribe now", ) return any(marker in lowered for marker in invalid_markers) __all__ = [ "create_prepare_summary_candidates_chunk", "create_dispatch_summary_batch_chunk", "create_summarize_candidate_chunk", "create_merge_summary_batch_chunk", "create_finalize_summary_chunk", "pick_news", ]