Repository: AgentEra/Agently-Daily-News-Collector Branch: main Commit: 6812e1c78fb9 Files: 51 Total size: 146.8 KB Directory structure: gitextract__2ep8twt/ ├── .gitignore ├── .vscode/ │ └── settings.json ├── Dockerfile ├── LICENSE ├── README.md ├── README_CN.md ├── SETTINGS.yaml ├── app.py ├── logs/ │ └── .gitkeep ├── news_collector/ │ ├── __init__.py │ ├── cli.py │ ├── collector.py │ ├── config.py │ ├── logging_utils.py │ └── markdown.py ├── outputs/ │ └── .gitkeep ├── prompts/ │ ├── create_outline.yaml │ ├── pick_news.yaml │ ├── summarize_news.yaml │ └── write_column.yaml ├── requirements.txt ├── tools/ │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── builtin.py ├── v3/ │ ├── README.md │ ├── README_CN.md │ ├── SETTINGS.yaml │ ├── app.py │ ├── examples/ │ │ └── Latest Updates on AI Models2024-05-02.md │ ├── prompts/ │ │ ├── create_outline.yaml │ │ ├── pick_news.yaml │ │ ├── summarize.yaml │ │ └── write_column.yaml │ ├── requirements.txt │ ├── utils/ │ │ ├── __init__.py │ │ ├── logger.py │ │ ├── path.py │ │ └── yaml_reader.py │ └── workflows/ │ ├── __init__.py │ ├── column_workflow.py │ ├── main_workflow.py │ └── tools/ │ ├── __init__.py │ ├── browse.py │ └── search.py └── workflow/ ├── __init__.py ├── column_chunks.py ├── common.py ├── daily_news.py ├── report_chunks.py └── summary_chunks.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # MacOS .DS_Store # Agently .Agently # Project outputs logs/* !logs/.gitkeep outputs/* !outputs/.gitkeep ================================================ FILE: .vscode/settings.json ================================================ { "python-envs.defaultEnvManager": "ms-python.python:conda", "python-envs.defaultPackageManager": "ms-python.python:conda" } ================================================ FILE: Dockerfile ================================================ FROM python:3.10 WORKDIR /app COPY . . RUN pip install --no-cache-dir -r requirements.txt CMD ["python", "app.py"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Agently Daily News Collector v4 Agently Daily News Collector has been rewritten on top of **Agently v4** and now uses: - `TriggerFlow` for the end-to-end pipeline - Agently v4 built-in `Search` and `Browse` tools - structured output contracts instead of the old v3 workflow API > Version constraint: this project requires **Agently v4.0.8.3 or newer**. The current implementation uses `TriggerFlow sub flow` to organize per-column pipelines, so earlier v4 releases are not compatible with the workflow structure used here. The previous Agently v3 project has been archived under [`./v3`](./v3). 
## Features - Input a topic and generate a multi-column news briefing automatically - Search, shortlist, browse, summarize, and assemble stories in one flow - Save the final report as Markdown under `./outputs` - Keep prompt templates in `./prompts` for easy editing - Keep an independent `./tools` layer so search/browse can be replaced without touching the main workflow - Keep flow construction in `./workflow` so orchestration can evolve independently from collector logic ## Quick Start 1. Install dependencies: ```bash pip install -r requirements.txt ``` If you install Agently manually, make sure you use at least: ```bash pip install "agently>=4.0.8.3" ``` 2. Edit [`SETTINGS.yaml`](./SETTINGS.yaml): - Keep the model block as environment placeholders - Export the required environment variables: ```bash export AGENTLY_NEWS_BASE_URL="https://api.openai.com/v1" export AGENTLY_NEWS_MODEL="gpt-4.1-mini" export AGENTLY_NEWS_API_KEY="your_api_key" ``` - Or put them in a local `.env` file: ```dotenv AGENTLY_NEWS_BASE_URL=https://api.openai.com/v1 AGENTLY_NEWS_MODEL=gpt-4.1-mini AGENTLY_NEWS_API_KEY=your_api_key ``` - Adjust language / search / concurrency settings if needed - If your OpenAI-compatible endpoint does not require authentication, you can leave `AGENTLY_NEWS_API_KEY` unset and the project will skip `auth`. 3. Run: ```bash python app.py ``` Or pass a topic directly: ```bash python app.py "AI agents" ``` ## Project Structure ```text . ├── app.py ├── news_collector/ ├── tools/ ├── workflow/ ├── prompts/ ├── outputs/ ├── logs/ └── v3/ ``` ## Important v3 -> v4 Changes The business chain is still roughly: `outline -> search -> pick -> browse + summarize -> write column -> render markdown` What changed is the engineering shape around that chain. ### Project-level changes - The old v3 project used a main workflow plus a nested column workflow under `./workflows`, with custom `search.py` / `browse.py` helpers and storage-style state passing. 
- The v4 project separates responsibilities more clearly: - `news_collector/`: app/integration layer - `workflow/`: parent flow, column sub flow, and concrete chunk logic - `tools/`: search/browse adapter layer - `prompts/`: structured prompt contracts - Model configuration is no longer hardcoded in Python. It now uses `${ENV.xxx}` placeholders from `SETTINGS.yaml`, so deployment and local switching are simpler. - Tool wiring is no longer buried inside workflow code. Search, browse, and logger are injected as TriggerFlow runtime resources, which makes the workflow easier to replace or test. - The workflow plan is now closer to the business boundary: - parent flow: `prepare_request -> generate_outline -> for_each(column) -> render_report` - column sub flow: `search -> pick -> summarize -> write_column` - the `summarize` stage inside the column flow is further pushed down into a summary sub flow, where TriggerFlow handles fan-out and collection directly instead of leaving `asyncio.gather` in business code - this keeps the parent focused on report orchestration and the child focused on one column lifecycle - the immediate value of `sub flow` here is that the column pipeline becomes a reusable, independently evolvable workflow unit instead of staying buried inside one oversized parent chunk ### Agently v4 features used here - **TriggerFlow orchestration** - Replaces the old v3 workflow style with a more explicit flow graph (`to`, `for_each`, `sub flow`, branching-ready composition). - Unlike the old v3 Workflow chain, TriggerFlow here runs columns concurrently and also summarizes picked stories concurrently within each column. - Meaning for this project: the end-to-end news pipeline is easier to inspect, evolve, and split into chunks without mixing orchestration with business logic, while the parent report flow and the per-column pipeline can now be modeled directly as parent/child flows instead of one oversized chunk. 
- **Sub flow composition** - The project can now extract a naturally repeated business pipeline, “build one column”, into its own TriggerFlow and invoke it repeatedly from the parent flow inside `for_each(column)`. - Meaning for this project: - the parent flow stays focused on report-level orchestration - the column pipeline can be tested, visualized, and exported independently - future variants such as “briefing column”, “deep-dive column”, or “regional column” can reuse or derive from the child flow instead of cloning parent-flow nodes - `capture / write_back` makes the boundary between parent and child explicit for input, state, and resources - **Structured output contracts** - YAML prompts now define output schema directly for outline generation, news picking, summarizing, and column writing. - Meaning for this project: much less handwritten parsing glue, clearer interfaces between steps, and easier prompt iteration. - **Built-in Search / Browse tools** - The project now defaults to Agently v4 built-in tool implementations instead of the old project-local helpers. - Meaning for this project: less custom infrastructure code, and users can still swap implementations through `./tools` without rewriting the workflow. - **Runtime resources and state namespaces** - TriggerFlow runtime resources are used to inject logger/search/browse dependencies, while runtime state stores execution data such as request, outline, and intermediate results. - Meaning for this project: dependency wiring and execution state are separated cleanly, which keeps chunk code thinner and more maintainable. - **Environment-aware settings** - Agently v4 `set_settings(..., auto_load_env=True)` works directly with `${ENV.xxx}` placeholders. - Meaning for this project: model endpoint, model name, and API key can be switched by environment instead of editing code or committing secrets. 
### Overall effect on this project - The core product behavior remains familiar to v3 users, but the project now has a cleaner app/workflow/tools/prompts split. - More logic is expressed in Agently-native capabilities instead of project-specific glue code. - True concurrency is now part of the default execution model. The v3 version was effectively serial, while the v4 version can process columns and per-column summaries in parallel through TriggerFlow. - Replacing tools, adjusting prompts, or evolving workflow steps is now lower-risk than in the old v3 layout, and the overall orchestration shape is again aligned with the original “main flow + column flow” mental model. - It also means workflow evolution can happen by layer: report-level changes stay in the parent flow, while column-level changes stay in the sub flow instead of forcing both to change together. ## Notes - Python `>=3.10` is required because Agently v4 requires it. - This project requires Agently `>=4.0.8.3`. - Model settings now use Agently v4 `auto_load_env=True` with `${ENV.xxx}` placeholders. - `tools/` defaults to Agently v4 built-in implementations, but you can replace the factories there with your own tools. - `workflow/` is now split by business boundary into the parent flow, the column sub flow, report-level chunks, and column-level chunks. - `news_collector/` acts as the app/integration layer for configuration, model wiring, and CLI entry support. - The current sample [`SETTINGS.yaml`](./SETTINGS.yaml) enables `BROWSE.enable_playwright: true` by default because many news pages need a real browser to return usable content. - If you do not want to install Playwright, set `BROWSE.enable_playwright` to `false` manually, but expect weaker browse quality on dynamic or protected sites. - The settings loader keeps basic compatibility with the old v3 keys such as `MODEL_PROVIDER`, `MODEL_URL`, `MODEL_AUTH`, `MODEL_OPTIONS`, `MAX_COLUMN_NUM`, and `USE_CUSTOMIZE_OUTLINE`. 
================================================ FILE: README_CN.md ================================================ # Agently Daily News Collector v4 本项目已经基于 **Agently v4** 完整重写,核心实现改为: - 使用 `TriggerFlow` 编排整条新闻采集流程 - 使用 Agently v4 内置 `Search` / `Browse` 工具 - 使用结构化输出契约替代旧版 v3 `Workflow` API > 版本约束:本项目仅适用于 **Agently v4.0.8.3 及以上版本**。当前实现已经使用 `TriggerFlow sub flow` 组织栏目子流程;如果你安装的是更早的 v4 版本,主流程与子流程之间的组合能力将与当前代码不兼容。 原有 Agently v3 项目已经整体归档到 [`./v3`](./v3)。 ## 功能说明 - 输入一个主题,自动生成多栏目新闻汇总 - 自动完成搜索、筛选、浏览正文、总结和 Markdown 排版 - 最终报告输出到 `./outputs` - 提示词保存在 `./prompts`,便于继续调优 - 提供独立的 `./tools` 适配层,方便替换搜索和浏览实现 - 提供独立的 `./workflow` 目录,方便单独调整流程编排 ## 使用方式 1. 安装依赖: ```bash pip install -r requirements.txt ``` 如果你是手动安装 Agently,请确认版本至少为: ```bash pip install "agently>=4.0.8.3" ``` 2. 修改 [`SETTINGS.yaml`](./SETTINGS.yaml): - 保持模型配置为环境变量占位符 - 在环境变量中提供下面三个值: ```bash export AGENTLY_NEWS_BASE_URL="https://api.openai.com/v1" export AGENTLY_NEWS_MODEL="gpt-4.1-mini" export AGENTLY_NEWS_API_KEY="your_api_key" ``` - 或者写到本地 `.env` 文件中: ```dotenv AGENTLY_NEWS_BASE_URL=https://api.openai.com/v1 AGENTLY_NEWS_MODEL=gpt-4.1-mini AGENTLY_NEWS_API_KEY=your_api_key ``` - 按需调整输出语言、搜索参数和并发参数 - 如果你的 OpenAI-compatible 服务本身不需要鉴权,可以不设置 `AGENTLY_NEWS_API_KEY`,项目会自动跳过 `auth` 3. 启动: ```bash python app.py ``` 也可以直接把主题作为命令行参数传入: ```bash python app.py "AI Agents" ``` ## 目录结构 ```text . 
├── app.py ├── news_collector/ ├── tools/ ├── workflow/ ├── prompts/ ├── outputs/ ├── logs/ └── v3/ ``` ## 重要说明:v3 -> v4 的关键变化 业务主线其实没有变,仍然基本是: `outline -> search -> pick -> browse + summarize -> write column -> render markdown` 真正变化的是这条链路在工程上的组织方式。 ### 从本项目角度看,主要改了什么 - 旧版 v3 主要是 `./workflows` 里的主流程加栏目子流程,再配合项目内自定义的 `search.py` / `browse.py` 和 storage 传值。 - 新版 v4 把职责拆得更清楚: - `news_collector/`:app / integration 层 - `workflow/`:主 flow、栏目 sub flow 与各 chunk 的具体实现 - `tools/`:搜索与抓取适配层 - `prompts/`:结构化提示词契约 - 模型配置不再写死在 Python 代码里,而是统一通过 `SETTINGS.yaml` 里的 `${ENV.xxx}` 占位符注入,部署和切换环境更简单。 - 搜索、浏览、日志等依赖不再散落在工作流实现内部,而是通过 TriggerFlow runtime resources 注入,后续替换实现时不需要改业务流程本身。 - 现在工作流规划也更贴近业务边界: - 主 flow 负责 `prepare_request -> generate_outline -> for_each(column) -> render_report` - 栏目 sub flow 负责 `search -> pick -> summarize -> write_column` - 栏目内部的 `summarize` 又继续下沉为一个 summary sub flow,用 TriggerFlow 自己的 `for_each + collect` 做并发收拢,而不是在业务代码里手写 `asyncio.gather` - 这样主流程关注“整份日报如何生成”,子流程关注“单个栏目如何产出” - `sub flow` 的直接价值是:栏目链路现在可以被当成一个独立、可复用、可单独演进的流程单元来看待,而不是继续埋在父流程的某个大 chunk 里 ### 本项目实际用到了 Agently v4 的哪些关键能力 - **TriggerFlow 编排** - 用更显式的流程图式写法替代 v3 的旧 Workflow 风格,支持 `to`、`for_each`、`sub flow` 等组合方式。 - 和旧版 v3 基本串行执行不同,这个 v4 版本会并发处理多个栏目,并在栏目内部并发总结多条入选新闻。 - 对本项目的意义:新闻采集链路更容易拆 chunk、看依赖、调并发,也更适合后续继续演进;现在“主流程”和“栏目流程”可以直接用 sub flow 建模,而不是继续把整条栏目链路塞进一个大 chunk。 - **Sub Flow 组合能力** - 现在可以把“栏目生成”这种天然重复出现的业务子流程,抽成独立的 TriggerFlow,再由父 flow 在 `for_each(column)` 中重复调用。 - 对本项目的意义: - 父 flow 只保留日报级编排,职责更稳定 - 栏目流程可以单独测试、单独可视化、单独导出配置 - 后续如果新增“快讯栏目”“深度栏目”“地区栏目”,可以直接复用或派生子流程,而不是继续复制粘贴父流程节点 - 主流程和子流程之间通过 `capture / write_back` 显式传递输入、状态和资源,边界比闭包式调用清晰得多 - **结构化输出契约** - 现在 outline、pick、summarize、write column 都直接在 YAML prompt 里声明输出结构。 - 对本项目的意义:少写很多手工解析代码,步骤之间的接口更清晰,调 prompt 时更可控。 - **内置 Search / Browse 工具** - 默认直接使用 Agently v4 提供的 Search / Browse,而不是沿用 v3 里项目自带的工具实现。 - 对本项目的意义:减少项目自维护基础设施代码,同时又保留了 `./tools` 层,方便用户自己替换实现。 - **Runtime resources 与 state 命名空间** - 通过 TriggerFlow runtime resources 注入 
`logger`、`search_tool`、`browse_tool`,通过 runtime state 保存 `request`、`outline`、中间结果。 - 对本项目的意义:把“依赖注入”和“流程状态”拆开,chunk 代码更薄,也更容易维护。 - **环境变量感知的 settings** - 使用 Agently v4 的 `set_settings(..., auto_load_env=True)` 配合 `${ENV.xxx}` 占位符。 - 对本项目的意义:`base_url`、`model`、`api_key` 都可以按环境切换,不需要改代码,也更适合本地开发和部署。 ### 这些改动对项目整体的意义 - 对 v3 用户来说,产品级行为仍然熟悉,但项目结构已经从“单体 workflow 脚本”变成了更清晰的 app / workflow / tools / prompts 分层。 - 更多能力直接复用了 Agently v4 原生机制,而不是继续在项目里堆自定义胶水代码。 - 真正的并发执行现在成为默认能力。v3 版本整体上仍是串行 workflow,而 v4 可以通过 TriggerFlow 并发跑栏目和栏目内摘要,直接改善总耗时。 - 后续无论是替换工具、调整提示词,还是演进工作流步骤,风险都比 v3 结构更低;主流程和栏目流程也终于恢复成了清晰的父子结构。 - 这也意味着工作流演进可以按层进行:日报级逻辑改父 flow,栏目级逻辑改 sub flow,二者不必总是一起变动。 ## 说明 - Agently v4 要求 Python `>=3.10` - 本项目要求 Agently `>=4.0.8.3` - 模型配置现在使用 Agently v4 的 `auto_load_env=True` 和 `${ENV.xxx}` 占位符 - `tools/` 默认封装 Agently v4 内置工具;如果你要接自己的搜索或抓取方案,只需要替换这里的工厂函数 - `workflow/` 现在按业务边界拆成主 flow、栏目 sub flow、报告级 chunks、栏目级 chunks - `news_collector/` 现在承担 app/integration 层职责,负责配置、模型装配和 CLI 入口支持 - 当前仓库里的 [`SETTINGS.yaml`](./SETTINGS.yaml) 默认开启 `BROWSE.enable_playwright: true`,因为很多新闻页面只有在真实浏览器环境下才能抓到可用正文 - 如果你不想额外安装 Playwright,可以手动把 `BROWSE.enable_playwright` 改成 `false`,但动态站点、受保护页面和部分媒体站的抓取质量会明显下降 - 新版配置优先读取 `MODEL / SEARCH / BROWSE / WORKFLOW / OUTLINE / OUTPUT` 结构,同时兼容部分旧版 v3 配置键,例如 `MODEL_PROVIDER`、`MODEL_URL`、`MODEL_AUTH`、`MODEL_OPTIONS`、`MAX_COLUMN_NUM`、`USE_CUSTOMIZE_OUTLINE` ================================================ FILE: SETTINGS.yaml ================================================ # Debug DEBUG: false # Shared proxy for model/search/browse. Leave empty if not needed. PROXY: http://127.0.0.1:7890 # Agently v4 model configuration. # Use Agently v4 `${ENV.xxx}` placeholders and let Agently resolve them from # the current environment or `.env` file when applying model settings. 
MODEL: provider: OpenAICompatible base_url: ${ENV.DEEPSEEK_BASE_URL} model: ${ENV.DEEPSEEK_DEFAULT_MODEL} model_type: chat auth: api_key: ${ENV.DEEPSEEK_API_KEY} request_options: temperature: 0.2 SEARCH: max_results: 8 timelimit: d region: us-en backend: auto BROWSE: enable_playwright: true playwright_headless: true response_mode: markdown max_content_length: 12000 min_content_length: 80 WORKFLOW: max_column_num: 3 max_news_per_column: 3 output_language: Chinese column_concurrency: 3 summary_concurrency: 3 OUTLINE: use_customized: false customized: report_title: Today's News about Large Model Applications column_list: - column_title: Latest Launches column_requirement: Focus on newly announced or newly released large model applications in the last few days. search_keywords: large model application launch this week - column_title: Hot Topics column_requirement: Focus on the most discussed or fastest-growing large model applications. search_keywords: large model application trending news - column_title: Industry Moves column_requirement: Focus on enterprise adoption, funding, partnerships, or strategy shifts around large model applications. 
search_keywords: large model application enterprise partnership funding OUTPUT: directory: outputs ================================================ FILE: app.py ================================================ from news_collector.cli import main if __name__ == "__main__": raise SystemExit(main()) ================================================ FILE: logs/.gitkeep ================================================ ================================================ FILE: news_collector/__init__.py ================================================ def __getattr__(name: str): if name == "AppSettings": from .config import AppSettings return AppSettings if name == "DailyNewsCollector": from .collector import DailyNewsCollector return DailyNewsCollector if name == "main": from .cli import main return main raise AttributeError(name) ================================================ FILE: news_collector/cli.py ================================================ from __future__ import annotations import sys from pathlib import Path from .config import AppSettings from .collector import DailyNewsCollector from .logging_utils import configure_logging ROOT_DIR = Path(__file__).resolve().parent.parent SETTINGS_PATH = ROOT_DIR / "SETTINGS.yaml" def main() -> int: settings = AppSettings.load(SETTINGS_PATH) logger = configure_logging( debug=settings.debug, log_dir=ROOT_DIR / "logs", ) topic = " ".join(sys.argv[1:]).strip() if not topic: topic = input("请输入要生成新闻汇总的主题 / Please input the topic: ").strip() if not topic: print("Topic is required.") return 1 collector = DailyNewsCollector( settings=settings, root_dir=ROOT_DIR, logger=logger, ) try: result = collector.collect(topic) except Exception as exc: # pragma: no cover - CLI guard logger.exception("Daily news collection failed: %s", exc) return 1 print(result["markdown"]) print(f"\n[Saved to] {result['output_path']}") return 0 ================================================ FILE: news_collector/collector.py 
================================================ from __future__ import annotations import logging import os import re from pathlib import Path from typing import Any, cast from agently import Agently from tools import create_browse_tool, create_search_tool from workflow import build_daily_news_flow from .config import AppSettings class DailyNewsCollector: def __init__( self, *, settings: AppSettings, root_dir: str | Path, logger: logging.Logger, ): self.settings = settings self.root_dir = Path(root_dir).resolve() self.logger = logger self._configure_agently() search_tool = create_search_tool(self.settings) browse_tool = create_browse_tool(self.settings) self.flow = build_daily_news_flow( settings=self.settings, root_dir=self.root_dir, model_label=self.model_label, ) self.flow.update_runtime_resources( logger=self.logger, search_tool=search_tool, browse_tool=browse_tool, ) def collect(self, topic: str) -> dict[str, Any]: normalized_topic = topic.strip() if not normalized_topic: raise ValueError("Topic is required.") result = self.flow.start(normalized_topic) return result def _configure_agently(self) -> None: from dotenv import find_dotenv, load_dotenv load_dotenv(find_dotenv()) model_settings = self.settings.model.to_agently_settings(self.settings.proxy) self._ensure_required_model_env( { "base_url": model_settings.get("base_url"), "model": model_settings.get("model"), } ) if self._missing_env_names(model_settings.get("auth")): model_settings.pop("auth", None) resolved_model_name = self._resolve_env_value(model_settings.get("model")) self.model_label = f"{self.settings.model.provider} / {resolved_model_name}" Agently.set_settings("debug", self.settings.debug) Agently.set_settings( self.settings.model.provider, model_settings, auto_load_env=True, ) def _ensure_required_model_env(self, model_settings: dict[str, Any]) -> None: env_names = sorted(set(self._collect_env_names(model_settings))) if not env_names: return missing_env_names = [ name for name in env_names if 
os.getenv(name) in (None, "") ] if missing_env_names: raise EnvironmentError( "Missing required model environment variables: " + ", ".join(missing_env_names) ) @classmethod def _collect_env_names(cls, value: Any) -> list[str]: if isinstance(value, str): return re.findall(r"\$\{\s*ENV\.([^}]+?)\s*\}", value) if isinstance(value, dict): env_names: list[str] = [] for item in value.values(): env_names.extend(cls._collect_env_names(item)) return env_names if isinstance(value, list): env_names: list[str] = [] for item in value: env_names.extend(cls._collect_env_names(item)) return env_names return [] @classmethod def _missing_env_names(cls, value: Any) -> list[str]: env_names = sorted(set(cls._collect_env_names(value))) return [name for name in env_names if os.getenv(name) in (None, "")] @staticmethod def _resolve_env_value(value: Any) -> str: if not isinstance(value, str): return str(value) def replacer(match: re.Match[str]) -> str: env_name = match.group(1).strip() return os.getenv(env_name, match.group(0)) return re.sub(r"\$\{\s*ENV\.([^}]+?)\s*\}", replacer, value) ================================================ FILE: news_collector/config.py ================================================ from __future__ import annotations import os import re from dataclasses import dataclass, field from pathlib import Path from typing import Any, Literal, TypeAlias, TypeVar, cast import yaml ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}") LiteralStrT = TypeVar("LiteralStrT", bound=str) ModelProvider: TypeAlias = Literal["OpenAICompatible", "OpenAI", "OAIClient"] ModelType: TypeAlias = Literal["chat", "completions", "embeddings"] SearchBackend: TypeAlias = Literal[ "auto", "bing", "duckduckgo", "yahoo", "google", "mullvad_google", "yandex", "wikipedia", ] SearchNewsTimeLimit: TypeAlias = Literal["d", "w", "m"] SearchRegion: TypeAlias = Literal[ "xa-ar", "xa-en", "ar-es", "au-en", "at-de", "be-fr", "be-nl", "br-pt", "bg-bg", "ca-en", "ca-fr", "ct-ca", 
"cl-es", "cn-zh", "co-es", "hr-hr", "cz-cs", "dk-da", "ee-et", "fi-fi", "fr-fr", "de-de", "gr-el", "hk-tzh", "hu-hu", "in-en", "id-id", "id-en", "ie-en", "il-he", "it-it", "jp-jp", "kr-kr", "lv-lv", "lt-lt", "xl-es", "my-ms", "my-en", "mx-es", "nl-nl", "nz-en", "no-no", "pe-es", "ph-en", "ph-tl", "pl-pl", "pt-pt", "ro-ro", "ru-ru", "sg-en", "sk-sk", "sl-sl", "za-en", "es-es", "se-sv", "ch-de", "ch-fr", "ch-it", "tw-tzh", "th-th", "tr-tr", "ua-uk", "uk-en", "us-en", "ue-es", "ve-es", "vn-vi", ] BrowseResponseMode: TypeAlias = Literal["markdown", "text"] MODEL_PROVIDER_VALUES: tuple[ModelProvider, ...] = ("OpenAICompatible", "OpenAI", "OAIClient") MODEL_TYPE_VALUES: tuple[ModelType, ...] = ("chat", "completions", "embeddings") SEARCH_BACKEND_VALUES: tuple[SearchBackend, ...] = ( "auto", "bing", "duckduckgo", "yahoo", "google", "mullvad_google", "yandex", "wikipedia", ) SEARCH_TIMELIMIT_VALUES: tuple[SearchNewsTimeLimit, ...] = ("d", "w", "m") SEARCH_REGION_VALUES: tuple[SearchRegion, ...] = ( "xa-ar", "xa-en", "ar-es", "au-en", "at-de", "be-fr", "be-nl", "br-pt", "bg-bg", "ca-en", "ca-fr", "ct-ca", "cl-es", "cn-zh", "co-es", "hr-hr", "cz-cs", "dk-da", "ee-et", "fi-fi", "fr-fr", "de-de", "gr-el", "hk-tzh", "hu-hu", "in-en", "id-id", "id-en", "ie-en", "il-he", "it-it", "jp-jp", "kr-kr", "lv-lv", "lt-lt", "xl-es", "my-ms", "my-en", "mx-es", "nl-nl", "nz-en", "no-no", "pe-es", "ph-en", "ph-tl", "pl-pl", "pt-pt", "ro-ro", "ru-ru", "sg-en", "sk-sk", "sl-sl", "za-en", "es-es", "se-sv", "ch-de", "ch-fr", "ch-it", "tw-tzh", "th-th", "tr-tr", "ua-uk", "uk-en", "us-en", "ue-es", "ve-es", "vn-vi", ) BROWSE_RESPONSE_MODE_VALUES: tuple[BrowseResponseMode, ...] 
= ("markdown", "text") def _resolve_env_placeholders(value: Any) -> Any: if isinstance(value, str): def replace(match: re.Match[str]) -> str: env_name = match.group(1) default_value = match.group(2) or "" return os.getenv(env_name, default_value) return ENV_PATTERN.sub(replace, value) if isinstance(value, list): return [_resolve_env_placeholders(item) for item in value] if isinstance(value, dict): return { key: _resolve_env_placeholders(item) for key, item in value.items() } return value def _as_dict(value: Any) -> dict[str, Any]: return value if isinstance(value, dict) else {} def _as_int(value: Any, default: int) -> int: try: return int(value) except (TypeError, ValueError): return default def _as_bool(value: Any, default: bool = False) -> bool: if value is None: return default if isinstance(value, bool): return value if isinstance(value, str): normalized = value.strip().lower() if normalized in {"1", "true", "yes", "on"}: return True if normalized in {"0", "false", "no", "off"}: return False return bool(value) def _as_optional_str(value: Any) -> str | None: if value is None: return None text = str(value).strip() if not text or text.lower() in {"none", "null"}: return None return text def _normalize_auth(value: Any) -> Any: if isinstance(value, str): normalized = _as_optional_str(value) if normalized and "input your api key" in normalized.lower(): return None return normalized if isinstance(value, dict): normalized = { str(key): item for key, item in value.items() if item not in (None, "", [], {}) } api_key = _as_optional_str(normalized.get("api_key")) if api_key is None: normalized.pop("api_key", None) else: normalized["api_key"] = api_key return normalized or None return value def _as_literal( value: Any, *, allowed: tuple[LiteralStrT, ...], default: LiteralStrT, ) -> LiteralStrT: if isinstance(value, str): candidate = value.strip() if candidate in allowed: return cast(LiteralStrT, candidate) lower_candidate = candidate.lower() for item in allowed: if 
lower_candidate == item.lower(): return item return default @dataclass(slots=True) class ModelConfig: provider: ModelProvider = "OpenAICompatible" base_url: str = "https://api.openai.com/v1" model: str = "gpt-4.1-mini" model_type: ModelType = "chat" auth: Any = None request_options: dict[str, Any] = field(default_factory=dict) proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "ModelConfig": block = _as_dict(raw.get("MODEL") or raw.get("model")) legacy_request_options = dict(_as_dict(raw.get("MODEL_OPTIONS"))) block_request_options = dict(_as_dict(block.get("request_options") or block.get("options"))) request_options = block_request_options or legacy_request_options model_name = block.get("model") or request_options.pop("model", None) or "gpt-4.1-mini" return cls( provider=_as_literal( block.get("provider") or raw.get("MODEL_PROVIDER"), allowed=MODEL_PROVIDER_VALUES, default="OpenAICompatible", ), base_url=str(block.get("base_url") or raw.get("MODEL_URL") or "https://api.openai.com/v1"), model=str(model_name), model_type=_as_literal( block.get("model_type"), allowed=MODEL_TYPE_VALUES, default="chat", ), auth=_normalize_auth(block.get("auth", raw.get("MODEL_AUTH"))), request_options=request_options, proxy=_as_optional_str(block.get("proxy")), ) def to_agently_settings(self, global_proxy: str | None = None) -> dict[str, Any]: settings: dict[str, Any] = { "base_url": self.base_url, "model": self.model, "model_type": self.model_type, "request_options": self.request_options, } proxy = self.proxy or global_proxy if proxy: settings["proxy"] = proxy if self.auth is not None: settings["auth"] = self.auth return settings @dataclass(slots=True) class SearchConfig: max_results: int = 8 timelimit: SearchNewsTimeLimit = "d" region: SearchRegion = "us-en" backend: SearchBackend = "auto" proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "SearchConfig": block = _as_dict(raw.get("SEARCH") or raw.get("search")) return cls( 
max_results=max(_as_int(block.get("max_results", raw.get("MAX_SEARCH_RESULTS")), 8), 1), timelimit=_as_literal( block.get("timelimit"), allowed=SEARCH_TIMELIMIT_VALUES, default="d", ), region=_as_literal( block.get("region"), allowed=SEARCH_REGION_VALUES, default="us-en", ), backend=_as_literal( block.get("backend"), allowed=SEARCH_BACKEND_VALUES, default="auto", ), proxy=_as_optional_str(block.get("proxy")), ) @dataclass(slots=True) class BrowseConfig: enable_playwright: bool = False playwright_headless: bool = True response_mode: BrowseResponseMode = "markdown" max_content_length: int = 12000 min_content_length: int = 80 proxy: str | None = None @classmethod def from_raw(cls, raw: dict[str, Any]) -> "BrowseConfig": block = _as_dict(raw.get("BROWSE") or raw.get("browse")) return cls( enable_playwright=_as_bool(block.get("enable_playwright"), False), playwright_headless=_as_bool(block.get("playwright_headless"), True), response_mode=_as_literal( block.get("response_mode"), allowed=BROWSE_RESPONSE_MODE_VALUES, default="markdown", ), max_content_length=max(_as_int(block.get("max_content_length"), 12000), 2000), min_content_length=max(_as_int(block.get("min_content_length"), 80), 20), proxy=_as_optional_str(block.get("proxy")), ) @dataclass(slots=True) class WorkflowConfig: max_column_num: int = 3 max_news_per_column: int = 3 output_language: str = "Chinese" column_concurrency: int = 3 summary_concurrency: int = 3 @classmethod def from_raw(cls, raw: dict[str, Any]) -> "WorkflowConfig": block = _as_dict(raw.get("WORKFLOW") or raw.get("workflow")) return cls( max_column_num=max(_as_int(block.get("max_column_num", raw.get("MAX_COLUMN_NUM")), 3), 1), max_news_per_column=max(_as_int(block.get("max_news_per_column"), 3), 1), output_language=str(block.get("output_language") or raw.get("OUTPUT_LANGUAGE") or "Chinese"), column_concurrency=max(_as_int(block.get("column_concurrency"), 3), 1), summary_concurrency=max(_as_int(block.get("summary_concurrency"), 3), 1), ) 
@dataclass(slots=True) class OutlineConfig: use_customized: bool = False customized: dict[str, Any] = field(default_factory=dict) @classmethod def from_raw(cls, raw: dict[str, Any]) -> "OutlineConfig": block = _as_dict(raw.get("OUTLINE") or raw.get("outline")) customized = block.get("customized", raw.get("CUSTOMIZE_OUTLINE")) or {} return cls( use_customized=_as_bool(block.get("use_customized", raw.get("USE_CUSTOMIZE_OUTLINE", False))), customized=customized if isinstance(customized, dict) else {}, ) @dataclass(slots=True) class OutputConfig: directory: str = "outputs" @classmethod def from_raw(cls, raw: dict[str, Any]) -> "OutputConfig": block = _as_dict(raw.get("OUTPUT") or raw.get("output")) return cls(directory=str(block.get("directory") or "outputs")) @dataclass(slots=True) class AppSettings: debug: bool = False proxy: str | None = None model: ModelConfig = field(default_factory=ModelConfig) search: SearchConfig = field(default_factory=SearchConfig) browse: BrowseConfig = field(default_factory=BrowseConfig) workflow: WorkflowConfig = field(default_factory=WorkflowConfig) outline: OutlineConfig = field(default_factory=OutlineConfig) output: OutputConfig = field(default_factory=OutputConfig) @classmethod def load(cls, path: str | Path) -> "AppSettings": config_path = Path(path) raw = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} if not isinstance(raw, dict): raise TypeError(f"Settings file must contain a dictionary, got: {type(raw)}") resolved = _resolve_env_placeholders(raw) return cls( debug=_as_bool(resolved.get("DEBUG", resolved.get("debug", False))), proxy=_as_optional_str(resolved.get("PROXY", resolved.get("proxy"))), model=ModelConfig.from_raw(resolved), search=SearchConfig.from_raw(resolved), browse=BrowseConfig.from_raw(resolved), workflow=WorkflowConfig.from_raw(resolved), outline=OutlineConfig.from_raw(resolved), output=OutputConfig.from_raw(resolved), ) ================================================ FILE: 
news_collector/logging_utils.py ================================================ from __future__ import annotations import logging from pathlib import Path def configure_logging(*, debug: bool, log_dir: str | Path) -> logging.Logger: target_dir = Path(log_dir) target_dir.mkdir(parents=True, exist_ok=True) logger = logging.getLogger("agently_daily_news_collector") logger.setLevel(logging.DEBUG if debug else logging.INFO) logger.propagate = False if logger.handlers: return logger formatter = logging.Formatter( "%(asctime)s [%(levelname)s] %(message)s", ) console_handler = logging.StreamHandler() console_handler.setLevel(logging.DEBUG if debug else logging.INFO) console_handler.setFormatter(formatter) file_handler = logging.FileHandler( target_dir / "collector.log", encoding="utf-8", ) file_handler.setLevel(logging.DEBUG if debug else logging.INFO) file_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.addHandler(file_handler) return logger ================================================ FILE: news_collector/markdown.py ================================================ from __future__ import annotations from typing import Any def _labels_for_language(language: str) -> dict[str, str]: normalized = language.lower() if "chinese" in normalized or normalized.startswith("zh"): return { "generated_at": "生成时间", "topic": "主题", "prologue": "导语", "news_list": "新闻列表", "source": "来源", "date": "日期", "summary": "摘要", "comment": "推荐理由", "model": "模型", } return { "generated_at": "Generated At", "topic": "Topic", "prologue": "Prologue", "news_list": "News List", "source": "Source", "date": "Date", "summary": "Summary", "comment": "Why It Matters", "model": "Model", } def render_markdown( *, report_title: str, generated_at: str, topic: str, language: str, columns: list[dict[str, Any]], model_label: str, ) -> str: labels = _labels_for_language(language) lines = [ f"# {report_title}", "", f"> {labels['generated_at']}: {generated_at}", f"> {labels['topic']}: 
{topic}", "", ] for column in columns: lines.extend( [ f"## {column['title']}", "", f"### {labels['prologue']}", "", column["prologue"], "", f"### {labels['news_list']}", "", ] ) for news in column["news_list"]: lines.append(f"- [{news['title']}]({news['url']})") meta_parts = [] if news.get("source"): meta_parts.append(f"{labels['source']}: {news['source']}") if news.get("date"): meta_parts.append(f"{labels['date']}: {news['date']}") if meta_parts: lines.append(f" - {' | '.join(meta_parts)}") lines.append(f" - {labels['summary']}: {news['summary']}") lines.append(f" - {labels['comment']}: {news['recommend_comment']}") lines.append("") lines.extend( [ "---", "", "Powered by [Agently 4](https://github.com/AgentEra/Agently)", "", f"{labels['model']}: {model_label}", ] ) return "\n".join(lines).strip() + "\n" ================================================ FILE: outputs/.gitkeep ================================================ ================================================ FILE: prompts/create_outline.yaml ================================================ input: topic: ${topic} today: ${today} info: output_language: ${language} max_column_num: ${max_column_num} instruct: - Design a daily news report outline about {input.topic}. - Focus on recent and genuinely newsworthy developments, not timeless background material. - Create between 2 and {info.max_column_num} distinct columns. - Each column should have a clear editorial angle and usable search keywords. - column_requirement should describe a practical coverage direction, not an overly narrow fact checklist. - Avoid making column_requirement depend on very specific version numbers, exact financing stages, exact pull requests, or unverified assumptions unless the topic itself clearly requires that specificity. - Prefer column angles that can still include adjacent and strongly related developments if they help readers understand the topic. 
- search_keywords must be concise and search-engine friendly, ideally 4-10 terms instead of a long sentence. - Prefer entity names and English or mixed-language keywords when the topic includes product names, project names, or proper nouns. - Avoid stuffing search_keywords with excessive years, clauses, or explanatory text. - report_title should look like a publishable briefing headline in {info.output_language}. output: report_title: $type: str $desc: natural title for the final news report column_list: $type: - column_title: $type: str $desc: concise column heading column_requirement: $type: str $desc: practical coverage direction for stories in this column; keep it specific but not overly narrow search_keywords: $type: str $desc: practical search query that includes the topic and matches this column angle $desc: list of columns for the report ================================================ FILE: prompts/pick_news.yaml ================================================ input: ${column_news} info: column_title: ${column_title} column_requirement: ${column_requirement} max_news_per_column: ${max_news_per_column} instruct: - Review the candidate news list for the current column. - This stage is a shortlist stage, so prioritize recall over precision. - Prefer concrete, recent, source-backed stories that plausibly satisfy or strongly relate to {info.column_requirement}. - Keep clearly relevant or potentially relevant stories for the next summarizing step, even if the snippet is incomplete. - Reject only obvious duplicates, clearly unrelated stories, or items that are too thin to justify browsing. - Keep at most {info.max_news_per_column} items with can_use equal to true. - Use relevance_score from 0 to 10. 
output: - id: $type: int $desc: value from {input.[].id} can_use: $type: bool $desc: whether this item should enter the shortlist for later webpage reading relevance_score: $type: int $desc: ranking score from 0 to 10 for shortlisted items recommend_comment: $type: str $desc: short why-it-matters note if can_use is true, otherwise empty string ================================================ FILE: prompts/summarize_news.yaml ================================================ input: ${news_content} info: news_title: ${news_title} column_requirement: ${column_requirement} language: ${language} instruct: - Read the browsed page content and extract the core facts related to {info.news_title}. - This stage is the final relevance check because you can read the full page content here. - If the page content is too thin, inaccessible, or clearly irrelevant to {info.column_requirement}, set can_summarize to false. - If the page contains enough concrete facts that are meaningfully related to the column, set can_summarize to true even when the fit is not perfect. - summary must be one compact paragraph in {info.language}. - Focus on facts that explain why the story belongs in this column. - When the article is only partially related, summarize only the relevant part instead of rejecting it outright. output: can_summarize: $type: bool $desc: whether the page contains enough useful information to summarize summary: $type: str $desc: one-paragraph summary in {info.language}, or empty string if can_summarize is false ================================================ FILE: prompts/write_column.yaml ================================================ input: ${news_list} info: column_title: ${column_title} column_requirement: ${column_requirement} language: ${language} instruct: - Build the final column using the shortlisted stories. - Keep the strongest non-duplicated stories only. - prologue should be 2 to 3 sentences in {info.language}. 
- When useful, cite key stories with Markdown links like [title](url). output: prologue: $type: str $desc: opening paragraph for this column in {info.language} news_list: $type: - id: $type: int $desc: value from {input.[].id} recommend_comment: $type: str $desc: refined why-it-matters note for the selected story $desc: ordered list of stories to keep in the final column ================================================ FILE: requirements.txt ================================================ agently>=4.0.8.3 PyYAML>=6.0.2 ddgs>=9.10.0 beautifulsoup4>=4.12.3 python-dotenv>=1.0.1 ================================================ FILE: tools/README.md ================================================ # Tools Layer `tools/` 是项目的可替换工具适配层。 默认实现: - `tools/builtin.py` - 直接封装 Agently v4 内置 `Search` / `Browse` 当前入口: - `tools/__init__.py` 如果你想替换为自己的搜索或网页抓取实现,只需要: 1. 新建一个模块,例如 `tools/custom.py` 2. 实现 `SearchToolProtocol` 和 `BrowseToolProtocol` 对应的方法 3. 在 `tools/__init__.py` 中把 `create_search_tool` / `create_browse_tool` 改为导出你的工厂函数 最小接口约束: ```python class SearchToolProtocol(Protocol): async def search_news( self, *, query: str, timelimit: SearchNewsTimeLimit, max_results: int, ) -> list[dict[str, Any]]: ... class BrowseToolProtocol(Protocol): async def browse(self, url: str) -> str: ... 
```
================================================
FILE: tools/__init__.py
================================================
# Public entry point of the replaceable tools layer: re-exports the tool
# protocols and the factory functions for the built-in implementations.
from .base import BrowseToolProtocol, SearchToolProtocol
from .builtin import create_browse_tool, create_search_tool

__all__ = [
    "BrowseToolProtocol",
    "SearchToolProtocol",
    "create_browse_tool",
    "create_search_tool",
]

================================================
FILE: tools/base.py
================================================
from __future__ import annotations

from typing import Any, Protocol

from news_collector.config import SearchNewsTimeLimit


class SearchToolProtocol(Protocol):
    """Structural (duck-typed) interface for a news-search tool."""

    async def search_news(
        self,
        *,
        query: str,
        timelimit: SearchNewsTimeLimit,
        max_results: int,
    ) -> list[dict[str, Any]]: ...


class BrowseToolProtocol(Protocol):
    """Structural (duck-typed) interface for a webpage-fetching tool."""

    async def browse(self, url: str) -> str: ...

================================================
FILE: tools/builtin.py
================================================
from __future__ import annotations

from typing import Any

from agently.builtins.tools import Browse, Search
from ddgs.exceptions import DDGSException

from news_collector.config import AppSettings, SearchNewsTimeLimit

from .base import BrowseToolProtocol, SearchToolProtocol


class AgentlyBuiltinSearchTool(SearchToolProtocol):
    """Search tool wrapping Agently v4's built-in `Search` (ddgs-backed)."""

    def __init__(self, settings: AppSettings):
        # The search-specific proxy takes precedence over the global proxy.
        self._tool = Search(
            proxy=settings.search.proxy or settings.proxy,
            region=settings.search.region,
            backend=settings.search.backend,
        )

    async def search_news(
        self,
        *,
        query: str,
        timelimit: SearchNewsTimeLimit,
        max_results: int,
    ) -> list[dict[str, Any]]:
        """Return news search results for `query`; [] when nothing is found."""
        try:
            results = await self._tool.search_news(
                query=query,
                timelimit=timelimit,
                max_results=max_results,
            )
        except DDGSException as exc:
            # ddgs signals an empty result set by raising; treat that one
            # case as a normal empty answer and re-raise everything else.
            if "No results found" in str(exc):
                return []
            raise
        # Defensive: normalize any non-list payload to an empty result set.
        return results if isinstance(results, list) else []


class AgentlyBuiltinBrowseTool(BrowseToolProtocol):
    """Browse tool wrapping Agently v4's built-in `Browse`."""

    def __init__(self, settings: AppSettings):
        # The browse-specific proxy takes precedence over the global proxy.
        self._tool = Browse(
            proxy=settings.browse.proxy or settings.proxy,
            enable_pyautogui=False,
            enable_playwright=settings.browse.enable_playwright,
            enable_bs4=True,
            response_mode=settings.browse.response_mode,
            max_content_length=settings.browse.max_content_length,
            min_content_length=settings.browse.min_content_length,
            playwright_headless=settings.browse.playwright_headless,
        )

    async def browse(self, url: str) -> str:
        """Fetch `url` and return the extracted page content ("" when empty)."""
        result = await self._tool.browse(url)
        return str(result or "")


def create_search_tool(settings: AppSettings) -> SearchToolProtocol:
    # Factory hook: replace the returned implementation to plug in a
    # custom search tool (see tools/README.md).
    return AgentlyBuiltinSearchTool(settings)


def create_browse_tool(settings: AppSettings) -> BrowseToolProtocol:
    # Factory hook: replace the returned implementation to plug in a
    # custom browse tool (see tools/README.md).
    return AgentlyBuiltinBrowseTool(settings)

================================================
FILE: v3/README.md
================================================

Agently-Daily-News-Collector

English | 中文说明
**Agently Daily News Collector** is an open-source, LLM-based, automated news-collecting workflow showcase project powered by the [**_Agently_** AI application development framework](https://github.com/Maplemx/Agently). You can use this project to generate a news collection on almost any topic. All you need to do is simply input the field topic of your news collection. Then you wait, and the AI agents will do their jobs automatically until a high-quality news collection is generated and saved into a markdown file.

News collection file examples:

`MarkDown File` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models2024-05-02.md)

`PDF File` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models%202024-05-02.pdf)

> **ℹ️ Notice:**
>
> Visit https://github.com/Maplemx/Agently if you want to learn more about the **_Agently_** AI application development framework.

## How to Use

### Step 1: Clone this repo

Run this command in shell:

```shell
git clone git@github.com:AgentEra/Agently-Daily-News-Collector.git
```

### Step 2: Edit settings YAML file

You can find the [`SETTINGS.yaml`](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/SETTINGS.yaml) file in the project dir. Input your model's API key and change other settings as you wish.

If you want to use another model, you can read [this document](https://github.com/Maplemx/Agently/blob/main/docs/guidebook/application_development_handbook.ipynb) or [this Agently official website page](http://agently.tech/features/model_request.html) to see how to set the settings.

### Step 3: Start

Because this project is a Python project, you need to install Python first. You can find installation instructions on the [Python official website](https://www.python.org/).
At the first time to run this project, you should use this command in shell to download and install dependency packages: ```shell pip install -r path/to/project/requirements.txt ``` Wait until the dependency packages are installed then use this command in shell to start the generation process. ```shell python path/to/project/app.py ``` You will see a tip `[Please input the topic of your daily news collection]:`. Input your topic idea about the field of news that you want to collect, then you're good to go. During the process, there'll be some logs printed to shell to present what tasks are done like this: ```shell 2024-05-02 22:44:27,347 [INFO] [Outline Generated] {'report_title': "Today's news about AI Models Appliaction", 'column_list': [{'column_title': 'Latest News', 'column_requirement': 'The content is related to AI Models Appliaction, and the time is within 24 hours', 'search_keywords': 'AI Models Appliaction news latest'}, {'column_title': 'Hot News', 'column_requirement': 'The content is related to AI Models Appliaction, and the interaction is high', 'search_keywords': 'AI Models Appliaction news hot'}, {'column_title': 'Related News', 'column_requirement': 'The content is related to AI Models Appliaction, but not news', 'search_keywords': 'AI Models Appliaction report'}]} 2024-05-02 22:44:32,352 [INFO] [Start Generate Column] Latest News 2024-05-02 22:44:34,132 [INFO] [Search News Count] 8 2024-05-02 22:44:46,062 [INFO] [Picked News Count] 2 2024-05-02 22:44:46,062 [INFO] [Summarzing] With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma 2024-05-02 22:44:52,579 [INFO] [Summarzing] Success 2024-05-02 22:44:57,580 [INFO] [Summarzing] Over 500 AI models are now optimised for Core Ultra processors, says Intel 2024-05-02 22:45:02,130 [INFO] [Summarzing] Success 2024-05-02 22:45:19,475 [INFO] [Column Data Prepared] {'title': 'Latest News', 'prologue': 'Stay up-to-date with the latest 
advancements in AI technology with these news updates: [Yseop Partners with AWS to Develop Generative AI for BioPharma](https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html) and [Intel Optimizes Over 500 AI Models for Core Ultra Processors](https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html).', 'news_list': [{'url': 'https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html', 'title': 'With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma', 'summary': "Yseop utilizes AWS to create a new Generative AI application for the Biopharma sector. This application leverages AWS for its scalability and security, and it allows Biopharma companies to bring pharmaceuticals and vaccines to the market more quickly. Yseop's platform integrates LLM models for generating scientific content while meeting the security standards of the pharmaceutical industry.", 'recommend_comment': 'AWS partnership helps Yseop develop an innovative Generative AI application for the BioPharma industry, enabling companies to expedite the delivery of pharmaceuticals and vaccines to market. The integration of LLM models and compliance with stringent pharmaceutical industry security standards make this a valuable solution for BioPharma companies.'}, {'url': 'https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html', 'title': 'Over 500 AI models are now optimised for Core Ultra processors, says Intel', 'summary': 'Intel stated over 500 AI models are optimized for Core Ultra processors. 
These models are accessible from well-known sources like OpenVINO Model Zoo, Hugging Face, ONNX Model Zoo, and PyTorch.', 'recommend_comment': "Intel's optimization of over 500 AI models for Core Ultra processors provides access to a vast selection of pre-trained models from reputable sources. This optimization enhances the performance and efficiency of AI applications, making it easier for developers to deploy AI solutions on Intel-based hardware."}]}
```

The whole process will take some time, so just relax and have some rest☕️.

### Step 4: Get your news collection markdown file!

When the process finally finishes, you will see a tip like this, with the generated markdown text printed on screen:

```shell
2024-05-02 21:57:20,521 [INFO] [Markdown Generated]
```

Then you can find a markdown file named after the report title (ending in `.md`) in your project dir. Enjoy it! 😄

---

## Main Dependencies

- **Agently AI Development Framework**: https://github.com/Maplemx/Agently | https://pypi.org/project/Agently/
- **duckduckgo-search**: https://pypi.org/project/duckduckgo-search/
- **BeautifulSoup4**: https://pypi.org/project/beautifulsoup4/
- **PyYAML**: https://pypi.org/project/pyyaml/

---

Please ⭐️ this repo and the [Agently](https://github.com/Maplemx/Agently) main repo if you like it! Thank you very much!

> 💡 Ideas / Bug Report: [Report Issues Here](https://github.com/AgentEra/Agently-Daily-News-Collector/issues)
>
> 📧 Email Us: [developer@agently.cn](mailto:developer@agently.cn)
>
> 👾 Discord Group:
>
> [Click Here to Join](https://discord.gg/4HnarMBpYT) or Scan the QR Code Down Below
>
> image
>
> 💬 WeChat Group(加入微信群):
>
> [Click Here to Apply](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan the QR Code Down Below
>
> image

================================================ FILE: v3/README_CN.md ================================================

Agently-Daily-News-Collector

Agently 新闻汇总报告生成器

English Introduction | 中文说明
**Agently新闻汇总报告生成器**是一个基于[**_Agently_** AI应用开发框架](https://github.com/Maplemx/Agently)开发的应用项目。本项目构建了**基于大语言模型驱动的全自动工作流**,能够根据用户输入的主题关键词,自动完成新闻汇总报告的结构设计、栏目组稿(含新闻检索、筛查、总结、栏目信息撰写)及报告MarkDown格式文件的输出全过程。同时,本项目**完全开源**,欢迎开发者们通过Fork->PR的方式共同优化。 新闻汇总报告的样例可参考: `MarkDown文件` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models2024-05-02.md) `PDF文件` [Latest Updates on AI Models 2024-05-02](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/examples/Latest%20Updates%20on%20AI%20Models%202024-05-02.pdf) > 如果您希望进一步了解[**_Agently_** AI应用开发框架](https://github.com/Maplemx/Agently),您可以访问框架的[主仓库地址](https://github.com/Maplemx/Agently)或是[中文官网](http://Agently.cn)阅读更多相关信息,框架提供了丰富的教程和案例,帮助您逐步上手。 ## 如何使用 ### 第一步:将本仓库Clone到本地 在您的开发目录中使用以下Shell脚本指令: ```shell git clone git@github.com:AgentEra/Agently-Daily-News-Collector.git ``` ### 第二步:修改SETTINGS.yaml设置文件 您可以在Clone到本地的项目文件夹中找到[`SETTINGS.yaml`](https://github.com/AgentEra/Agently-Daily-News-Collector/blob/main/SETTINGS.yaml)这个文件,再根据您的需要修改其中的设置项即可。 下面是具体的设置项说明: ```yaml # Debug Settings IS_DEBUG: false # 如果此项为true,将会输出更多执行过程信息,包括搜索和模型请求的明细信息 # Proxy Settings PROXY: http://127.0.0.1:7890 # 项目中的搜索和模型请求可能会需要使用前向代理,可以通过此项设置代理信息 # Model Settings MODEL_PROVIDER: OAIClient #默认使用OpenAI格式的兼容客户端,此客户端能够适配OpenAI以及各类兼容OpenAI格式的本地模型 MODEL_URL: http://base_url_path # 如果您需要修改Base URL,使用此项进行设置 MODEL_AUTH: api_key: "" # 在这里输入鉴权用的API-Key信息 MODEL_OPTIONS: # 在这里指定模型需要的其他参数,如指定具体的模型,或是调整temperature model: gpt-3.5-turbo temperature: 0.8 # Application Settings MAX_COLUMN_NUM: 3 # 在这里设置汇总报告结构中的专栏数量 OUTPUT_LANGUAGE: Chinese # 在这里设置汇总报告的输出语种,默认为英语,您可能需要手动改成中文 MAX_SEARCH_RESULTS: 8 # 在这里设置每个栏目搜索的最大结果数量 # 注意,如果数量设置过大,可能会导致超出模型的处理窗口大小,请根据模型具体情况设置 SLEEP_TIME: 5 # 在这里设置每次模型请求后的等待时间,以防止频繁请求导致模型拒绝访问 ``` 如果您想要了解切换其他模型的更多细节,可以阅读Agently官方网站关于[模型设置的说明页面](http://agently.tech/features/model_request.html)。 ### 第三步:启动任务
因为本项目为Python项目,您需要在本地安装Python环境。您可以在[Python官方网站](https://www.python.org/)找到适合您的安装方法。 然后,在您的项目目录下使用以下Shell脚本指令更新项目依赖包: ```shell pip install -r requirements.txt ``` 依赖包安装完毕后,通过以下Shell脚本指令即可启动: ```shell python app.py ``` 随后您会看到一个提示:`[Please input the topic of your daily news collection]:`。 根据提示输入您想要汇总的新闻领域主题关键词,或是用一句话描述您想要生成什么样的新闻汇总报告,然后任务就会开始自动运行了。在这里,您可以输入任何语种的内容,但生成内容的语种会和您在第二步中的设置的语种要求相同。 接下来您就可以等待运行的结果了,整个过程大约需要5-8分钟。 在运行的过程中,您会看到类似下面展示的输出日志,这些日志将帮助您了解当前在处理的任务,以及运行的关键进展情况: ```shell 2024-05-02 22:44:27,347 [INFO] [Outline Generated] {'report_title': "Today's news about AI Models Appliaction", 'column_list': [{'column_title': 'Latest News', 'column_requirement': 'The content is related to AI Models Appliaction, and the time is within 24 hours', 'search_keywords': 'AI Models Appliaction news latest'}, {'column_title': 'Hot News', 'column_requirement': 'The content is related to AI Models Appliaction, and the interaction is high', 'search_keywords': 'AI Models Appliaction news hot'}, {'column_title': 'Related News', 'column_requirement': 'The content is related to AI Models Appliaction, but not news', 'search_keywords': 'AI Models Appliaction report'}]} 2024-05-02 22:44:32,352 [INFO] [Start Generate Column] Latest News 2024-05-02 22:44:34,132 [INFO] [Search News Count] 8 2024-05-02 22:44:46,062 [INFO] [Picked News Count] 2 2024-05-02 22:44:46,062 [INFO] [Summarzing] With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma 2024-05-02 22:44:52,579 [INFO] [Summarzing] Success 2024-05-02 22:44:57,580 [INFO] [Summarzing] Over 500 AI models are now optimised for Core Ultra processors, says Intel 2024-05-02 22:45:02,130 [INFO] [Summarzing] Success 2024-05-02 22:45:19,475 [INFO] [Column Data Prepared] {'title': 'Latest News', 'prologue': 'Stay up-to-date with the latest advancements in AI technology with these news updates: [Yseop Partners with AWS to Develop Generative AI for 
BioPharma](https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html) and [Intel Optimizes Over 500 AI Models for Core Ultra Processors](https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html).', 'news_list': [{'url': 'https://finance.yahoo.com/news/support-aws-yseop-develops-unique-130000171.html', 'title': 'With Support from AWS, Yseop Develops a Unique Generative AI Application for Regulatory Document Generation Across BioPharma', 'summary': "Yseop utilizes AWS to create a new Generative AI application for the Biopharma sector. This application leverages AWS for its scalability and security, and it allows Biopharma companies to bring pharmaceuticals and vaccines to the market more quickly. Yseop's platform integrates LLM models for generating scientific content while meeting the security standards of the pharmaceutical industry.", 'recommend_comment': 'AWS partnership helps Yseop develop an innovative Generative AI application for the BioPharma industry, enabling companies to expedite the delivery of pharmaceuticals and vaccines to market. The integration of LLM models and compliance with stringent pharmaceutical industry security standards make this a valuable solution for BioPharma companies.'}, {'url': 'https://www.business-standard.com/technology/tech-news/over-500-ai-models-are-now-optimised-for-core-ultra-processors-says-intel-124050200482_1.html', 'title': 'Over 500 AI models are now optimised for Core Ultra processors, says Intel', 'summary': 'Intel stated over 500 AI models are optimized for Core Ultra processors. These models are accessible from well-known sources like OpenVINO Model Zoo, Hugging Face, ONNX Model Zoo, and PyTorch.', 'recommend_comment': "Intel's optimization of over 500 AI models for Core Ultra processors provides access to a vast selection of pre-trained models from reputable sources. 
This optimization enhances the performance and efficiency of AI applications, making it easier for developers to deploy AI solutions on Intel-based hardware."}]} ``` ### 第四步:得到一份新鲜出炉的新闻汇总报告📰! 在整个处理过程结束时,您将会看到类似下方的提示,并可以看到完整的报告MarkDown格式结果被输出到屏幕上: ```shell 2024-05-02 21:57:20,521 [INFO] [Markdown Generated] ``` 同时,您也可以在您的项目文件夹中找到一份命名格式为`<汇总报告名称> <生成日期>.md`的文件。 大功告成!🎉 --- ## 主要依赖说明 - Agently AI应用开发框架:https://github.com/Maplemx/Agently | https://pypi.org/project/Agently/ | http://Agently.cn - duckduckgo-search: https://pypi.org/project/duckduckgo-search/ - BeautifulSoup4: https://pypi.org/project/beautifulsoup4/ - PyYAML: https://pypi.org/project/pyyaml/ --- 如果您喜欢这个项目,请为本项目以及[Agently框架主仓库](https://github.com/Maplemx/Agently)点亮⭐️。 如果您希望了解更多关于本项目的线上产品化版本信息,欢迎通过下面的方式加入我们的讨论群,我们将在近期组织线上产品化版本的测试。 > 💡 意见反馈/Bug提交: [Report Issues Here](https://github.com/AgentEra/Agently-Daily-News-Collector/issues) > > 📧 联系我们: [developer@agently.cn](mailto:developer@agently.cn) > > 💬 加入微信讨论群: > > [点击这里填写申请表](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf)或扫描下方二维码申请入群 > > image ================================================ FILE: v3/SETTINGS.yaml ================================================ # Debug Settings IS_DEBUG: false # Proxy Settings #PROXY: http://127.0.0.1:7890 # Model Settings MODEL_PROVIDER: OAIClient #MODEL_URL: MODEL_AUTH: api_key: "" MODEL_OPTIONS: model: gpt-3.5-turbo # Application Settings MAX_COLUMN_NUM: 3 OUTPUT_LANGUAGE: English MAX_SEARCH_RESULTS: 8 SLEEP_TIME: 5 # Outline Settings USE_CUSTOMIZE_OUTLINE: false CUSTOMIZE_OUTLINE: report_title: "Today's News about Large Model Applications" column_list: - column_title: New Apps column_requirement: Looking for those applications powered by large models which announced recently search_keywords: large model application announce this week - column_title: Hot Apps column_requirement: Looking for those applications powered by large models which are most popular or are discussed most search_keywords: large 
model application popular hot - column_title: Fun Apps column_requirement: Looking for those applications powered by large models which are funny or inspirational search_keywords: large model application cool fun inspire ================================================ FILE: v3/app.py ================================================ import Agently import utils.yaml_reader as yaml from utils.logger import Logger from workflows import main_workflow from utils.path import root_path # Settings and Logger SETTINGS = yaml.read("./SETTINGS.yaml") logger = Logger(console_level = "DEBUG" if SETTINGS.IS_DEBUG else "INFO") # Agent Factory agent_factory = ( Agently.AgentFactory(is_debug=SETTINGS.IS_DEBUG) .set_settings("current_model", SETTINGS.MODEL_PROVIDER) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.auth", SETTINGS.MODEL_AUTH) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.url", SETTINGS.MODEL_URL if hasattr(SETTINGS, "MODEL_URL") else None) .set_settings(f"model.{ SETTINGS.MODEL_PROVIDER }.options", SETTINGS.MODEL_OPTIONS if hasattr(SETTINGS, "MODEL_OPTIONS") else {}) ) # Start Workflow main_workflow.start( agent_factory=agent_factory, SETTINGS=SETTINGS, root_path=root_path, logger=logger, ) ================================================ FILE: v3/examples/Latest Updates on AI Models2024-05-02.md ================================================ # Latest Updates on AI Models > 2024-05-02 Thursday ## Industry Trends ### PROLOGUE > The selected news articles are related to current trends and developments in the field of AI models. They cover various aspects of AI implementation across industries, from real-time pharma news delivery and control room optimization to accelerated adoption of AI and competitive dynamics in the AI sector. The articles provide insights into how AI is shaping operational efficiency, innovation, decision-making, and industry-specific tasks, reflecting the increasing utilization of AI technology for growth and productivity enhancement. 
### NEWS LIST - [AppliedXL Collaborates with Bloomberg to Provide AI-Powered, Real-Time Pharma News on the Bloomberg Terminal](https://www.lelezard.com/en/news-21360719.html) - `[summray]` AppliedXL collaborates with Bloomberg to provide AI-powered, real-time pharma news on the Bloomberg Terminal. The collaboration aims to deliver key insights to help users stay ahead of catalyst events in the pharmaceutical industry. AppliedXL's AI technology analyzes live public data to uncover signals and trends, which are then distilled into early news stories included in real-time news feeds for early signal detection and market analysis. The collaboration focuses on the life sciences and biopharma space, alerting users to irregularities in clinical trial progressions and other market-moving events. AppliedXL's AI technology combines machine learning and human expertise to provide precise and contextualized information efficiently. - `[comment]` This news article discusses how AppliedXL collaborates with Bloomberg to provide AI-powered, real-time pharma news on the Bloomberg Terminal. The collaboration aims to deliver key insights to help users stay ahead of catalyst events in the pharmaceutical industry, showcasing the use of AI in delivering real-time industry updates. - [AI for control rooms](https://www.symmetrymagazine.org/article/ai-for-control-rooms?language_content_entity=und) - `[summray]` AI is being utilized in control rooms within the fields of particle physics and astrophysics to assist with complex tasks. From machine learning algorithms helping to keep particle beams flowing in accelerators to optimizing telescope scheduling for studying galaxies, AI is proving to be a valuable tool for scientists. Additionally, AI is being developed to aid electric grid operators in managing the increasing number of energy resources connecting to the grid. 
The goal is not to replace human operators but to enhance decision-making by presenting them with the best tool options immediately and learning from human feedback. - `[comment]` The article highlights the use of AI in control rooms within particle physics and astrophysics, assisting with complex tasks. It showcases how AI is enhancing decision-making and presenting the best tool options immediately to human operators, aligning with the current trend of utilizing AI to optimize processes. - [Six AI industry trends we're tracking in 2024 (and beyond)](https://diginomica.com/six-ai-industry-trends-were-tracking-2024-and-beyond) - `[summray]` In 2024, the adoption of AI across industries has accelerated significantly, with projections indicating that by 2040, 1.3 million businesses will be utilizing AI to drive innovation. Various sectors such as the telecom industry, manufacturing, energy, utilities, construction, asset-centric service providers, and defense companies are leveraging AI and automation to enhance operational efficiency, drive performance, accelerate evolution, alleviate challenges, transform fleet management, and strengthen cybersecurity. As organizations invest in advanced technology like AI to optimize processes and automate industry-specific tasks, the potential for growth and productivity enhancement is vast, signaling a shift towards more resilient and digitally transformed operations. - `[comment]` The content discusses the accelerated adoption of AI across various industries, enhancing operational efficiency and driving innovation. It reflects the trend of organizations investing in advanced technology like AI to optimize processes and automate industry-specific tasks for enhanced growth and productivity. 
- [Microsoft's Fear Of Google's AI Dominance Led To OpenAI Investment, Internal Email Reveals: 'We're Multiple Years Behind The Competition'](https://www.benzinga.com/news/24/05/38582364/microsofts-fear-of-googles-ai-dominance-led-to-openai-investment-internal-email-reveals-were-multipl) - `[summray]` An internal email from Microsoft Corp. revealed that the company's investment in OpenAI was motivated by the fear of falling behind Google in AI capabilities. Microsoft's chief technology officer Kevin Scott expressed concerns about the lack of machine learning scale, infrastructure, and development speed compared to Google and OpenAI. The email highlighted the intense competition in the AI space, with Microsoft investing over $13 billion in OpenAI to enhance various services. The email sheds light on the rivalry between Microsoft and Google in the AI sector, with Google introducing Bard (now Gemini) to compete with OpenAI's ChatGPT, facing some challenges during the launch. This news article reflects the current trends and developments in AI models and the competitive landscape in the industry. - `[comment]` The news reveals Microsoft's investment in OpenAI motivated by the fear of falling behind Google in AI capabilities. It sheds light on the intense competition in the AI space, showcasing the current trends and developments in AI models and the competitive landscape in the industry. - [Q1 2024 Cognizant Technology Solutions Corp Earnings Call](https://finance.yahoo.com/news/q1-2024-cognizant-technology-solutions-123608449.html) - `[summray]` Cognizant Technology Solutions reported on their Q1 2024 Earnings Call, highlighting progress against strategic priorities in a challenging demand environment. They delivered revenue growth exceeding guidance, expanded adjusted operating margin, and noted improvements in voluntary attrition. The company saw sequential growth in Health Sciences and Communications, Media and Technology, with declines in Financial Services. 
The demand environment remains uncertain, shifting client spending to cost-saving projects. Cognizant focuses on innovation, including AI, cloud, and digital technologies. They mentioned partnerships with Microsoft, Google Cloud, and NVIDIA for AI initiatives. The company emphasized the importance of collaboration, cited recognition for innovation, and highlighted their Bluebolt grassroots initiative. Overall, they aim to increase revenue growth, become an employer of choice, and simplify operations. - `[comment]` Cognizant Technology Solutions' Q1 2024 Earnings Call highlights their focus on innovation, including AI, cloud, and digital technologies. The partnerships with Microsoft, Google Cloud, and NVIDIA for AI initiatives showcase the ongoing trend of companies leveraging AI for growth and becoming employers of choice. ## Innovations and Research ### PROLOGUE > Recent innovations and breakthroughs in the AI models domain are highlighted in the selected news articles. China's advancements in AI technologies, including the SenseNova 5.0 large language model and Vidu text-to-video AI tool, demonstrate the country's commitment to cutting-edge AI developments. Additionally, the rise of generative AI is emphasized as a key trend for driving innovation and organizational growth. Furthermore, a team of researchers has outlined guidelines for the responsible use of machine learning in science, aiming to enhance credibility and reproducibility in research. 
Explore more about these advancements and guidelines in the following articles: [China's AI Advances](https://swarajyamag.com/technology/chinas-ai-advances-that-are-flying-under-the-radar), [Generative AI's Exponential Potential](https://www.forbes.com/sites/forbestechcouncil/2024/05/02/innovators-should-seize-on-generative-ais-exponential-potential/), [Science's AI Problem](https://www.sciencedaily.com/releases/2024/05/240501153055.htm) ### NEWS LIST - [China's AI Advances That Are Flying Under The Radar](https://swarajyamag.com/technology/chinas-ai-advances-that-are-flying-under-the-radar) - `[summray]` China is making significant advancements in Artificial Intelligence (AI), with recent releases rivalling those in the United States. SenseTime unveiled the SenseNova 5.0 large language model (LLM) with impressive capabilities in knowledge, mathematics, reasoning, and coding. The model surpasses OpenAI's GPT-4 Turbo and tops various multimodal benchmarks. Another innovation is Vidu, a text-to-video AI tool that can generate 16-second videos based on simple text prompts. Additionally, Stardust Intelligence introduced the Astribot S1 humanoid robot, capable of performing household chores and imitating human movements. China is demonstrating seriousness in its AI ambitions, with over 40 approved AI models for public use and a vision to empower billions of people with AI robot assistants. - `[comment]` China's advancements in AI, such as the SenseNova 5.0 large language model and Vidu text-to-video AI tool, showcase the country's commitment to innovative technologies in the AI domain. - [Innovators Should Seize On Generative AI's Exponential Potential](https://www.forbes.com/sites/forbestechcouncil/2024/05/02/innovators-should-seize-on-generative-ais-exponential-potential/) - `[summray]` Generative AI is identified as a significant trend in the tech industry that necessitates rapid adaptation. 
The market for generative AI is projected to grow rapidly, with organizations investing in the technology to drive innovation. McKinsey details how generative AI can accelerate organizational growth by rapidly processing information, writing code for self-improvement, and enhancing competitive edge. By utilizing generative AI tools tailored for each phase of innovation, organizations can revamp their innovation processes to tap into the technology's potential. The importance of experimentation, prototyping, and scaling is emphasized, with generative AI offering various tools to aid in these processes. The democratization of innovation across employees and the augmentation of emerging technologies hold promise for accelerating organization's adaptability and competitiveness in leveraging generative AI for innovation. - `[comment]` Generative AI is a crucial trend in tech, with potential to drive rapid innovation and organizational growth. Organizations should leverage generative AI tools for revolutionizing their innovation processes. - [Science has an AI problem: This group says they can fix it](https://www.sciencedaily.com/releases/2024/05/240501153055.htm) - `[summray]` An interdisciplinary team of 19 researchers, led by Princeton University computer scientists Arvind Narayanan and Sayash Kapoor, has published guidelines for the responsible use of machine learning in science to address the credibility crisis in research caused by deep flaws in machine learning methods. The guidelines focus on transparency and integrity, calling for detailed descriptions of machine learning models, code, data, hardware specifications, experimental design, and project goals. The aim is to ensure reproducibility of results, validate claims, and accelerate scientific progress by improving the quality of published papers. - `[comment]` A team of researchers has provided guidelines for responsible use of machine learning in science to address credibility issues. 
Transparency and integrity in machine learning models are crucial for reproducibility of results and scientific progress. ## Future Outlook ### PROLOGUE > The following article delves into the future prospects, challenges, and potential advancements of AI models in the context of business operations and employee dynamics. It explores the impact of technological advancements, particularly Artificial Intelligence (AI), on businesses and employees, focusing on layoffs resulting from automation. For more information, you can visit the article [Layoffs in the wake of technological advancements: The inherent benefits for businesses and employees](https://www.ghanaweb.com/GhanaHomePage/business/Layoffs-in-the-wake-of-technological-advancements-The-inherent-benefits-for-businesses-and-employees-1928854). ### NEWS LIST - [Layoffs in the wake of technological advancements: The inherent benefits for businesses and employees](https://www.ghanaweb.com/GhanaHomePage/business/Layoffs-in-the-wake-of-technological-advancements-The-inherent-benefits-for-businesses-and-employees-1928854) - `[summray]` The article discusses the impact of technological advancements, particularly Artificial Intelligence (AI), on businesses and employees, focusing on layoffs as a result of automation. It highlights the benefits and challenges of AI in the workplace, such as increased productivity, job displacement, and layoff exercises. The causes of layoffs, including economic downturns, technological advancements, restructuring, shifting consumer preferences, and cost-saving measures, are explored, along with the opportunities they bring for businesses. Additionally, the article outlines the benefits of layoffs for employees, such as severance packages, career reevaluation, increased market value, networking opportunities, personal growth, and entrepreneurial opportunities. 
- `[comment]` The article provides insights into the impact of technological advancements, specifically AI, on businesses and employees, highlighting the challenges and benefits associated with layoffs. It is a relevant read for understanding the future prospects of AI models in the workplace. --- Powered by [Agently AI Application Development Framework & Agently Workflow](https://github.com/Maplemx/Agently) Model Information:OAIClient - {'model': 'gpt-3.5-turbo'} **_Agently_** [Guidebook](https://github.com/Maplemx/Agently/blob/main/docs/guidebook) [Apply Developers WeChat Group](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan QR Code to Apply. image ================================================ FILE: v3/prompts/create_outline.yaml ================================================ input: ${topic} instruct: task: prepare news collection outline according {input}'s topic output language: ${language} output: report_title: $type: str $desc: generate a title for this news collection like "daily news about sports", "today's news about finance" column_list: $type: - column_title: $type: str column_requirement: $type: str $desc: describe recheck standard about the contents in this column to make sure all contents are aimed at the requirement of {input}'s topic search_keywords: $type: str $desc: search keywords for this column splited by space. make sure the filed keyword about {input} is included in keywords. 
$desc: the number of columns <= ${max_column_num} ================================================ FILE: v3/prompts/pick_news.yaml ================================================ input: ${column_news} instruct: news select rules: - ${column_requirement} - if several news are similar, just retain the one with most famous source and output {can_use} as false for others output: - id: $type: int $desc: value from {input.[].id} can_use: $type: bool $desc: judge if {input.brief} can be used according {instruct} recommend_comment: $type: str $desc: provide your recommend comment if {can_use} == true, or just output null ================================================ FILE: v3/prompts/summarize.yaml ================================================ input: ${news_content} info: column requirement: ${column_requirement} news title: ${news_title} instruct: output language: ${language} summary rule: - find and summarize the main content part of the news content which is collected from webpage - summary focus on relative content to {column requirement} and {news title} - summary in one paragraph without linebreak output: can_summarize: $type: bool $desc: judge if {input} has enough relative content to be summarized translated_title: $type: str $desc: translate {input.news title} into ${language} summary: $type: str $desc: summarize {input} according {info} and {instruct} if {can_summarize} == true, or output null ================================================ FILE: v3/prompts/write_column.yaml ================================================ input: ${slimmed_news} info: column requirement: ${column_requirement} instruct: news select rules: - if there're serveral similar content news, only select one of them into {news_list} - all news selected must follow or be relative to {column requirement} output language: ${language} output: news_list: $type: - id: $type: int $desc: value from {input.[].id} recommend_comment: $type: str $desc: provide your recommend comment of this news 
according your role and {column requirement} $desc: select news into column list according {news select rules} from {input} prologue: $type: str $desc: write a prologue for readers according {news_list} and {news select rules}, you can use [](news url) to mark key information ================================================ FILE: v3/requirements.txt ================================================ Agently>=3.2.2.8 PyYAML==6.0.1 duckduckgo_search>=5.3.0 beautifulsoup4>=4.12.3 ================================================ FILE: v3/utils/__init__.py ================================================ ================================================ FILE: v3/utils/logger.py ================================================ import os import logging logging.getLogger().setLevel(logging.NOTSET) class Logger(object): def __init__(self, **kwargs): name = kwargs.get("name", "Agently-Daily-News-Collector") log_level = kwargs.get("log_level", "ERROR") console_level = kwargs.get("console_level", "INFO") log_format = kwargs.get("format", "%(asctime)s\t[%(levelname)s]\t%(message)s") log_path = kwargs.get("path", "./logs/Agently_daily_news_collector.log") handlers = kwargs.get("handlers", []) self.logger = logging.getLogger(name) if self.logger.hasHandlers(): self.logger.handlers.clear() stream_handler = logging.StreamHandler() stream_handler.setLevel(getattr(logging, console_level)) stream_handler.setFormatter(logging.Formatter(log_format)) self.logger.addHandler(stream_handler) file_handler = logging.FileHandler(log_path) file_handler.setLevel(getattr(logging, log_level)) file_handler.setFormatter(logging.Formatter(log_format)) self.logger.addHandler(file_handler) for handler in handlers: self.logger.addHandler(handler) def __transform(self, *args, **kwargs): message = "" for arg in args: message += f"{ arg }\t" message = message[:-1] kwargs_to_list = [] kwargs_message = "" for key, value in kwargs.items(): kwargs_to_list.append(f"{ key }: { str(value) }") kwargs_message += 
"\t".join(kwargs_to_list) if kwargs_message != "": message += f"\t{ kwargs_message }" return message def debug(self, *args, **kwargs): return self.logger.debug(self.__transform(*args, **kwargs)) def info(self, *args, **kwargs): return self.logger.info(self.__transform(*args, **kwargs)) def warning(self, *args, **kwargs): return self.logger.warning(self.__transform(*args, **kwargs)) def error(self, *args, **kwargs): return self.logger.error(self.__transform(*args, **kwargs)) def critical(self, *args, **kwargs): return self.logger.critical(self.__transform(*args, **kwargs)) logger = Logger() ================================================ FILE: v3/utils/path.py ================================================ import os root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ================================================ FILE: v3/utils/yaml_reader.py ================================================ import yaml from types import SimpleNamespace class YAMLResult(SimpleNamespace): pass def read(yaml_path:str): try: with open(yaml_path, "r") as yaml_file: yaml_dict = yaml.safe_load(yaml_file) return YAMLResult(**yaml_dict) except Exception as e: raise Exception(f"[YAML Reader] Error occured when read YAML from path '{ yaml_path }'.\nError: { str(e) }") ================================================ FILE: v3/workflows/__init__.py ================================================ ================================================ FILE: v3/workflows/column_workflow.py ================================================ import time import Agently from .tools.search import search from .tools.browse import browse def start(column_outline, *, agent_factory, SETTINGS, root_path, logger): tool_proxy = ( SETTINGS.TOOL_PROXY if hasattr(SETTINGS, "TOOL_PROXY") else ( SETTINGS.PROXY if hasattr(SETTINGS, "PROXY") else None ) ) logger.info("[Start Generate Column]", column_outline["column_title"]) column_workflow = Agently.Workflow() column_editor_agent = 
agent_factory.create_agent() # You can set column editor agent here, read https://github.com/Maplemx/Agently/tree/main/docs/guidebook to explore """ ( column_editor_agent .set_role("...") .set_user_info("...") ) """ # Define Workflow Chunks @column_workflow.chunk("start", type="Start") @column_workflow.chunk("search") def search_executor(inputs, storage): storage.set( "searched_news", search( column_outline["search_keywords"], timelimit=SETTINGS.NEWS_TIME_LIMIT if hasattr(SETTINGS, "NEWS_TIME_LIMIT") else "d", proxy=tool_proxy, logger=logger, ) ) @column_workflow.chunk("pick_news") def pick_news_executor(inputs, storage): searched_news = storage.get("searched_news", []) logger.info("[Search News Count]", len(searched_news)) if len(searched_news) > 0: pick_results = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/pick_news.yaml", variables={ "column_news": searched_news, "column_requirement": column_outline["column_requirement"], } ) .start() ) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) picked_news = [] for pick_result in pick_results: if pick_result["can_use"]: news = searched_news[int(pick_result["id"])].copy() news.update({ "recommend_comment": pick_result["recommend_comment"] }) picked_news.append(news) storage.set("picked_news", picked_news) logger.info("[Picked News Count]", len(picked_news)) else: storage.set("picked_news", []) logger.info("[Picked News Count]", 0) @column_workflow.chunk("read_and_summarize") def read_and_summarize_executor(inputs, storage): picked_news = storage.get("picked_news", []) readed_news = [] if picked_news and len(picked_news) > 0: for news in picked_news: logger.info("[Summarzing]", news["title"]) news_content = browse( news["url"], proxy=tool_proxy, logger=logger, ) if news_content and news_content != "": try: summary_result = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/summarize.yaml", variables={ "news_content": news_content, "column_requirement": 
column_outline["column_requirement"], "news_title": news["title"], "language": SETTINGS.OUTPUT_LANGUAGE, } ) .start() ) if summary_result["can_summarize"]: readed_news_info = news.copy() readed_news_info.update({ "title": summary_result["translated_title"], "summary": summary_result["summary"] }) readed_news.append(readed_news_info) logger.info("[Summarzing]", "Success") else: logger.info("[Summarzing]", "Failed") # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) except Exception as e: logger.error(f"[Summarzie]: Can not summarize '{ news['title'] }'.\tError: { str(e) }") storage.set("readed_news", readed_news) @column_workflow.chunk("write_column") def write_column_executor(inputs, storage): readed_news = storage.get("readed_news", []) if readed_news and len(readed_news) > 0: slimmed_news = [] for index, news in enumerate(readed_news): slimmed_news.append({ "id": index, "title": news["title"], "summary": news["summary"], "url": news["url"], }) column_result = ( column_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/write_column.yaml", variables={ "slimmed_news": slimmed_news, "column_requirement": column_outline["column_requirement"], "language": SETTINGS.OUTPUT_LANGUAGE, } ) .start() ) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) final_news_list = [] for news in column_result["news_list"]: id = news["id"] final_news_list.append({ "url": readed_news[id]["url"], "title": readed_news[id]["title"], "summary": readed_news[id]["summary"], "recommend_comment": news["recommend_comment"], }) storage.set("final_result", { "title": column_outline["column_title"], "prologue": column_result["prologue"], "news_list": final_news_list, }) else: storage.set("final_result", None) # Connect Chunks ( column_workflow.chunks["start"] .connect_to(column_workflow.chunks["search"]) .connect_to(column_workflow.chunks["pick_news"]) .connect_to(column_workflow.chunks["read_and_summarize"]) 
.connect_to(column_workflow.chunks["write_column"]) ) # Start Workflow column_workflow.start() return column_workflow.executor.store.get("final_result") ================================================ FILE: v3/workflows/main_workflow.py ================================================ import time import Agently from datetime import datetime from .column_workflow import start as start_column_workflow def start(*, agent_factory, SETTINGS, root_path, logger): main_workflow = Agently.Workflow() chief_editor_agent = agent_factory.create_agent() # You can set chief editor agent here, read https://github.com/Maplemx/Agently/tree/main/docs/guidebook to explore """ ( chief_editor_agent .set_role("...") .set_user_info("...") ) """ # Define Workflow Chunks @main_workflow.chunk("start", type="Start") @main_workflow.chunk("input_topic") def input_topic_executor(inputs, storage): if not SETTINGS.USE_CUSTOMIZE_OUTLINE: storage.set( "topic", input("[Please input the topic of your news collection]: ") ) @main_workflow.chunk("generate_outline") def generate_outline_executor(inputs, storage): if SETTINGS.USE_CUSTOMIZE_OUTLINE: storage.set("outline", SETTINGS.CUSTOMIZE_OUTLINE) logger.info("[Use Customize Outline]", SETTINGS.CUSTOMIZE_OUTLINE) else: # Load prompt from /prompts/create_outline.yaml outline = ( chief_editor_agent .load_yaml_prompt( path=f"{ root_path }/prompts/create_outline.yaml", variables={ "topic": storage.get("topic"), "news_time_limit": SETTINGS.NEWS_TIME_LIMIT if hasattr(SETTINGS, "NEWS_TIME_LIMIT") else "d", "language": SETTINGS.OUTPUT_LANGUAGE, "max_column_num": SETTINGS.MAX_COLUMN_NUM, } ) .start() ) storage.set("outline", outline) logger.info("[Outline Generated]", outline) # sleep to avoid requesting too often time.sleep(SETTINGS.SLEEP_TIME) @main_workflow.chunk("generate_columns") def generate_columns_executor(inputs, storage): columns_data = [] outline = storage.get("outline") for column_outline in outline["column_list"]: column_data = 
start_column_workflow( column_outline=column_outline, agent_factory=agent_factory, SETTINGS=SETTINGS, root_path=root_path, logger=logger, ) if column_data: columns_data.append(column_data) logger.info("[Column Data Prepared]", column_data) storage.set("columns_data", columns_data) @main_workflow.chunk("generate_markdown") def generate_markdown_executor(inputs, storage): outline = storage.get("outline") columns_data = storage.get("columns_data") if columns_data and len(columns_data) > 0: # Main Title md_doc_text = f'# { outline["report_title"] }\n\n' md_doc_text += f'> { datetime.now().strftime("%Y-%m-%d %A") }\n\n' # Columns if SETTINGS.IS_DEBUG: logger.debug("[Columns Data]", columns_data) for column_data in columns_data: md_doc_text += f'## { column_data["title"] }\n\n### PROLOGUE\n\n' md_doc_text += f'> { column_data["prologue"] }\n\n' md_doc_text += f"### NEWS LIST\n\n" for single_news in column_data["news_list"]: md_doc_text += f'- [{ single_news["title"] }]({ single_news["url"] })\n\n' md_doc_text += f' - `[summray]` { single_news["summary"] }\n' md_doc_text += f' - `[comment]` { single_news["recommend_comment"] }\n\n' # Tailer md_doc_text +="\n\n---\n\nPowered by [Agently AI Application Development Framework & Agently Workflow](https://github.com/Maplemx/Agently)\n\n" md_doc_text += f"Model Information:{ SETTINGS.MODEL_PROVIDER if hasattr(SETTINGS, 'MODEL_PROVIDER') else 'OpenAI' } - { str(SETTINGS.MODEL_OPTIONS) if hasattr(SETTINGS, 'MODEL_OPTIONS') else 'Default Options' }\n\n" md_doc_text += '**_Agently_** [Guidebook](https://github.com/Maplemx/Agently/blob/main/docs/guidebook)\n\n[Apply Developers WeChat Group](https://doc.weixin.qq.com/forms/AIoA8gcHAFMAScAhgZQABIlW6tV3l7QQf) or Scan QR Code to Apply.\n\nimage' logger.info("[Markdown Generated]", md_doc_text) with open(f'{ root_path }/{ outline["report_title"] }_{ datetime.now().strftime("%Y-%m-%d") }.md', 'w', encoding='utf-8') as f: f.write(md_doc_text) else: logger.info("[Markdown Generation Failed] 
def browse(url, *, logger=None, proxy=None):
    """Fetch *url* and extract readable text (headings rendered as markdown).

    Fix: ``requests.get`` previously had no timeout, so a single dead host
    could hang the whole pipeline indefinitely.

    Returns "" on any failure (logged when a logger is provided).
    """
    content = ""
    try:
        request_options = {
            "headers": {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            },
            # Bound both connect and read so a stalled server cannot block us.
            "timeout": 30,
        }
        if proxy:
            # NOTE(review): only the matching scheme is proxied — requests to
            # the other scheme go direct. Confirm this is intentional.
            if proxy.startswith("http:"):
                request_options.update({"proxies": {"http": proxy}})
            elif proxy.startswith("https:"):
                request_options.update({"proxies": {"https": proxy}})
        page = requests.get(
            url,
            **request_options
        )
        soup = BeautifulSoup(page.content, "html.parser")
        # find text in headings, p, pre (github code), td
        chunks = soup.find_all(["h1", "h2", "h3", "h4", "h5", "p", "pre", "td"])
        for chunk in chunks:
            if chunk.name.startswith("h"):
                content += "#" * int(chunk.name[-1]) + " " + chunk.get_text() + "\n"
            else:
                text = chunk.get_text()
                if text and text != "":
                    content += text + "\n"
        # find text in div that class=content
        divs = soup.find("div", class_="content")
        if divs:
            chunks_with_text = divs.find_all(text=True)
            for chunk in chunks_with_text:
                if isinstance(chunk, str) and chunk.strip():
                    content += chunk.strip() + "\n"
        content = re.sub(r"\n+", "\n", content)
        return content
    except Exception as e:
        if logger:
            logger.error(f"[Browse]: Can not browse '{ url }'.\tError: { str(e) }")
        return ""
def search(keywords, **kwargs):
    """Run a DuckDuckGo news search and return normalized result dicts.

    Optional keyword arguments: ``proxy``, ``max_results`` (default 8),
    ``timelimit`` (default "d"), ``logger``. Returns [] on any failure,
    logging the error when a logger was supplied.
    """
    collected = []
    try:
        with DDGS(proxy=kwargs.get("proxy", None)) as ddgs:
            raw_items = ddgs.news(
                keywords,
                max_results=kwargs.get("max_results", 8),
                timelimit=kwargs.get("timelimit", "d"),
            )
            for index, item in enumerate(raw_items):
                collected.append({
                    "id": index,
                    "title": item["title"],
                    "brief": item["body"],
                    "url": item["url"],
                    "source": item["source"],
                    "date": item["date"],
                })
        return collected
    except Exception as e:
        if "logger" in kwargs:
            kwargs["logger"].error(f"[Search]: Can not search '{ keywords }'.\tError: { str(e) }")
        return []
def create_pick_column_news_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that filters searched news down to usable picks."""

    async def pick_column_news(data: TriggerFlowRuntimeData) -> dict[str, Any] | None:
        context = _coerce_column_context(data.value)
        if context is None:
            return None
        outline = context["column_outline"]
        candidates = context["searched_news"]
        column_title = str(outline.get("column_title") or "").strip()
        logger = require_logger(data)
        try:
            selected = await pick_news(
                config,
                outline,
                candidates,
            )
        except Exception as exc:
            logger.exception("[Column Pick Failed] %s: %s", column_title, exc)
            return None
        logger.info("[Picked News Count] %s => %s", column_title, len(selected))
        # An empty pick aborts the column; downstream chunks expect real picks.
        if not selected:
            return None
        return {**context, "picked_news": selected}

    return pick_column_news


def create_write_column_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that turns summarized news into the final column dict."""

    async def write_column(data: TriggerFlowRuntimeData) -> dict[str, Any] | None:
        context = _coerce_column_context(
            data.value,
            require_picked=True,
            require_summarized=True,
        )
        if context is None:
            return None
        outline = context["column_outline"]
        column_title = str(outline.get("column_title") or "").strip()
        logger = require_logger(data)
        try:
            column_result = await _write_column(
                config,
                outline,
                context["summarized_news"],
            )
        except Exception as exc:
            logger.exception("[Column Write Failed] %s: %s", column_title, exc)
            return None
        logger.info("[Column Ready] %s", column_title)
        return column_result

    return write_column
return request if isinstance(request, dict) else {} def _coerce_column_context( value: Any, *, require_picked: bool = False, require_summarized: bool = False, ) -> dict[str, Any] | None: if not isinstance(value, dict): return None column_outline = value.get("column_outline") searched_news = value.get("searched_news") if not isinstance(column_outline, dict) or not isinstance(searched_news, list): return None context: dict[str, Any] = { "column_outline": column_outline, "searched_news": searched_news, } picked_news = value.get("picked_news") if picked_news is not None: if not isinstance(picked_news, list): return None context["picked_news"] = picked_news elif require_picked: return None summarized_news = value.get("summarized_news") if summarized_news is not None: if not isinstance(summarized_news, list): return None context["summarized_news"] = summarized_news elif require_summarized: return None return context async def search_news( config: DailyNewsChunkConfig, logger, search_tool, column_outline: dict[str, Any], *, topic: str, ) -> list[dict[str, Any]]: query = str(column_outline.get("search_keywords") or "").strip() if not query: return [] queries = build_search_queries( search_keywords=query, topic=topic, ) normalized_results = [] seen_urls: set[str] = set() for candidate in queries: try: raw_results = await search_tool.search_news( query=candidate, timelimit=config.settings.search.timelimit, max_results=config.settings.search.max_results, ) except Exception as exc: logger.warning("[Search Failed] %s => %s", candidate, exc) continue added_count = 0 for raw in raw_results or []: if not isinstance(raw, dict): continue title = str(raw.get("title") or "").strip() url = str(raw.get("url") or raw.get("href") or "").strip() if not title or not url or url in seen_urls: continue seen_urls.add(url) normalized_results.append( { "id": len(normalized_results), "title": title, "brief": str(raw.get("body") or raw.get("snippet") or "").strip(), "url": url, "source": 
str(raw.get("source") or "").strip(), "date": str(raw.get("date") or "").strip(), } ) added_count += 1 if len(normalized_results) >= config.settings.search.max_results: break logger.info("[Search Attempt] %s => %s", candidate, added_count) if len(normalized_results) >= config.settings.search.max_results: break return normalized_results def build_search_queries( *, search_keywords: str, topic: str, ) -> list[str]: queries: list[str] = [] seen: set[str] = set() def add(query: str) -> None: normalized = re.sub(r"\s+", " ", query).strip() if not normalized or normalized in seen: return seen.add(normalized) queries.append(normalized) add(search_keywords) keyword_tokens = _extract_search_tokens(search_keywords) topic_tokens = _extract_search_tokens(topic) if keyword_tokens: add(" ".join(keyword_tokens)) non_year_keyword_tokens = [token for token in keyword_tokens if not re.fullmatch(r"\d{4}", token)] if non_year_keyword_tokens: add(" ".join(non_year_keyword_tokens)) if topic_tokens: add(" ".join(topic_tokens)) add(" ".join([*topic_tokens, "news"])) merged_tokens = _dedupe_tokens([*topic_tokens, *keyword_tokens]) if merged_tokens: add(" ".join(merged_tokens)) add(" ".join([*merged_tokens, "news"])) return queries def _extract_search_tokens(text: str) -> list[str]: tokens = re.findall(r"[A-Za-z0-9][A-Za-z0-9._+-]*", text) return _dedupe_tokens(tokens)[:8] def _dedupe_tokens(tokens: list[str]) -> list[str]: result: list[str] = [] seen: set[str] = set() for token in tokens: normalized = token.strip() lower_token = normalized.lower() if not normalized or lower_token in seen: continue seen.add(lower_token) result.append(normalized) return result async def _write_column( config: DailyNewsChunkConfig, column_outline: dict[str, Any], summarized_news: list[dict[str, Any]], ) -> dict[str, Any]: slimmed_news = [] for index, news in enumerate(summarized_news): slimmed_news.append( { "id": index, "title": news["title"], "summary": news["summary"], "url": news["url"], "source": 
news.get("source", ""), "date": news.get("date", ""), "recommend_comment": news.get("recommend_comment", ""), } ) column_result = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "write_column.yaml", { "news_list": slimmed_news, "column_title": column_outline["column_title"], "column_requirement": column_outline["column_requirement"], "language": config.settings.workflow.output_language, }, ) .async_start( ensure_keys=[ "prologue", "news_list[*].id", "news_list[*].recommend_comment", ] ) ) if not isinstance(column_result, dict): return _build_fallback_column(config, column_outline, summarized_news) final_news_list = [] used_ids: set[int] = set() for item in column_result.get("news_list", []): if not isinstance(item, dict): continue news_id = safe_int(item.get("id"), -1) if news_id < 0 or news_id >= len(summarized_news) or news_id in used_ids: continue used_ids.add(news_id) final_item = copy.deepcopy(summarized_news[news_id]) refined_comment = str(item.get("recommend_comment") or "").strip() if refined_comment: final_item["recommend_comment"] = refined_comment final_news_list.append(final_item) if not final_news_list: final_news_list = summarized_news[: config.settings.workflow.max_news_per_column] prologue = str(column_result.get("prologue") or "").strip() if not prologue: prologue = _build_fallback_prologue(config, column_outline, final_news_list) return { "title": column_outline["column_title"], "prologue": prologue, "news_list": final_news_list, } def _build_fallback_column( config: DailyNewsChunkConfig, column_outline: dict[str, Any], summarized_news: list[dict[str, Any]], ) -> dict[str, Any]: return { "title": column_outline["column_title"], "prologue": _build_fallback_prologue(config, column_outline, summarized_news), "news_list": summarized_news[: config.settings.workflow.max_news_per_column], } def _build_fallback_prologue( config: DailyNewsChunkConfig, column_outline: dict[str, Any], news_list: list[dict[str, Any]], ) -> 
str: if not news_list: return str(column_outline.get("column_requirement") or "") if is_chinese_language(config.settings.workflow.output_language): lead_titles = ",".join(f"《{news['title']}》" for news in news_list[:3]) return f"本栏目围绕“{column_outline['column_title']}”整理了以下重点内容:{lead_titles}。" lead_titles = ", ".join(news["title"] for news in news_list[:3]) return f"This section highlights the most relevant stories for {column_outline['column_title']}: {lead_titles}." __all__ = [ "create_search_column_news_chunk", "create_pick_column_news_chunk", "create_write_column_chunk", ] ================================================ FILE: workflow/common.py ================================================ from __future__ import annotations import logging import re from dataclasses import dataclass from pathlib import Path from typing import Any, cast from agently import Agently, TriggerFlowRuntimeData from news_collector.config import AppSettings from tools.base import BrowseToolProtocol, SearchToolProtocol @dataclass(frozen=True, slots=True) class DailyNewsChunkConfig: settings: AppSettings prompt_dir: Path output_dir: Path model_label: str def create_editor_agent(*, kind: str): agent = Agently.create_agent(name=f"{kind}_editor") if kind == "chief": agent.set_agent_prompt( "system", "You are a veteran newsroom chief editor who designs reliable daily news briefings.", ) agent.set_agent_prompt( "instruct", [ "Prefer recent, factual, non-duplicated stories.", "Keep structures stable and concise.", ], ) else: agent.set_agent_prompt( "system", "You are a meticulous news editor who selects and rewrites high-signal stories.", ) agent.set_agent_prompt( "instruct", [ "Reject irrelevant or thin content.", "Keep comments practical and publication-ready.", ], ) return agent def is_chinese_language(language: str) -> bool: normalized = language.lower() return "chinese" in normalized or normalized.startswith("zh") def safe_filename(name: str) -> str: cleaned = re.sub(r"[\\/:*?\"<>|]+", 
def safe_int(value: Any, default: int) -> int:
    """Best-effort int() conversion; returns *default* when value is not numeric."""
    try:
        coerced = int(value)
    except (TypeError, ValueError):
        return default
    return coerced


def require_logger(data: TriggerFlowRuntimeData) -> logging.Logger:
    """Fetch the shared logger resource from the flow runtime."""
    return cast(logging.Logger, data.require_resource("logger"))


def require_search_tool(data: TriggerFlowRuntimeData) -> SearchToolProtocol:
    """Fetch the shared search tool resource from the flow runtime."""
    return cast(SearchToolProtocol, data.require_resource("search_tool"))


def require_browse_tool(data: TriggerFlowRuntimeData) -> BrowseToolProtocol:
    """Fetch the shared browse tool resource from the flow runtime."""
    return cast(BrowseToolProtocol, data.require_resource("browse_tool"))
def build_column_sub_flow(
    *,
    chunk_config: DailyNewsChunkConfig,
) -> TriggerFlow:
    """Assemble the per-column sub-flow: search -> pick -> summarize -> write."""
    flow = TriggerFlow(name="daily-news-column-sub-flow")
    summary_flow = build_summary_sub_flow(chunk_config=chunk_config)

    search_chunk = flow.chunk("search_column_news")(create_search_column_news_chunk(chunk_config))
    pick_chunk = flow.chunk("pick_column_news")(create_pick_column_news_chunk(chunk_config))
    write_chunk = flow.chunk("write_column")(create_write_column_chunk(chunk_config))

    (
        flow.to(search_chunk)
        .to(pick_chunk)
        # Summarization runs in a nested flow; it gets the column context as
        # input plus the logger/browse resources, and writes its result back.
        .to_sub_flow(
            summary_flow,
            capture={
                "input": "value",
                "resources": {
                    "logger": "resources.logger",
                    "browse_tool": "resources.browse_tool",
                },
            },
            write_back={
                "value": "result",
            },
        )
        .to(write_chunk)
        .end()
    )
    return flow
def create_prepare_request_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that normalizes the raw topic into a request record."""

    async def prepare_request(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        topic = str(data.value).strip()
        timestamp = datetime.now()
        request: dict[str, Any] = {
            "topic": topic,
            "today": timestamp.strftime("%Y-%m-%d"),
            "generated_at": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "language": config.settings.workflow.output_language,
        }
        # Stash the request in flow state so downstream chunks can read it.
        data.state.set("request", request)
        require_logger(data).info("[Topic] %s", topic)
        return request

    return prepare_request
def create_render_report_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that renders all column results into a Markdown file."""

    async def render_report(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        request = data.state.get("request") or {}
        outline = data.state.get("outline") or {}
        # Keep only well-formed column payloads.
        column_payloads = [item for item in data.value if isinstance(item, dict)]
        report_title = str(
            outline.get("report_title")
            or f"Daily News about {request.get('topic', 'the topic')}"
        )
        markdown = render_markdown(
            report_title=report_title,
            generated_at=str(request.get("generated_at") or ""),
            topic=str(request.get("topic") or ""),
            language=config.settings.workflow.output_language,
            columns=column_payloads,
            model_label=config.model_label,
        )
        output_path = _write_markdown(
            config=config,
            report_title=report_title,
            report_date=str(request.get("today") or ""),
            markdown=markdown,
        )
        require_logger(data).info("[Markdown Saved] %s", output_path)
        return {
            "report_title": report_title,
            "output_path": str(output_path),
            "markdown": markdown,
            "columns": column_payloads,
        }

    return render_report


async def _generate_outline(
    config: DailyNewsChunkConfig,
    request: dict[str, Any],
) -> dict[str, Any]:
    """Ask the chief-editor agent for a report outline and validate its shape."""
    prepared_prompt = create_editor_agent(kind="chief").load_yaml_prompt(
        config.prompt_dir / "create_outline.yaml",
        {
            "topic": request["topic"],
            "today": request["today"],
            "language": config.settings.workflow.output_language,
            "max_column_num": config.settings.workflow.max_column_num,
        },
    )
    outline = await prepared_prompt.async_start(
        ensure_keys=[
            "report_title",
            "column_list[*].column_title",
            "column_list[*].column_requirement",
            "column_list[*].search_keywords",
        ]
    )
    if not isinstance(outline, dict):
        raise TypeError(f"Invalid outline result: {outline}")
    column_list = outline.get("column_list", [])
    if not isinstance(column_list, list):
        raise TypeError("Outline column_list must be a list.")
    # Enforce the configured maximum number of columns.
    outline["column_list"] = column_list[: config.settings.workflow.max_column_num]
    return outline
_get_customized_outline(config: DailyNewsChunkConfig) -> dict[str, Any]: outline = copy.deepcopy(config.settings.outline.customized) column_list = outline.get("column_list", []) if not isinstance(column_list, list) or not column_list: raise ValueError("Customized outline must provide a non-empty column_list.") outline["column_list"] = column_list[: config.settings.workflow.max_column_num] outline.setdefault("report_title", "Daily News Briefing") return outline def _write_markdown( *, config: DailyNewsChunkConfig, report_title: str, report_date: str, markdown: str, ) -> Path: config.output_dir.mkdir(parents=True, exist_ok=True) file_name = f"{safe_filename(report_title)}_{report_date or datetime.now().strftime('%Y-%m-%d')}.md" output_path = config.output_dir / file_name output_path.write_text(markdown, encoding="utf-8") return output_path __all__ = [ "create_prepare_request_chunk", "create_generate_outline_chunk", "create_render_report_chunk", ] ================================================ FILE: workflow/summary_chunks.py ================================================ from __future__ import annotations import copy from typing import Any, Callable from agently import TriggerFlowRuntimeData from .common import ( DailyNewsChunkConfig, create_editor_agent, is_chinese_language, require_browse_tool, require_logger, safe_int, ) def create_prepare_summary_candidates_chunk( config: DailyNewsChunkConfig, ) -> Callable[[TriggerFlowRuntimeData], Any]: async def prepare_summary_candidates(data: TriggerFlowRuntimeData): context = _coerce_summary_context(data.value) if context is None: data.state.set("summary_context", None, emit=False) data.state.set("summary_candidates", [], emit=False) data.state.set("summary_cursor", 0, emit=False) data.state.set("summary_results", [], emit=False) data.state.set("summary_target_count", 0, emit=False) await data.async_emit("Summary.Done", None) return candidates = build_summary_candidates( config, context["column_outline"], 
def create_dispatch_summary_batch_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that slices the next batch of summary candidates."""

    async def dispatch_summary_batch(data: TriggerFlowRuntimeData) -> list[dict[str, Any]]:
        candidates = data.state.get("summary_candidates") or []
        cursor = safe_int(data.state.get("summary_cursor"), 0)
        target_count = safe_int(data.state.get("summary_target_count"), 0)
        summary_results = data.state.get("summary_results") or []
        if not isinstance(candidates, list) or not isinstance(summary_results, list):
            raise RuntimeError("Invalid summary flow state.")
        # Batch size is bounded by the concurrency limit, the remaining
        # quota of needed summaries, and the candidates actually left.
        still_needed = target_count - len(summary_results)
        batch_size = min(
            max(config.settings.workflow.summary_concurrency, 1),
            max(still_needed, 0),
            len(candidates) - cursor,
        )
        if batch_size <= 0:
            raise RuntimeError("Summary dispatch received no work. Summary.Done should have been emitted first.")
        batch = candidates[cursor : cursor + batch_size]
        data.state.set("summary_cursor", cursor + batch_size, emit=False)
        return batch

    return dispatch_summary_batch


def create_summarize_candidate_chunk(
    config: DailyNewsChunkConfig,
) -> Callable[[TriggerFlowRuntimeData], Any]:
    """Build the chunk that browses and summarizes one candidate news item."""

    async def summarize_candidate(data: TriggerFlowRuntimeData) -> dict[str, Any]:
        candidate = data.value if isinstance(data.value, dict) else {}
        is_backup = bool(candidate.get("is_backup"))
        news = candidate.get("news")
        if not isinstance(news, dict):
            # Malformed candidate: report an empty result for the merge step to skip.
            return {
                "news": {},
                "is_backup": is_backup,
                "summarized": None,
            }
        logger = require_logger(data)
        outline = _get_summary_column_outline(data)
        summarized = await summarize_single_news(
            config,
            logger,
            require_browse_tool(data),
            outline,
            news,
        )
        return {
            "news": copy.deepcopy(news),
            "is_backup": is_backup,
            "summarized": summarized,
        }

    return summarize_candidate
logger.info("[Backup News Activated] %s", title) data.state.set("summary_results", summary_results, emit=False) if len(summary_results) >= target_count or cursor >= len(candidates): await data.async_emit("Summary.Done", None) else: await data.async_emit("Summary.Dispatch", None) return merge_summary_batch def create_finalize_summary_chunk( config: DailyNewsChunkConfig, ) -> Callable[[TriggerFlowRuntimeData], Any]: async def finalize_summary(data: TriggerFlowRuntimeData) -> dict[str, Any]: context = data.state.get("summary_context") if not isinstance(context, dict): return { "column_outline": {}, "searched_news": [], "picked_news": [], "summarized_news": [], } result = copy.deepcopy(context) summarized_news = data.state.get("summary_results") or [] result["summarized_news"] = summarized_news if isinstance(summarized_news, list) else [] logger = require_logger(data) title = str(result.get("column_outline", {}).get("column_title") or "").strip() logger.info("[Summarized News Count] %s => %s", title, len(result["summarized_news"])) return result return finalize_summary def _coerce_summary_context(value: Any) -> dict[str, Any] | None: if not isinstance(value, dict): return None column_outline = value.get("column_outline") searched_news = value.get("searched_news") picked_news = value.get("picked_news") if not isinstance(column_outline, dict) or not isinstance(searched_news, list) or not isinstance(picked_news, list): return None return { "column_outline": copy.deepcopy(column_outline), "searched_news": copy.deepcopy(searched_news), "picked_news": copy.deepcopy(picked_news), } def _get_summary_column_outline(data: TriggerFlowRuntimeData) -> dict[str, Any]: context = data.state.get("summary_context") if isinstance(context, dict) and isinstance(context.get("column_outline"), dict): return context["column_outline"] return {} def build_summary_candidates( config: DailyNewsChunkConfig, column_outline: dict[str, Any], searched_news: list[dict[str, Any]], picked_news: 
list[dict[str, Any]], ) -> list[dict[str, Any]]: candidates: list[dict[str, Any]] = [] picked_urls = { str(news.get("url") or "").strip() for news in picked_news if str(news.get("url") or "").strip() } seen_urls: set[str] = set() for news in picked_news: url = str(news.get("url") or "").strip() if not url or url in seen_urls: continue seen_urls.add(url) candidates.append( { "news": copy.deepcopy(news), "is_backup": False, } ) for news in searched_news: url = str(news.get("url") or "").strip() if not url or url in seen_urls or url in picked_urls: continue seen_urls.add(url) backup_news = copy.deepcopy(news) if not str(backup_news.get("recommend_comment") or "").strip(): backup_news["recommend_comment"] = build_backup_recommend_comment( config, column_outline, backup_news, ) candidates.append( { "news": backup_news, "is_backup": True, } ) return candidates async def pick_news( config: DailyNewsChunkConfig, column_outline: dict[str, Any], searched_news: list[dict[str, Any]], ) -> list[dict[str, Any]]: pick_results = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "pick_news.yaml", { "column_news": searched_news, "column_title": column_outline["column_title"], "column_requirement": column_outline["column_requirement"], "max_news_per_column": config.settings.workflow.max_news_per_column, }, ) .async_start( ensure_keys=[ "[*].id", "[*].can_use", "[*].relevance_score", "[*].recommend_comment", ] ) ) if not isinstance(pick_results, list): return [] picked_news = [] seen_ids: set[int] = set() sorted_results = sorted( [item for item in pick_results if isinstance(item, dict)], key=lambda item: safe_int(item.get("relevance_score"), 0), reverse=True, ) for item in sorted_results: if item.get("can_use") is not True: continue news_id = safe_int(item.get("id"), -1) if news_id < 0 or news_id >= len(searched_news) or news_id in seen_ids: continue seen_ids.add(news_id) picked_item = copy.deepcopy(searched_news[news_id]) 
picked_item["recommend_comment"] = str(item.get("recommend_comment") or "").strip() picked_item["relevance_score"] = safe_int(item.get("relevance_score"), 0) picked_news.append(picked_item) if len(picked_news) >= config.settings.workflow.max_news_per_column: break return picked_news async def summarize_single_news( config: DailyNewsChunkConfig, logger, browse_tool, column_outline: dict[str, Any], news: dict[str, Any], ) -> dict[str, Any] | None: logger.info("[Summarizing] %s", news["title"]) content = await browse_tool.browse(news["url"]) content = str(content or "").strip() if len(content) < config.settings.browse.min_content_length: logger.info("[Summarizing] Failed - content too short") return None if is_invalid_browse_content(content): logger.info("[Summarizing] Failed - invalid browsed content") return None summary_result = await ( create_editor_agent(kind="column") .load_yaml_prompt( config.prompt_dir / "summarize_news.yaml", { "news_content": content, "news_title": news["title"], "column_requirement": column_outline["column_requirement"], "language": config.settings.workflow.output_language, }, ) .async_start( ensure_keys=[ "can_summarize", "summary", ] ) ) if not isinstance(summary_result, dict): logger.info("[Summarizing] Failed - invalid summary output") return None if summary_result.get("can_summarize") is not True: logger.info("[Summarizing] Failed - model rejected content") return None summary = str(summary_result.get("summary") or "").strip() if not summary: logger.info("[Summarizing] Failed - empty summary") return None summarized_news = copy.deepcopy(news) summarized_news["summary"] = summary logger.info("[Summarizing] Success") return summarized_news def build_backup_recommend_comment( config: DailyNewsChunkConfig, column_outline: dict[str, Any], news: dict[str, Any], ) -> str: title = str(column_outline.get("column_title") or "this section") news_title = str(news.get("title") or "").strip() if 
is_chinese_language(config.settings.workflow.output_language): if news_title: return f"该报道与“{title}”存在明确关联,可作为备用候选:{news_title}。" return f"该报道与“{title}”存在明确关联,可作为备用候选。" if news_title: return f"This story is meaningfully related to {title} and is kept as a backup candidate: {news_title}." return f"This story is meaningfully related to {title} and is kept as a backup candidate." def is_invalid_browse_content(content: str) -> bool: normalized = content.strip() lowered = normalized.lower() invalid_markers = ( "can not browse '", "fallback failed:", "content_empty_or_too_short", "we've detected unusual activity", "not a robot", "captcha", "access denied", "subscribe now", ) return any(marker in lowered for marker in invalid_markers) __all__ = [ "create_prepare_summary_candidates_chunk", "create_dispatch_summary_batch_chunk", "create_summarize_candidate_chunk", "create_merge_summary_batch_chunk", "create_finalize_summary_chunk", "pick_news", ]