Repository: andeya/pholcus
Branch: master
Commit: 91a56081f6e8
Files: 391
Total size: 6.2 MB
Directory structure:
gitextract_kq_lvwwa/
├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── app/
│ ├── aid/
│ │ ├── history/
│ │ │ ├── failure.go
│ │ │ ├── failure_test.go
│ │ │ ├── history.go
│ │ │ ├── history_test.go
│ │ │ ├── success.go
│ │ │ └── success_test.go
│ │ └── proxy/
│ │ ├── host.go
│ │ ├── host_test.go
│ │ ├── proxy.go
│ │ └── proxy_test.go
│ ├── app.go
│ ├── app_test.go
│ ├── crawler/
│ │ ├── crawler.go
│ │ ├── crawler_test.go
│ │ ├── crawlerpool.go
│ │ ├── crawlerpool_test.go
│ │ ├── spiderqueue.go
│ │ └── spiderqueue_test.go
│ ├── distribute/
│ │ ├── integration_test.go
│ │ ├── interface.go
│ │ ├── master_api.go
│ │ ├── master_api_test.go
│ │ ├── slave_api.go
│ │ ├── slave_api_test.go
│ │ ├── task.go
│ │ ├── task_test.go
│ │ ├── taskjar.go
│ │ ├── taskjar_test.go
│ │ └── teleport/
│ │ ├── client.go
│ │ ├── conn.go
│ │ ├── conn_test.go
│ │ ├── debug.go
│ │ ├── netdata.go
│ │ ├── netdata_test.go
│ │ ├── protocol.go
│ │ ├── protocol_test.go
│ │ ├── return_func.go
│ │ ├── return_func_test.go
│ │ ├── server.go
│ │ ├── teleport.go
│ │ ├── teleport_test.go
│ │ ├── util.go
│ │ └── util_test.go
│ ├── downloader/
│ │ ├── downloader.go
│ │ ├── downloader_surfer.go
│ │ ├── downloader_test.go
│ │ ├── request/
│ │ │ ├── request.go
│ │ │ ├── request_test.go
│ │ │ └── temp.go
│ │ └── surfer/
│ │ ├── agent/
│ │ │ ├── agent.go
│ │ │ ├── agent_bsd.go
│ │ │ ├── agent_linux.go
│ │ │ ├── agent_linux_arm.go
│ │ │ ├── agent_test.go
│ │ │ └── agent_windows.go
│ │ ├── chrome.go
│ │ ├── chrome_stub.go
│ │ ├── chrome_test.go
│ │ ├── example/
│ │ │ └── example.go
│ │ ├── param.go
│ │ ├── param_test.go
│ │ ├── phantom.go
│ │ ├── phantom_stub.go
│ │ ├── request.go
│ │ ├── request_test.go
│ │ ├── surf.go
│ │ ├── surf_stub_test.go
│ │ ├── surf_test.go
│ │ ├── surfer.go
│ │ ├── util.go
│ │ └── util_test.go
│ ├── pipeline/
│ │ ├── collector/
│ │ │ ├── collector.go
│ │ │ ├── collector_test.go
│ │ │ ├── data/
│ │ │ │ ├── data.go
│ │ │ │ └── data_test.go
│ │ │ ├── output_beanstalkd.go
│ │ │ ├── output_beanstalkd_stub.go
│ │ │ ├── output_csv.go
│ │ │ ├── output_data.go
│ │ │ ├── output_data_test.go
│ │ │ ├── output_excel.go
│ │ │ ├── output_file.go
│ │ │ ├── output_kafka.go
│ │ │ ├── output_kafka_stub.go
│ │ │ ├── output_mgo.go
│ │ │ ├── output_mgo_stub.go
│ │ │ ├── output_mysql.go
│ │ │ ├── output_mysql_stub.go
│ │ │ ├── output_util.go
│ │ │ └── output_util_test.go
│ │ ├── output.go
│ │ ├── pipeline.go
│ │ └── pipeline_test.go
│ ├── scheduler/
│ │ ├── matrix.go
│ │ ├── scheduler.go
│ │ └── scheduler_test.go
│ └── spider/
│ ├── common/
│ │ ├── common.go
│ │ ├── common_test.go
│ │ ├── form.go
│ │ └── form_test.go
│ ├── context.go
│ ├── parsejs.go
│ ├── species.go
│ ├── species_test.go
│ ├── spider.go
│ ├── timer.go
│ └── timer_test.go
├── cmd/
│ ├── cmd_test.go
│ └── pholcus-cmd.go
├── common/
│ ├── beanstalkd/
│ │ ├── beanstalkd.go
│ │ └── beanstalkd_test.go
│ ├── bytes/
│ │ ├── bytes.go
│ │ └── bytes_test.go
│ ├── closer/
│ │ ├── closer.go
│ │ └── closer_test.go
│ ├── gc/
│ │ ├── gc.go
│ │ └── gc_test.go
│ ├── goquery/
│ │ ├── .gitattributes
│ │ ├── .gitignore
│ │ ├── .travis.yml
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── array.go
│ │ ├── array_test.go
│ │ ├── bench/
│ │ │ ├── v0.1.0
│ │ │ ├── v0.1.1
│ │ │ ├── v0.2.0
│ │ │ ├── v0.2.1-go1.1rc1
│ │ │ ├── v0.3.0
│ │ │ ├── v0.3.2-go1.2
│ │ │ ├── v0.3.2-go1.2-take2
│ │ │ ├── v0.3.2-go1.2rc1
│ │ │ ├── v1.0.0-go1.7
│ │ │ ├── v1.0.1a-go1.7
│ │ │ ├── v1.0.1b-go1.7
│ │ │ └── v1.0.1c-go1.7
│ │ ├── bench_array_test.go
│ │ ├── bench_example_test.go
│ │ ├── bench_expand_test.go
│ │ ├── bench_filter_test.go
│ │ ├── bench_iteration_test.go
│ │ ├── bench_property_test.go
│ │ ├── bench_query_test.go
│ │ ├── bench_traversal_test.go
│ │ ├── doc/
│ │ │ └── tips.md
│ │ ├── doc.go
│ │ ├── example_test.go
│ │ ├── expand.go
│ │ ├── expand_test.go
│ │ ├── filter.go
│ │ ├── filter_test.go
│ │ ├── iteration.go
│ │ ├── iteration_test.go
│ │ ├── manipulation.go
│ │ ├── manipulation_test.go
│ │ ├── misc/
│ │ │ └── git/
│ │ │ └── pre-commit
│ │ ├── property.go
│ │ ├── property_test.go
│ │ ├── query.go
│ │ ├── query_test.go
│ │ ├── testdata/
│ │ │ ├── gotesting.html
│ │ │ ├── gowiki.html
│ │ │ ├── metalreview.html
│ │ │ ├── page.html
│ │ │ ├── page2.html
│ │ │ └── page3.html
│ │ ├── traversal.go
│ │ ├── traversal_test.go
│ │ ├── type.go
│ │ ├── type_test.go
│ │ ├── utilities.go
│ │ └── utilities_test.go
│ ├── kafka/
│ │ ├── kafka.go
│ │ └── kafka_test.go
│ ├── mahonia/
│ │ ├── 8bit.go
│ │ ├── ASCII.go
│ │ ├── README.md
│ │ ├── big5-data.go
│ │ ├── big5.go
│ │ ├── charset.go
│ │ ├── convert_string.go
│ │ ├── cp51932.go
│ │ ├── entity.go
│ │ ├── entity_data.go
│ │ ├── euc-jp.go
│ │ ├── euc-kr-data.go
│ │ ├── euc-kr.go
│ │ ├── fallback.go
│ │ ├── gb18030-data.go
│ │ ├── gb18030.go
│ │ ├── gbk-data.go
│ │ ├── gbk.go
│ │ ├── iso2022jp.go
│ │ ├── jis0201-data.go
│ │ ├── jis0208-data.go
│ │ ├── jis0212-data.go
│ │ ├── kuten.go
│ │ ├── mahonia_test.go
│ │ ├── mahoniconv/
│ │ │ └── mahoniconv.go
│ │ ├── mbcs.go
│ │ ├── ms-jis-data.go
│ │ ├── reader.go
│ │ ├── shiftjis-data.go
│ │ ├── shiftjis.go
│ │ ├── tcvn3.go
│ │ ├── translate.go
│ │ ├── utf16.go
│ │ ├── utf8.go
│ │ └── writer.go
│ ├── mgo/
│ │ ├── count.go
│ │ ├── find.go
│ │ ├── insert.go
│ │ ├── list.go
│ │ ├── mgo.go
│ │ ├── mgo_test.go
│ │ ├── operator.go
│ │ ├── remove.go
│ │ ├── update.go
│ │ ├── update_all.go
│ │ └── upsert.go
│ ├── mysql/
│ │ ├── mysql.go
│ │ └── mysql_test.go
│ ├── ping/
│ │ ├── ping.go
│ │ └── ping_test.go
│ ├── pinyin/
│ │ ├── example_test.go
│ │ ├── initials_sort.go
│ │ ├── phonetic_symbol.go
│ │ ├── pinyin.go
│ │ ├── pinyin_dict.go
│ │ └── pinyin_test.go
│ ├── pool/
│ │ ├── pool.go
│ │ └── pool_test.go
│ ├── queue/
│ │ ├── queue.go
│ │ └── queue_test.go
│ ├── session/
│ │ ├── README.md
│ │ ├── sess_cookie.go
│ │ ├── sess_cookie_test.go
│ │ ├── sess_file.go
│ │ ├── sess_file_test.go
│ │ ├── sess_mem.go
│ │ ├── sess_mem_test.go
│ │ ├── sess_test.go
│ │ ├── sess_utils.go
│ │ ├── session.go
│ │ └── session_manager_test.go
│ ├── simplejson/
│ │ ├── simplejson.go
│ │ └── simplejson_test.go
│ ├── util/
│ │ ├── util.go
│ │ └── util_test.go
│ ├── websocket/
│ │ ├── client.go
│ │ ├── client_test.go
│ │ ├── hybi.go
│ │ ├── server.go
│ │ ├── server_test.go
│ │ ├── websocket.go
│ │ └── websocket_test.go
│ └── xlsx/
│ ├── cell.go
│ ├── col.go
│ ├── date.go
│ ├── doc.go
│ ├── file.go
│ ├── hsl.go
│ ├── lib.go
│ ├── reftable.go
│ ├── row.go
│ ├── sheet.go
│ ├── style.go
│ ├── templates.go
│ ├── theme.go
│ ├── write.go
│ ├── xlsx_test.go
│ ├── xmlContentTypes.go
│ ├── xmlSharedStrings.go
│ ├── xmlStyle.go
│ ├── xmlTheme.go
│ ├── xmlWorkbook.go
│ └── xmlWorksheet.go
├── config/
│ ├── config.go
│ ├── config_test.go
│ └── setting.go
├── doc/
│ └── GUI编译命令.txt
├── doc.go
├── exec/
│ ├── exec.go
│ ├── exec_darwin.go
│ ├── exec_freebsd.go
│ ├── exec_linux.go
│ ├── exec_test.go
│ └── exec_windows.go
├── go.mod
├── go.sum
├── go.work
├── go.work.sum
├── gui/
│ ├── client.go
│ ├── guimain.manifest
│ ├── logview.go
│ ├── model/
│ │ └── guispider.go
│ ├── offline.go
│ ├── pholcus-gui.go
│ ├── rsrc.syso
│ ├── runmode.go
│ ├── server.go
│ └── var.go
├── logs/
│ ├── logs/
│ │ ├── conn.go
│ │ ├── conn_test.go
│ │ ├── console.go
│ │ ├── console_test.go
│ │ ├── file.go
│ │ ├── file_test.go
│ │ ├── log.go
│ │ ├── log_test.go
│ │ ├── smtp.go
│ │ └── smtp_test.go
│ ├── logs.go
│ └── logs_test.go
├── runtime/
│ ├── cache/
│ │ ├── cache.go
│ │ └── cache_test.go
│ └── status/
│ ├── status.go
│ └── status_test.go
├── sample/
│ ├── dyn_rules/
│ │ ├── baidu_search.pholcus.html
│ │ └── baidu_search.pholcus.xml
│ ├── main.go
│ └── static_rules/
│ ├── IJGUC/
│ │ └── IJGUC.go
│ ├── README.md
│ ├── alibaba/
│ │ └── alibaba.go
│ ├── area_codes/
│ │ └── area_codes.go
│ ├── baidunews/
│ │ └── baidunews.go
│ ├── baidusearch/
│ │ └── baidusearch.go
│ ├── car_home/
│ │ └── car_home.go
│ ├── chinanews/
│ │ ├── chinanews.go
│ │ └── readme.md
│ ├── fang_resell_list/
│ │ ├── fang_resell_list.go
│ │ └── readme.md
│ ├── filetest/
│ │ └── filetest.go
│ ├── ganji_gongsi/
│ │ └── ganji_gongsi.go
│ ├── googlesearch/
│ │ └── googlesearch.go
│ ├── hollandandbarrett/
│ │ └── hollandandbarrett.go
│ ├── jdsearch/
│ │ └── jdsearch.go
│ ├── jiban/
│ │ └── jiban.go
│ ├── jingdong/
│ │ ├── README.md
│ │ └── jdSpider.go
│ ├── kaola/
│ │ └── kaola.go
│ ├── lewa/
│ │ └── lewa.go
│ ├── miyabaobei/
│ │ └── miyabaobei.go
│ ├── people/
│ │ └── people.go
│ ├── pholcus_rules.go
│ ├── qq_avatar/
│ │ ├── README.md
│ │ └── avatar.go
│ ├── shunfenghaitao/
│ │ └── shunfenghaitao.go
│ ├── taobao/
│ │ └── taobao.go
│ ├── taobaosearch/
│ │ └── taobaosearch.go
│ ├── wangyi/
│ │ └── wangyi.go
│ ├── weibo_fans/
│ │ └── weibo_fans.go
│ ├── wukongwenda/
│ │ ├── README.md
│ │ └── wukongwenda.go
│ ├── zhihu_bianji/
│ │ ├── README.md
│ │ └── zhihu_bianji.go
│ ├── zhihu_daily/
│ │ ├── README.md
│ │ └── zhihu_daily.go
│ ├── zolpc/
│ │ └── zolpc.go
│ ├── zolphone/
│ │ └── zolphone.go
│ └── zolslab/
│ └── zolslab.go
└── web/
├── embed.go
├── embed_test.go
├── http_controller.go
├── http_controller_test.go
├── logsocket_controller.go
├── logsocket_controller_test.go
├── pholcus-web.go
├── router.go
├── router_test.go
├── views/
│ ├── bootstrap/
│ │ ├── css/
│ │ │ ├── bootstrap-theme.css
│ │ │ └── bootstrap.css
│ │ └── js/
│ │ ├── bootstrap.js
│ │ └── npm.js
│ ├── css/
│ │ ├── pholcus.css
│ │ └── split.css
│ ├── index.html
│ ├── js/
│ │ ├── app.js
│ │ ├── jquery.githubRepoWidget2.js
│ │ └── tpl.js
│ ├── layer/
│ │ ├── extend/
│ │ │ └── layer.ext.js
│ │ ├── layer.js
│ │ └── skin/
│ │ ├── layer.css
│ │ └── layer.ext.css
│ └── splitjs/
│ └── split.js
├── websocket_controller.go
└── websocket_controller_test.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
*.* linguist-language=go
================================================
FILE: .gitignore
================================================
*.o
*.a
*.so
_obj
_test
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.exe~
*.test
*.prof
*.rar
*.zip
*.gz
*.psd
*.bmd
*.cfg
*.pptx
*.log
*.out
*.sublime-project
*.sublime-workspace
/openspec
.cursor
.DS_Store
sample/sample
sample/pholcus_pkg/cache
sample/pholcus_pkg/file_out
sample/pholcus_pkg/history
sample/pholcus_pkg/logs
sample/pholcus_pkg/text_out
pholcus_pkg/
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2015 HenryLee
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
Pholcus(幽灵蛛)
纯 Go 语言编写的分布式高并发爬虫框架
[Releases](https://github.com/andeya/pholcus/releases)
[Stargazers](https://github.com/andeya/pholcus/stargazers)
[Go Reference](https://pkg.go.dev/github.com/andeya/pholcus)
[Go Report Card](https://goreportcard.com/report/andeya/pholcus)
[License](https://github.com/andeya/pholcus/blob/master/LICENSE)
[Open Issues](https://github.com/andeya/pholcus/issues?q=is%3Aopen+is%3Aissue)
[Closed Issues](https://github.com/andeya/pholcus/issues?q=is%3Aissue+is%3Aclosed)
快速开始 •
核心特性 •
架构设计 •
操作界面 •
规则编写 •
FAQ
---
## 免责声明
> **本软件仅用于学术研究,使用者需遵守其所在地的相关法律法规,请勿用于非法用途!**
>
> 例如,中国大陆已多次曝出爬虫开发者涉诉与违规的 [新闻](https://github.com/HiddenStrawberry/Crawler_Illegal_Cases_In_China)。
>
> **郑重声明:因违法违规使用造成的一切后果,使用者自行承担!**
---
## 核心特性
**运行模式**
- 单机模式 — 开箱即用
- 服务端模式 — 分发任务
- 客户端模式 — 接收并执行任务
**操作界面**
- Web UI — 跨平台,浏览器操作
- GUI — Windows 原生界面
- Cmd — 命令行批量调度
**数据输出**
- MySQL / MongoDB
- Kafka / Beanstalkd
- CSV / Excel
- 原文件下载
**爬虫规则**
- 静态规则(Go)— 高性能,深度定制
- 动态规则(JS/XML)— 热加载,无需编译
- 30+ 内置示例规则
**更多亮点:**
- 三引擎下载器 [surfer](app/downloader/surfer):Surf(高并发 HTTP)/ PhantomJS / **Chrome**(Chromium 无头浏览器,自动执行 JS)
- 智能 Cookie 管理:固定 UserAgent 自动保存 cookie,或随机 UserAgent 禁用 cookie
- 模拟登录、自定义 Header、POST 表单提交
- 代理 IP 池,可按频率自动更换
- 随机停歇机制,模拟人工行为
- 采集量与并发协程数可控
- 请求自动去重 + 失败请求自动重试
- 成功记录持久化,支持断点续爬
- 分布式通信全双工 Socket 框架
---
## 架构设计
模块结构
项目架构
分布式架构
### 目录结构
```
pholcus/
├── app/ 核心逻辑
│ ├── crawler/ 爬虫引擎 & 并发池
│ ├── downloader/ 下载器(surfer)
│ ├── pipeline/ 数据管道 & 多种输出后端
│ ├── scheduler/ 请求调度器
│ ├── spider/ 爬虫规则引擎
│ ├── distribute/ 分布式 Master/Slave 通信
│ └── aid/ 辅助模块(历史记录、代理 IP)
├── config/ 配置管理
├── exec/ 启动入口 & 平台适配
├── cmd/ 命令行模式
├── gui/ GUI 模式(Windows)
├── web/ Web UI 模式
├── common/ 公共工具库(DB 驱动、编码、队列等)
├── logs/ 日志模块
├── runtime/ 运行时缓存 & 状态
└── sample/ 示例程序 & 30+ 爬虫规则
```
---
## 快速开始
### 环境要求
- Go 1.18+(推荐 1.22+)
### 获取源码
```bash
git clone https://github.com/andeya/pholcus.git
cd pholcus
```
### 编写入口
创建 `main.go`(或参考 `sample/main.go`):
```go
package main
import (
"github.com/andeya/pholcus/exec"
_ "github.com/andeya/pholcus/sample/static_rules" // 内置规则库
// _ "yourproject/rules" // 自定义规则库
)
func main() {
// 启动界面:web / gui / cmd
// 可通过 -a_ui 运行参数覆盖
exec.DefaultRun("web")
}
```
### 编译运行
```bash
# 编译(非 Windows 平台自动排除 GUI 包)
go build -o pholcus ./sample/
# 查看所有可选参数
./pholcus -h
```
Windows 下隐藏 cmd 窗口的编译方式:
```bash
go build -ldflags="-H=windowsgui -linkmode=internal" -o pholcus.exe ./sample/
```
### 命令行参数一览
```bash
./pholcus -h
```

---
## 操作界面
### Web UI
启动后访问 `http://localhost:2015`,在浏览器中即可完成蜘蛛选择、参数配置、任务启停等全部操作。

### GUI(仅 Windows)
原生桌面客户端,功能与 Web 版一致。

### Cmd 命令行
适用于服务器部署或 cron 定时任务场景。
```bash
pholcus -_ui=cmd -a_mode=0 -c_spider=3,8 -a_outtype=csv -a_thread=20 \
-a_batchcap=5000 -a_pause=300 -a_proxyminute=0 \
-a_keyins="" -a_limit=10 -a_success=true -a_failure=true
```
---
## 规则编写
Pholcus 支持 **静态规则(Go)** 和 **动态规则(JS/XML)** 两种方式。
### 静态规则(Go)
随软件一同编译,性能最优,适合重量级采集项目。在 `sample/static_rules/` 下新建 Go 文件即可:
```go
package rules
import (
"net/http"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/spider"
)
func init() {
mySpider.Register()
}
var mySpider = &spider.Spider{
Name: "示例爬虫",
Description: "示例爬虫 [Auto Page] [http://example.com]",
EnableCookie: true,
RuleTree: &spider.RuleTree{
Root: func(ctx *spider.Context) {
ctx.AddQueue(&request.Request{
URL: "http://example.com",
Rule: "首页",
})
},
Trunk: map[string]*spider.Rule{
"首页": {
ParseFunc: func(ctx *spider.Context) {
ctx.Output(map[int]interface{}{
0: ctx.GetText(),
})
},
},
},
},
}
```
> 更多示例见 [`sample/static_rules/`](sample/static_rules/),涵盖百度、京东、淘宝、知乎等 30+ 网站。
### 动态规则(JS/XML)
无需编译即可热加载,适合轻量级采集。将 `.pholcus.xml` 文件放入 `dyn_rules/` 目录:
```xml
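<Spider>
    <Name>百度搜索</Name>
    <Description>百度搜索 [Auto Page] [http://www.baidu.com]</Description>
    <Pausetime>300</Pausetime>
    <EnableLimit>false</EnableLimit>
    <EnableCookie>true</EnableCookie>
    <EnableKeyin>true</EnableKeyin>
    <NotDefaultField>false</NotDefaultField>
    <!-- Root / Rule 脚本部分从略,完整示例见 sample/dyn_rules/baidu_search.pholcus.xml -->
</Spider>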
```
> 同时兼容 `.pholcus.html` 旧格式。
### Chrome 引擎说明
Chrome 引擎依赖本机安装的 Chromium / Google Chrome 浏览器,通过 [chromedp](https://github.com/chromedp/chromedp) 驱动。
**适用场景:**
- 目标网站有 JS 渲染的内容(SPA / CSR 页面)
- 目标网站有安全验证(如百度安全验证)需要浏览器执行 JS 后自动跳转
- 需要模拟真实浏览器环境绕过反爬检测
**环境要求:**
- 本机需安装 Chrome / Chromium 浏览器
- macOS: `brew install --cask google-chrome` 或 `brew install chromium`
- Linux: `apt install chromium-browser` 或 `yum install chromium`
- Windows: 安装 Google Chrome 即可
**注意事项:**
- Chrome 引擎每次请求会启动独立的无头浏览器实例,资源消耗高于 Surf
- 建议仅在 Surf 引擎无法获取内容时使用 Chrome
- Chrome 引擎内置了反自动化检测(隐藏 `navigator.webdriver`、禁用自动化标志等)
---
## 配置说明
### 运行时目录
```
├── pholcus 可执行文件
├── dyn_rules/ 动态规则目录(可在 config.ini 中配置)
│ └── xxx.pholcus.xml 动态规则文件
└── pholcus_pkg/ 运行时文件目录
├── config.ini 配置文件
├── proxy.lib 代理 IP 列表
├── phantomjs PhantomJS 程序
├── text_out/ 文本输出目录
├── file_out/ 文件输出目录
├── logs/ 日志目录
├── history/ 历史记录目录
└── cache/ 临时缓存目录
```
### 代理 IP
在 `pholcus_pkg/proxy.lib` 文件中逐行写入代理地址:
```
http://183.141.168.95:3128
https://60.13.146.92:8088
http://59.59.4.22:8090
```
通过界面选择"代理 IP 更换频率"或命令行参数 `-a_proxyminute` 启用。
> **注意:** macOS 下使用代理 IP 功能需要 root 权限,否则无法通过 `ping` 检测可用代理。
---
## 内置爬虫规则
| 分类 | 规则名称 |
| -------- | --------------------------------------------------------- |
| 搜索引擎 | 百度搜索、百度新闻、谷歌搜索、京东搜索、淘宝搜索 |
| 电商平台 | 京东、淘宝、考拉海购、蜜芽宝贝、顺丰海淘、Holland&Barrett |
| 新闻资讯 | 中国新闻网、网易新闻、人民网 |
| 社交问答 | 知乎日报、知乎编辑推荐、悟空问答、微博粉丝 |
| 房产汽车 | 房天下二手房、汽车之家 |
| 数码科技 | ZOL 手机、ZOL 电脑、ZOL 平板、乐蛙 |
| 分类信息 | 赶集公司、全国区号 |
| 社交工具 | QQ 头像 |
| 学术期刊 | IJGUC |
| 其他 | 阿里巴巴、技版、文件下载测试 |
---
## 常见问题
**Q:请求队列中重复的 URL 会自动去重吗?**

默认自动去重。如需允许重复请求,设置 `Request.Reloadable = true`。

**Q:框架能否判断页面内容是否更新?**

框架不内置页面变更检测,但可在规则中自定义实现。

**Q:请求成功的判定标准是什么?**

以服务器是否返回响应流为准,而非 HTTP 状态码。即 404 页面也算"请求成功"。

**Q:请求失败后如何重试?**

每个 URL 尝试下载指定次数后,若仍失败则进入 defer 队列。当前任务正常结束后自动重试。再次失败则保存至失败历史记录。下次执行同一规则时,可选择继承历史失败记录进行自动重试。
---
## 参与贡献
欢迎提交 Issue 和 Pull Request!
1. Fork 本仓库
2. 创建特性分支:`git checkout -b feature/your-feature`
3. 提交更改:`git commit -m 'Add your feature'`
4. 推送分支:`git push origin feature/your-feature`
5. 提交 Pull Request
---
## 开源协议
本项目基于 [Apache License 2.0](LICENSE) 开源。
---
Created by andeya — 如果觉得有帮助,请给个 Star 支持!
================================================
FILE: app/aid/history/failure.go
================================================
package history
import (
"bytes"
"encoding/json"
"os"
"sync"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/common/mgo"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/common/pool"
"github.com/andeya/pholcus/config"
)
// Failure tracks failed requests for retry.
// The embedded RWMutex is locked by UpsertFailure, DeleteFailure and flush
// while they access list.
type Failure struct {
// tabName is the collection/table name used by the "mgo" and "mysql" branches of flush.
tabName string
// fileName is the path written by the default (file) branch of flush.
fileName string
// list holds the recorded failed requests, keyed by Request.Unique().
list map[string]*request.Request
// NOTE(review): inheritable is not read in this file; presumably it marks
// whether records are inherited from earlier runs — confirm against history.go.
inheritable bool
sync.RWMutex
}
// PullFailure hands over every failed request recorded so far and resets the
// internal list to a fresh, empty map.
//
// The swap is performed under the write lock, consistent with UpsertFailure,
// DeleteFailure and flush, so a concurrent writer can never observe the list
// half-way through the exchange. The returned map is no longer referenced by
// the Failure and belongs to the caller.
func (f *Failure) PullFailure() map[string]*request.Request {
	f.RWMutex.Lock()
	defer f.RWMutex.Unlock()

	pulled := f.list
	f.list = make(map[string]*request.Request)
	return pulled
}
// UpsertFailure stores req in the failure list unless an entry with the same
// unique key is already present. It reports true only when req was newly
// stored, and false when an existing entry was left untouched.
func (f *Failure) UpsertFailure(req *request.Request) bool {
	f.RWMutex.Lock()
	defer f.RWMutex.Unlock()

	fresh := f.list[req.Unique()] == nil
	if fresh {
		f.list[req.Unique()] = req
	}
	return fresh
}
// DeleteFailure removes the failure record stored under req's unique key.
// Deleting a key that is not present is a no-op.
//
// The unlock is deferred (as in UpsertFailure) so the mutex is released even
// if req.Unique() panics, e.g. on a nil request; otherwise a recovered panic
// would leave the Failure locked forever.
func (f *Failure) DeleteFailure(req *request.Request) {
	f.RWMutex.Lock()
	defer f.RWMutex.Unlock()

	delete(f.list, req.Unique())
}
// flush replaces the persisted failure records with the current contents of
// the in-memory list.
//
// provider selects the backend: "mgo" rewrites the collection f.tabName,
// "mysql" rewrites the table f.tabName, and any other value writes a JSON
// object (unique key -> serialized request) to f.fileName.
//
// On success the result carries the number of records that were in the list.
// Failures are raised through Unwrap; the deferred r.Catch() is presumably
// what turns those panics into an error result — confirm against the gust
// result package.
func (f *Failure) flush(provider string) (r result.Result[int]) {
	defer r.Catch()
	f.RWMutex.Lock()
	defer f.RWMutex.Unlock()
	fLen := len(f.list)

	switch provider {
	case "mgo":
		result.RetVoid(mgo.Error()).Unwrap()
		mgo.Call(func(src pool.Src) error {
			c := src.(*mgo.MgoSrc).DB(config.Conf().DBName).C(f.tabName)
			// Best effort: the drop error is deliberately ignored, since the
			// collection may simply not exist yet on a first run.
			c.DropCollection()
			if fLen == 0 {
				return nil
			}
			docs := make([]interface{}, 0, fLen)
			for key, req := range f.list {
				docs = append(docs, map[string]interface{}{"_id": key, "failure": req.Serialize().Unwrap()})
			}
			// Surface the insert error instead of reporting a successful flush
			// for records that were never written.
			return c.Insert(docs...)
		}).Unwrap()

	case "mysql":
		_, err := mysql.DB()
		result.RetVoid(err).Unwrap()
		table, ok := getWriteMysqlTable(f.tabName)
		if !ok {
			// First use of this table: build its definition, cache it, create it.
			table = mysql.New().Unwrap()
			table.SetTableName(f.tabName).CustomPrimaryKey(`id VARCHAR(255) NOT NULL PRIMARY KEY`).AddColumn(`failure MEDIUMTEXT`)
			setWriteMysqlTable(f.tabName, table)
			table.Create().Unwrap()
		} else {
			// Known table: clear the previous records before re-inserting.
			table.Truncate().Unwrap()
		}
		for key, req := range f.list {
			table.AutoInsert([]string{key, req.Serialize().Unwrap()})
			table.FlushInsert().Unwrap()
		}

	default:
		if fLen == 0 {
			// Nothing to persist: only drop the stale history file, if any.
			os.Remove(f.fileName)
			return result.Ok(0)
		}
		// Serialize everything before touching the file so that a failing
		// request cannot leave a truncated file or a leaked handle behind.
		docs := make(map[string]string, fLen)
		for key, req := range f.list {
			docs[key] = req.Serialize().Unwrap()
		}
		b, err := json.Marshal(docs)
		result.RetVoid(err).Unwrap()
		// json.Marshal escapes '&' as \u0026; restore the literal character.
		b = bytes.ReplaceAll(b, []byte(`\u0026`), []byte(`&`))
		// Remove first so the file is recreated from scratch; the error is
		// ignored because the file may not exist. WriteFile truncates anyway,
		// so stale bytes never survive even if the removal failed.
		os.Remove(f.fileName)
		result.RetVoid(os.WriteFile(f.fileName, b, 0777)).Unwrap()
	}
	return result.Ok(fLen)
}
================================================
FILE: app/aid/history/failure_test.go
================================================
package history
import (
"net/http"
"os"
"path/filepath"
"testing"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
)
// newTestRequest builds a GET request for spider "s" and rule "r" that
// targets url, and calls Prepare on it before returning.
func newTestRequest(url string) *request.Request {
	req := &request.Request{
		Spider: "s",
		URL:    url,
		Rule:   "r",
		Method: "GET",
		Header: make(http.Header),
	}
	req.Prepare()
	return req
}
// TestFailure_PullFailure checks that PullFailure returns the single stored
// entry and leaves the internal list empty afterwards.
func TestFailure_PullFailure(t *testing.T) {
	req := newTestRequest("http://a.com")
	pending := map[string]*request.Request{req.Unique(): req}
	f := &Failure{tabName: "t", fileName: "f", list: pending}

	pulled := f.PullFailure()
	if n := len(pulled); n != 1 {
		t.Errorf("PullFailure len = %v, want 1", n)
	}
	if len(f.list) > 0 {
		t.Error("PullFailure should clear list")
	}
}
// TestFailure_UpsertFailure checks that the first upsert of a request
// reports an insert and that repeating it with the same request does not.
func TestFailure_UpsertFailure(t *testing.T) {
	req := newTestRequest("http://a.com")
	f := &Failure{tabName: "t", fileName: "f", list: make(map[string]*request.Request)}

	// The same request is upserted once per entry: first call inserts,
	// second call finds the existing record.
	wants := []bool{true, false}
	for i, want := range wants {
		got := f.UpsertFailure(req)
		if got != want {
			t.Errorf("UpsertFailure #%d = %v, want %v", i, got, want)
		}
	}
}
// TestFailure_DeleteFailure checks that deleting the only stored request
// leaves the failure list empty.
func TestFailure_DeleteFailure(t *testing.T) {
	req := newTestRequest("http://a.com")
	stored := map[string]*request.Request{req.Unique(): req}
	f := &Failure{tabName: "t", fileName: "f", list: stored}

	f.DeleteFailure(req)

	if remaining := len(f.list); remaining != 0 {
		t.Error("DeleteFailure should remove from list")
	}
}
// TestFailure_Flush_File checks that flushing one record through the file
// provider succeeds, reports a count of 1 and creates the history file.
func TestFailure_Flush_File(t *testing.T) {
	root := t.TempDir()
	histDir := filepath.Join(root, config.WorkRoot, config.HistoryTag)
	if err := os.MkdirAll(histDir, 0777); err != nil {
		t.Fatalf("MkdirAll: %v", err)
	}
	// Run from inside the temp dir and restore the working directory on exit.
	prevWD, _ := os.Getwd()
	os.Chdir(root)
	defer os.Chdir(prevWD)

	const base = "history__n__test"
	path := filepath.Join(histDir, base)
	req := newTestRequest("http://b.com")
	f := &Failure{
		tabName:  util.FileNameReplace(base),
		fileName: path,
		list:     map[string]*request.Request{req.Unique(): req},
	}

	res := f.flush("file")
	if res.IsErr() {
		t.Fatalf("flush: %v", res.UnwrapErr())
	}
	if n := res.Unwrap(); n != 1 {
		t.Errorf("flush count = %v, want 1", n)
	}
	_, statErr := os.Stat(path)
	if statErr != nil {
		t.Errorf("flush file: %v", statErr)
	}
}
// TestFailure_Flush_FileEmpty checks that flushing an empty failure list with
// the "file" provider succeeds and reports a count of 0.
func TestFailure_Flush_FileEmpty(t *testing.T) {
	root := t.TempDir()
	historyDir := filepath.Join(root, config.WorkRoot, config.HistoryTag)
	if mkErr := os.MkdirAll(historyDir, 0777); mkErr != nil {
		t.Fatalf("MkdirAll: %v", mkErr)
	}
	prevWD, _ := os.Getwd()
	os.Chdir(root)
	defer func() { os.Chdir(prevWD) }()

	f := &Failure{
		tabName:  util.FileNameReplace("history__n__empty"),
		fileName: filepath.Join(historyDir, "history__n__empty"),
		list:     make(map[string]*request.Request),
	}

	res := f.flush("file")
	if res.IsErr() {
		t.Fatalf("flush empty: %v", res.UnwrapErr())
	}
	if n := res.Unwrap(); n != 0 {
		t.Errorf("flush count = %v, want 0", n)
	}
}
// TestFailure_Flush_FileOverwrite pre-populates the history file with a short
// placeholder and checks that, after a flush, the file holds at least 10 bytes.
func TestFailure_Flush_FileOverwrite(t *testing.T) {
	root := t.TempDir()
	historyDir := filepath.Join(root, config.WorkRoot, config.HistoryTag)
	if mkErr := os.MkdirAll(historyDir, 0777); mkErr != nil {
		t.Fatalf("MkdirAll: %v", mkErr)
	}
	prevWD, _ := os.Getwd()
	os.Chdir(root)
	defer func() { os.Chdir(prevWD) }()

	target := filepath.Join(historyDir, "history__n__overwrite")
	if wrErr := os.WriteFile(target, []byte("old"), 0644); wrErr != nil {
		t.Fatalf("WriteFile: %v", wrErr)
	}

	req := newTestRequest("http://c.com")
	f := &Failure{
		tabName:  util.FileNameReplace("history__n__overwrite"),
		fileName: target,
		list:     map[string]*request.Request{req.Unique(): req},
	}
	if res := f.flush("file"); res.IsErr() {
		t.Fatalf("flush: %v", res.UnwrapErr())
	}

	content, _ := os.ReadFile(target)
	if size := len(content); size < 10 {
		t.Errorf("flush should overwrite file, got %d bytes", size)
	}
}
================================================
FILE: app/aid/history/history.go
================================================
// Package history provides persistence and inheritance of success and failure request records.
package history
import (
"encoding/json"
"io"
"os"
"sync"
"gopkg.in/mgo.v2/bson"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/common/closer"
"github.com/andeya/pholcus/common/mgo"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/common/pool"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
)
type (
// HistoryStore is the interface returned by New. It groups the operations
// for reading, updating and flushing success and failure records.
HistoryStore interface {
ReadSuccess(provider string, inherit bool) result.VoidResult // Read success records
UpsertSuccess(string) bool // Upsert a success record
HasSuccess(string) bool // Check if a success record exists
DeleteSuccess(string) // Delete a success record
FlushSuccess(provider string) result.VoidResult // Flush success records to I/O without clearing cache
ReadFailure(provider string, inherit bool) result.VoidResult // Read failure records
PullFailure() map[string]*request.Request // Pull failure records and clear
UpsertFailure(*request.Request) bool // Upsert a failure record
DeleteFailure(*request.Request) // Delete a failure record
FlushFailure(provider string) result.VoidResult // Flush failure records to I/O without clearing cache
Empty() // Clear cache without output
}
// History stores success and failure records for crawl deduplication.
// It embeds *Success and *Failure, so their methods are promoted onto History.
History struct {
*Success
*Failure
// provider is the provider name passed to the most recent Read*/Flush* call.
provider string
// RWMutex guards provider and the wholesale cache reset done by Empty.
sync.RWMutex
}
)
// Name prefixes for history storage. New appends "__<name>" (and optionally
// "__<subName>") to each of them.
const (
// SuccessSuffix is the prefix of the success table/collection name.
SuccessSuffix = config.HistoryTag + "__y"
// FailureSuffix is the prefix of the failure table/collection name.
FailureSuffix = config.HistoryTag + "__n"
// SuccessFile is the prefix of the success file path under config.HistoryDir.
SuccessFile = config.HistoryDir + "/" + SuccessSuffix
// FailureFile is the prefix of the failure file path under config.HistoryDir.
FailureFile = config.HistoryDir + "/" + FailureSuffix
)
// New builds a HistoryStore whose table and file names are derived from the
// spider name and, when non-empty, the subName. Table names are passed through
// util.FileNameReplace; file names are used as-is.
func New(name string, subName string) HistoryStore {
	// Common tail shared by all four derived names.
	tail := "__" + name
	if subName != "" {
		tail += "__" + subName
	}

	success := &Success{
		tabName:  util.FileNameReplace(SuccessSuffix + tail),
		fileName: SuccessFile + tail,
		new:      make(map[string]bool),
		old:      make(map[string]bool),
	}
	failure := &Failure{
		tabName:  util.FileNameReplace(FailureSuffix + tail),
		fileName: FailureFile + tail,
		list:     make(map[string]*request.Request),
	}
	return &History{Success: success, Failure: failure}
}
// ReadSuccess loads previously persisted success records into the in-memory
// "old" set.
//
// provider selects the backend: "mgo", "mysql", or anything else for the local
// file. When inherit is false the cache is reset and nothing is read. When
// inherit is true and the previous call also inherited, the existing cache is
// kept untouched. Otherwise the cache is reset and reloaded from the backend.
//
// Backend problems are logged and never returned: the result is always Ok.
func (h *History) ReadSuccess(provider string, inherit bool) result.VoidResult {
	h.RWMutex.Lock()
	h.provider = provider
	h.RWMutex.Unlock()

	switch {
	case !inherit:
		// Not inheriting history
		h.Success.old = make(map[string]bool)
		h.Success.new = make(map[string]bool)
		h.Success.inheritable = false
		return result.OkVoid()
	case h.Success.inheritable:
		// Both current and previous runs inherit history
		return result.OkVoid()
	default:
		// Previous run did not inherit, but current run does
		h.Success.old = make(map[string]bool)
		h.Success.new = make(map[string]bool)
		h.Success.inheritable = true
	}

	switch provider {
	case "mgo":
		var docs = map[string]interface{}{}
		r := mgo.Mgo(&docs, "find", map[string]interface{}{
			"Database":   config.Conf().DBName,
			"Collection": h.Success.tabName,
		})
		if r.IsErr() {
			logs.Log().Error(" * Fail [read success record][mgo]: %v\n", r.UnwrapErr())
			return result.OkVoid()
		}
		// Use checked assertions so an unexpected document shape is skipped
		// instead of panicking.
		list, _ := docs["Docs"].([]interface{})
		for _, v := range list {
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			id, ok := doc["_id"].(string)
			if !ok {
				continue
			}
			h.Success.old[id] = true
		}

	case "mysql":
		if _, err := mysql.DB(); err != nil {
			logs.Log().Error(" * Fail [read success record][mysql]: %v\n", err)
			return result.OkVoid()
		}
		table, ok := getReadMysqlTable(h.Success.tabName)
		if !ok {
			table = mysql.New().Unwrap().SetTableName(h.Success.tabName)
			setReadMysqlTable(h.Success.tabName, table)
		}
		r := table.SelectAll()
		if r.IsErr() {
			logs.Log().Error(" * Fail [read success record][mysql]: %v\n", r.UnwrapErr())
			return result.OkVoid()
		}
		rows := r.Unwrap()
		for rows.Next() {
			var id string
			// A failed scan leaves id empty; skip the row rather than
			// recording "" as a successfully crawled request.
			if err := rows.Scan(&id); err != nil {
				logs.Log().Error(" * Fail [read success record][mysql]: %v\n", err)
				continue
			}
			h.Success.old[id] = true
		}

	default:
		f, err := os.Open(h.Success.fileName)
		if err != nil {
			// A missing file simply means there is no history yet.
			return result.OkVoid()
		}
		defer closer.LogClose(f, logs.Log().Error)
		b, err := io.ReadAll(f)
		if err != nil {
			logs.Log().Error(" * Fail [read success record][file]: %v\n", err)
			return result.OkVoid()
		}
		if len(b) == 0 {
			return result.OkVoid()
		}
		// The file holds `,"id":true,...` fragments: turn the leading comma
		// into '{' and append '}' to obtain a JSON object.
		b[0] = '{'
		if err := json.Unmarshal(append(b, '}'), &h.Success.old); err != nil {
			logs.Log().Error(" * Fail [read success record][file]: %v\n", err)
		}
	}

	logs.Log().Informational(" * [read success record]: %v\n", len(h.Success.old))
	return result.OkVoid()
}
// ReadFailure loads previously persisted failure records into the in-memory
// failure list.
//
// provider selects the backend: "mgo", "mysql", or anything else for the local
// file. When inherit is false the list is reset and nothing is read. When
// inherit is true and the previous call also inherited, the existing list is
// kept untouched. Otherwise the list is reset and reloaded from the backend.
// Records that cannot be deserialized are skipped.
//
// Backend problems are logged and never returned: the result is always Ok.
func (h *History) ReadFailure(provider string, inherit bool) result.VoidResult {
	h.RWMutex.Lock()
	h.provider = provider
	h.RWMutex.Unlock()

	switch {
	case !inherit:
		// Not inheriting history
		h.Failure.list = make(map[string]*request.Request)
		h.Failure.inheritable = false
		return result.OkVoid()
	case h.Failure.inheritable:
		// Both current and previous runs inherit history
		return result.OkVoid()
	default:
		// Previous run did not inherit, but current run does
		h.Failure.list = make(map[string]*request.Request)
		h.Failure.inheritable = true
	}

	var fLen int
	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			logs.Log().Error(" * Fail [read failure record][mgo]: %v\n", mgo.Error())
			return result.OkVoid()
		}
		var docs = []interface{}{}
		// Check the call result instead of unwrapping it: this function has
		// no deferred Catch, so Unwrap on an error would panic the caller.
		callResult := mgo.Call(func(src pool.Src) error {
			c := src.(*mgo.MgoSrc).DB(config.Conf().DBName).C(h.Failure.tabName)
			return c.Find(nil).All(&docs)
		})
		if callResult.IsErr() {
			logs.Log().Error(" * Fail [read failure record][mgo]: %v\n", callResult.UnwrapErr())
			return result.OkVoid()
		}
		fLen = len(docs)
		for _, v := range docs {
			// Checked assertions: skip documents with an unexpected shape.
			doc, ok := v.(bson.M)
			if !ok {
				continue
			}
			key, ok := doc["_id"].(string)
			if !ok {
				continue
			}
			failure, ok := doc["failure"].(string)
			if !ok {
				continue
			}
			reqResult := request.UnSerialize(failure)
			if reqResult.IsErr() {
				continue
			}
			h.Failure.list[key] = reqResult.Unwrap()
		}

	case "mysql":
		if _, err := mysql.DB(); err != nil {
			logs.Log().Error(" * Fail [read failure record][mysql]: %v\n", err)
			return result.OkVoid()
		}
		table, ok := getReadMysqlTable(h.Failure.tabName)
		if !ok {
			table = mysql.New().Unwrap().SetTableName(h.Failure.tabName)
			setReadMysqlTable(h.Failure.tabName, table)
		}
		r := table.SelectAll()
		if r.IsErr() {
			logs.Log().Error(" * Fail [read failure record][mysql]: %v\n", r.UnwrapErr())
			return result.OkVoid()
		}
		rows := r.Unwrap()
		for rows.Next() {
			var key, failure string
			// Skip rows that cannot be scanned instead of trying to
			// deserialize an empty payload.
			if err := rows.Scan(&key, &failure); err != nil {
				logs.Log().Error(" * Fail [read failure record][mysql]: %v\n", err)
				continue
			}
			reqResult := request.UnSerialize(failure)
			if reqResult.IsErr() {
				continue
			}
			h.Failure.list[key] = reqResult.Unwrap()
			fLen++
		}

	default:
		f, err := os.Open(h.Failure.fileName)
		if err != nil {
			// A missing file simply means there is no history yet.
			return result.OkVoid()
		}
		defer closer.LogClose(f, logs.Log().Error)
		b, err := io.ReadAll(f)
		if err != nil {
			logs.Log().Error(" * Fail [read failure record][file]: %v\n", err)
			return result.OkVoid()
		}
		if len(b) == 0 {
			return result.OkVoid()
		}
		// The file is a JSON object mapping request key -> serialized request.
		docs := map[string]string{}
		if err := json.Unmarshal(b, &docs); err != nil {
			logs.Log().Error(" * Fail [read failure record][file]: %v\n", err)
		}
		fLen = len(docs)
		for key, s := range docs {
			reqResult := request.UnSerialize(s)
			if reqResult.IsErr() {
				continue
			}
			h.Failure.list[key] = reqResult.Unwrap()
		}
	}

	logs.Log().Informational(" * [read failure record]: %v\n", fLen)
	return result.OkVoid()
}
// Empty discards every cached success and failure record by replacing the
// three backing maps with fresh ones. Nothing is written to any backend.
func (h *History) Empty() {
	h.RWMutex.Lock()
	defer h.RWMutex.Unlock()

	h.Success.new = map[string]bool{}
	h.Success.old = map[string]bool{}
	h.Failure.list = map[string]*request.Request{}
}
// FlushSuccess records provider as the current backend, delegates to
// Success.flush and logs the number of records written when it is positive.
// A failed flush is unwrapped and converted into the returned result by the
// deferred Catch.
func (h *History) FlushSuccess(provider string) (r result.VoidResult) {
	defer r.Catch()

	h.RWMutex.Lock()
	h.provider = provider
	h.RWMutex.Unlock()

	if written := h.Success.flush(provider).Unwrap(); written > 0 {
		logs.Log().Informational(" * [add success record]: %v\n", written)
	}
	return result.OkVoid()
}
// FlushFailure records provider as the current backend, delegates to
// Failure.flush and logs the number of records written when it is positive.
// A failed flush is unwrapped and converted into the returned result by the
// deferred Catch.
func (h *History) FlushFailure(provider string) (r result.VoidResult) {
	defer r.Catch()

	h.RWMutex.Lock()
	h.provider = provider
	h.RWMutex.Unlock()

	if written := h.Failure.flush(provider).Unwrap(); written > 0 {
		logs.Log().Informational(" * [add failure record]: %v\n", written)
	}
	return result.OkVoid()
}
// Cache of MySQL table handles used for reading history, keyed by table name.
var (
// readMysqlTable maps a table name to its cached handle.
readMysqlTable = map[string]*mysql.Table{}
// readMysqlTableLock guards readMysqlTable.
readMysqlTableLock sync.RWMutex
)
// getReadMysqlTable looks up the cached read handle for name. On a hit it
// returns a clone of the cached table and true; on a miss it returns nil and
// false.
func getReadMysqlTable(name string) (*mysql.Table, bool) {
	readMysqlTableLock.RLock()
	cached, found := readMysqlTable[name]
	readMysqlTableLock.RUnlock()

	if !found {
		return nil, false
	}
	return cached.Clone(), true
}
// setReadMysqlTable stores tab as the cached read handle for name, replacing
// any previous entry.
func setReadMysqlTable(name string, tab *mysql.Table) {
	readMysqlTableLock.Lock()
	defer readMysqlTableLock.Unlock()
	readMysqlTable[name] = tab
}
// Cache of MySQL table handles used for writing history, keyed by table name.
var (
// writeMysqlTable maps a table name to its cached handle.
writeMysqlTable = map[string]*mysql.Table{}
// writeMysqlTableLock guards writeMysqlTable.
writeMysqlTableLock sync.RWMutex
)
// getWriteMysqlTable looks up the cached write handle for name. On a hit it
// returns a clone of the cached table and true; on a miss it returns nil and
// false.
func getWriteMysqlTable(name string) (*mysql.Table, bool) {
	writeMysqlTableLock.RLock()
	cached, found := writeMysqlTable[name]
	writeMysqlTableLock.RUnlock()

	if !found {
		return nil, false
	}
	return cached.Clone(), true
}
// setWriteMysqlTable stores tab as the cached write handle for name, replacing
// any previous entry.
func setWriteMysqlTable(name string, tab *mysql.Table) {
	writeMysqlTableLock.Lock()
	defer writeMysqlTableLock.Unlock()
	writeMysqlTable[name] = tab
}
================================================
FILE: app/aid/history/history_test.go
================================================
package history
import (
"database/sql"
"encoding/json"
"net/http"
"os"
"path/filepath"
"testing"
sqlmock "github.com/DATA-DOG/go-sqlmock"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/config"
)
// setupHistoryDir creates <tmp>/<WorkRoot>/<HistoryTag> inside a fresh
// temporary directory, changes the working directory to <tmp>, and returns a
// function that changes back to the previous working directory.
func setupHistoryDir(t *testing.T) (cleanup func()) {
	root := t.TempDir()
	target := filepath.Join(root, config.WorkRoot, config.HistoryTag)
	if mkErr := os.MkdirAll(target, 0777); mkErr != nil {
		t.Fatalf("MkdirAll: %v", mkErr)
	}
	prevWD, _ := os.Getwd()
	if cdErr := os.Chdir(root); cdErr != nil {
		t.Fatalf("Chdir: %v", cdErr)
	}
	cleanup = func() { os.Chdir(prevWD) }
	return cleanup
}
// TestNew checks that New returns a usable store, with and without a subName:
// the first upsert of an id inserts, the second does not.
func TestNew(t *testing.T) {
	// Each entry is {name, subName}.
	cases := [][2]string{
		{"spider1", ""},
		{"spider2", "sub"},
	}
	for _, tc := range cases {
		name, subName := tc[0], tc[1]
		t.Run(name+"_"+subName, func(t *testing.T) {
			defer setupHistoryDir(t)()
			_ = config.Conf()

			h := New(name, subName)
			if h == nil {
				t.Fatal("New returned nil")
			}
			if inserted := h.UpsertSuccess("id1"); !inserted {
				t.Error("UpsertSuccess want true")
			}
			if insertedAgain := h.UpsertSuccess("id1"); insertedAgain {
				t.Error("UpsertSuccess duplicate want false")
			}
		})
	}
}
// TestHistory_ReadSuccess_File exercises ReadSuccess with the "file" provider:
// without inheritance, with inheritance but no file, and with inheritance and
// a file holding two ids (which must end up in the old set).
func TestHistory_ReadSuccess_File(t *testing.T) {
	type testCase struct {
		name     string
		inherit  bool
		fileData string
		checkOld bool
	}
	cases := []testCase{
		{name: "no inherit", inherit: false, fileData: "", checkOld: false},
		{name: "inherit no file", inherit: true, fileData: "", checkOld: false},
		{name: "inherit with data", inherit: true, fileData: `,"id1":true,"id2":true`, checkOld: true},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			defer setupHistoryDir(t)()
			_ = config.Conf()
			h := New("test", "").(*History)

			if tc.fileData != "" {
				wrErr := os.WriteFile(h.Success.fileName, []byte(tc.fileData), 0644)
				if wrErr != nil {
					t.Fatalf("WriteFile: %v", wrErr)
				}
			}
			if res := h.ReadSuccess("file", tc.inherit); res.IsErr() {
				t.Errorf("ReadSuccess: %v", res.UnwrapErr())
			}
			if !tc.checkOld {
				return
			}
			loaded := len(h.Success.old) == 2 && h.Success.HasSuccess("id1") && h.Success.HasSuccess("id2")
			if !loaded {
				t.Errorf("expected ids in old, got len=%d old=%v", len(h.Success.old), h.Success.old)
			}
		})
	}
}
// TestHistory_ReadSuccess_EmptyFile checks that ReadSuccess tolerates an
// existing but empty success file.
func TestHistory_ReadSuccess_EmptyFile(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if wrErr := os.WriteFile(h.Success.fileName, []byte{}, 0644); wrErr != nil {
		t.Fatalf("WriteFile: %v", wrErr)
	}
	if res := h.ReadSuccess("file", true); res.IsErr() {
		t.Errorf("ReadSuccess: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadSuccess_InheritPaths covers both inheriting branches of
// ReadSuccess: already inheritable (no-op) and switching from non-inheritable
// to inheritable (the old set is reset).
func TestHistory_ReadSuccess_InheritPaths(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	// Already inheritable: the call must succeed.
	h.Success.inheritable = true
	if res := h.ReadSuccess("file", true); res.IsErr() {
		t.Errorf("ReadSuccess inheritable: %v", res.UnwrapErr())
	}

	// Switching to inherit: stale entries must be dropped.
	h.Success.inheritable = false
	h.Success.old["x"] = true
	if res := h.ReadSuccess("file", true); res.IsErr() {
		t.Errorf("ReadSuccess: %v", res.UnwrapErr())
	}
	if remaining := len(h.Success.old); remaining != 0 {
		t.Error("expected old cleared when switching to inherit")
	}
}
// TestHistory_ReadFailure_File exercises ReadFailure with the "file" provider:
// without inheritance, with inheritance but no file, and with inheritance and
// a file holding one serialized request.
//
// Each subtest builds its own History. With a shared instance the
// "inherit no file" case leaves Failure.inheritable set, which makes the
// following "inherit with data" case return early without reading the file.
func TestHistory_ReadFailure_File(t *testing.T) {
	cleanup := setupHistoryDir(t)
	defer cleanup()
	_ = config.Conf()

	req := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	req.Prepare()
	ser := req.Serialize().Unwrap()
	fileData, err := json.Marshal(map[string]string{req.Unique(): ser})
	if err != nil {
		t.Fatalf("Marshal: %v", err)
	}

	tests := []struct {
		name     string
		inherit  bool
		fileData []byte
		wantLen  int // expected number of loaded failure records
	}{
		{"no inherit", false, nil, 0},
		{"inherit no file", true, nil, 0},
		{"inherit with data", true, fileData, 1},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			h := New("test", "").(*History)
			if tt.fileData != nil {
				if err := os.WriteFile(h.Failure.fileName, tt.fileData, 0644); err != nil {
					t.Fatalf("WriteFile: %v", err)
				}
			}
			r := h.ReadFailure("file", tt.inherit)
			if r.IsErr() {
				t.Errorf("ReadFailure: %v", r.UnwrapErr())
			}
			if got := len(h.Failure.list); got != tt.wantLen {
				t.Errorf("ReadFailure loaded %d records, want %d", got, tt.wantLen)
			}
		})
	}
}
// TestHistory_ReadFailure_EmptyFile checks that ReadFailure tolerates an
// existing but empty failure file.
func TestHistory_ReadFailure_EmptyFile(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if wrErr := os.WriteFile(h.Failure.fileName, []byte{}, 0644); wrErr != nil {
		t.Fatalf("WriteFile: %v", wrErr)
	}
	if res := h.ReadFailure("file", true); res.IsErr() {
		t.Errorf("ReadFailure: %v", res.UnwrapErr())
	}
}
// TestHistory_Empty checks that Empty drops both the success and the failure
// records that were added beforehand.
func TestHistory_Empty(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	h.UpsertSuccess("id1")
	failed := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	failed.Prepare()
	h.UpsertFailure(failed)

	h.Empty()

	if h.HasSuccess("id1") {
		t.Error("Empty should clear success")
	}
	if leftover := h.PullFailure(); len(leftover) != 0 {
		t.Error("Empty should clear failure")
	}
}
// TestHistory_FlushSuccess_File checks that flushing two success ids with the
// "file" provider succeeds and creates the success file.
func TestHistory_FlushSuccess_File(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	for _, id := range []string{"id1", "id2"} {
		h.UpsertSuccess(id)
	}
	if res := h.FlushSuccess("file"); res.IsErr() {
		t.Errorf("FlushSuccess: %v", res.UnwrapErr())
	}
	if _, statErr := os.Stat(h.Success.fileName); statErr != nil {
		t.Errorf("FlushSuccess file: %v", statErr)
	}
}
// TestHistory_FlushFailure_File checks that flushing one failure record with
// the "file" provider succeeds and creates the failure file.
func TestHistory_FlushFailure_File(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	failed := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	failed.Prepare()
	h.UpsertFailure(failed)

	if res := h.FlushFailure("file"); res.IsErr() {
		t.Errorf("FlushFailure: %v", res.UnwrapErr())
	}
	if _, statErr := os.Stat(h.Failure.fileName); statErr != nil {
		t.Errorf("FlushFailure file: %v", statErr)
	}
}
// TestHistory_FlushSuccess_Empty checks that flushing with no pending success
// records is not an error.
func TestHistory_FlushSuccess_Empty(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if res := h.FlushSuccess("file"); res.IsErr() {
		t.Errorf("FlushSuccess empty: %v", res.UnwrapErr())
	}
}
// TestHistory_FlushFailure_Empty checks that flushing with no pending failure
// records is not an error.
func TestHistory_FlushFailure_Empty(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if res := h.FlushFailure("file"); res.IsErr() {
		t.Errorf("FlushFailure empty: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadSuccess_FileNotFound checks that ReadSuccess does not fail
// when the success file does not exist.
func TestHistory_ReadSuccess_FileNotFound(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if res := h.ReadSuccess("file", true); res.IsErr() {
		t.Errorf("ReadSuccess file not found: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadSuccess_ReadFailure_MysqlMock drives both read paths of the
// "mysql" provider against a sqlmock database: two success ids and one
// serialized failure request must be loaded.
func TestHistory_ReadSuccess_ReadFailure_MysqlMock(t *testing.T) {
	sqlDB, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("sqlmock.New: %v", err)
	}
	defer sqlDB.Close()
	defer mysql.SetDBForTest(sqlDB)()
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	// Success records.
	successRows := sqlmock.NewRows([]string{"id"}).AddRow("id1").AddRow("id2")
	mock.ExpectQuery("SELECT \\* FROM").WillReturnRows(successRows)
	if res := h.ReadSuccess("mysql", true); res.IsErr() {
		t.Errorf("ReadSuccess mysql: %v", res.UnwrapErr())
	}
	if n := len(h.Success.old); n != 2 {
		t.Errorf("ReadSuccess mysql: want 2 old, got %d", n)
	}

	// Failure records.
	failed := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	failed.Prepare()
	payload := failed.Serialize().Unwrap()
	failureRows := sqlmock.NewRows([]string{"id", "failure"}).AddRow(failed.Unique(), payload)
	mock.ExpectQuery("SELECT \\* FROM").WillReturnRows(failureRows)
	if res := h.ReadFailure("mysql", true); res.IsErr() {
		t.Errorf("ReadFailure mysql: %v", res.UnwrapErr())
	}
	if n := len(h.Failure.list); n != 1 {
		t.Errorf("ReadFailure mysql: want 1, got %d", n)
	}
}
// TestHistory_ReadSuccess_MysqlDBError checks that ReadSuccess still returns
// Ok when the test database handle is set to nil.
func TestHistory_ReadSuccess_MysqlDBError(t *testing.T) {
	defer mysql.SetDBForTest(nil)()
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if res := h.ReadSuccess("mysql", true); res.IsErr() {
		t.Errorf("ReadSuccess mysql no db: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadFailure_MysqlDBError checks that ReadFailure still returns
// Ok when the test database handle is set to nil.
func TestHistory_ReadFailure_MysqlDBError(t *testing.T) {
	defer mysql.SetDBForTest(nil)()
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	if res := h.ReadFailure("mysql", true); res.IsErr() {
		t.Errorf("ReadFailure mysql no db: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadSuccess_MysqlSelectError checks that ReadSuccess still
// returns Ok when the SELECT query itself fails.
func TestHistory_ReadSuccess_MysqlSelectError(t *testing.T) {
	sqlDB, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("sqlmock.New: %v", err)
	}
	defer sqlDB.Close()
	defer mysql.SetDBForTest(sqlDB)()
	defer setupHistoryDir(t)()
	_ = config.Conf()

	h := New("test", "").(*History)
	mock.ExpectQuery("SELECT \\* FROM").WillReturnError(sql.ErrConnDone)
	if res := h.ReadSuccess("mysql", true); res.IsErr() {
		t.Errorf("ReadSuccess mysql select err: %v", res.UnwrapErr())
	}
}
// TestHistory_FlushSuccess_FlushFailure_MysqlMock drives both flush paths of
// the "mysql" provider against a sqlmock database, expecting a CREATE TABLE
// followed by an INSERT for each of them.
func TestHistory_FlushSuccess_FlushFailure_MysqlMock(t *testing.T) {
	sqlDB, mock, err := sqlmock.New()
	if err != nil {
		t.Fatalf("sqlmock.New: %v", err)
	}
	defer sqlDB.Close()
	defer mysql.SetDBForTest(sqlDB)()
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	// Success records.
	h.UpsertSuccess("id1")
	h.UpsertSuccess("id2")
	mock.ExpectExec("CREATE TABLE IF NOT EXISTS").WillReturnResult(sqlmock.NewResult(0, 0))
	mock.ExpectExec("INSERT INTO").WithArgs("id1", "id2").WillReturnResult(sqlmock.NewResult(2, 2))
	if res := h.FlushSuccess("mysql"); res.IsErr() {
		t.Errorf("FlushSuccess mysql: %v", res.UnwrapErr())
	}

	// Failure records.
	failed := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	failed.Prepare()
	h.UpsertFailure(failed)
	mock.ExpectExec("CREATE TABLE IF NOT EXISTS").WillReturnResult(sqlmock.NewResult(0, 0))
	mock.ExpectExec("INSERT INTO").WillReturnResult(sqlmock.NewResult(1, 1))
	if res := h.FlushFailure("mysql"); res.IsErr() {
		t.Errorf("FlushFailure mysql: %v", res.UnwrapErr())
	}
}
// TestHistory_ReadFailure_InvalidData writes a failure file holding one valid
// serialized request and one undecodable entry, and checks that only the valid
// one is loaded.
func TestHistory_ReadFailure_InvalidData(t *testing.T) {
	defer setupHistoryDir(t)()
	_ = config.Conf()
	h := New("test", "").(*History)

	valid := &request.Request{Spider: "s", URL: "http://a.com", Rule: "r", Method: "GET", Header: make(http.Header)}
	valid.Prepare()
	records := map[string]string{
		valid.Unique(): valid.Serialize().Unwrap(),
		"badkey":       "{invalid}",
	}
	encoded, _ := json.Marshal(records)
	if wrErr := os.WriteFile(h.Failure.fileName, encoded, 0644); wrErr != nil {
		t.Fatalf("WriteFile: %v", wrErr)
	}

	if res := h.ReadFailure("file", true); res.IsErr() {
		t.Errorf("ReadFailure: %v", res.UnwrapErr())
	}
	if n := len(h.Failure.list); n != 1 {
		t.Errorf("expected 1 valid record, got %d", n)
	}
}
================================================
FILE: app/aid/history/success.go
================================================
package history
import (
"encoding/json"
"fmt"
"os"
"sync"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/mgo"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/config"
)
// Success tracks successfully crawled request IDs for deduplication.
type Success struct {
	tabName  string // table/collection name used by the database providers
	fileName string // path used by the file provider
	// new holds IDs added since the last flush.
	new map[string]bool
	// old holds IDs that were loaded from storage or already flushed.
	old map[string]bool
	// inheritable records whether the last read inherited stored history.
	inheritable bool
	sync.RWMutex
}

// UpsertSuccess updates or adds a success record. Returns true if an insert
// occurred, false when reqUnique is already known (in either old or new).
func (s *Success) UpsertSuccess(reqUnique string) bool {
	s.RWMutex.Lock()
	defer s.RWMutex.Unlock()

	if s.old[reqUnique] || s.new[reqUnique] {
		return false
	}
	s.new[reqUnique] = true
	return true
}

// HasSuccess reports whether reqUnique is recorded as a success, in either the
// old or the new set. It only reads, so a shared (read) lock is sufficient and
// lets concurrent lookups proceed in parallel.
func (s *Success) HasSuccess(reqUnique string) bool {
	s.RWMutex.RLock()
	defer s.RWMutex.RUnlock()
	return s.old[reqUnique] || s.new[reqUnique]
}

// DeleteSuccess removes a success record from the new (not yet flushed) set.
// Entries in the old set are left untouched.
func (s *Success) DeleteSuccess(reqUnique string) {
	s.RWMutex.Lock()
	defer s.RWMutex.Unlock()
	delete(s.new, reqUnique)
}
// flush persists the not-yet-flushed success IDs through provider ("mgo",
// "mysql", or anything else for the local file), marks them as old and resets
// the new set.
//
// It returns the number of IDs written, 0 when there was nothing to flush, or
// an error result when the backend could not be written. For the file
// provider the IDs stay in the new set on failure so a later flush can retry.
func (s *Success) flush(provider string) result.Result[int] {
	s.RWMutex.Lock()
	defer s.RWMutex.Unlock()

	sLen := len(s.new)
	if sLen == 0 {
		return result.Ok(0)
	}

	switch provider {
	case "mgo":
		if mgo.Error() != nil {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][mgo]: %v [ERROR] %v\n", sLen, mgo.Error()))
		}
		var docs = make([]map[string]interface{}, sLen)
		var i int
		for key := range s.new {
			docs[i] = map[string]interface{}{"_id": key}
			s.old[key] = true
			i++
		}
		r := mgo.Mgo(nil, "insert", map[string]interface{}{
			"Database":   config.Conf().DBName,
			"Collection": s.tabName,
			"Docs":       docs,
		})
		if r.IsErr() {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][mgo]: %v [ERROR] %v\n", sLen, r.UnwrapErr()))
		}

	case "mysql":
		_, err := mysql.DB()
		if err != nil {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][mysql]: %v [ERROR] %v\n", sLen, err))
		}
		table, ok := getWriteMysqlTable(s.tabName)
		if !ok {
			table = mysql.New().Unwrap()
			table.SetTableName(s.tabName).CustomPrimaryKey(`id VARCHAR(255) NOT NULL PRIMARY KEY`)
			if r := table.Create(); r.IsErr() {
				return result.TryErr[int](fmt.Errorf(" * Fail [add success record][mysql]: %v [ERROR] %v\n", sLen, r.UnwrapErr()))
			}
			setWriteMysqlTable(s.tabName, table)
		}
		for key := range s.new {
			table.AutoInsert([]string{key})
			s.old[key] = true
		}
		if r := table.FlushInsert(); r.IsErr() {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][mysql]: %v [ERROR] %v\n", sLen, r.UnwrapErr()))
		}

	default:
		f, err := os.OpenFile(s.fileName, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0777)
		if err != nil {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][file]: %v [ERROR] %v\n", sLen, err))
		}
		b, err := json.Marshal(s.new)
		if err != nil {
			f.Close()
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][file]: %v [ERROR] %v\n", sLen, err))
		}
		// The file is a sequence of `,"id":true` fragments: replace the
		// opening '{' with ',' and drop the closing '}' before appending.
		b[0] = ','
		_, err = f.Write(b[:len(b)-1])
		if closeErr := f.Close(); err == nil {
			err = closeErr
		}
		if err != nil {
			return result.TryErr[int](fmt.Errorf(" * Fail [add success record][file]: %v [ERROR] %v\n", sLen, err))
		}
		// Only mark the IDs as persisted once the write has succeeded.
		for key := range s.new {
			s.old[key] = true
		}
	}

	s.new = make(map[string]bool)
	return result.Ok(sLen)
}
================================================
FILE: app/aid/history/success_test.go
================================================
package history
import (
"encoding/json"
"os"
"path/filepath"
"testing"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
)
// TestSuccess_UpsertSuccess checks that each id inserts exactly once: the
// first upsert returns true and the repeat returns false.
func TestSuccess_UpsertSuccess(t *testing.T) {
	s := &Success{tabName: "t", fileName: "f"}
	s.new = make(map[string]bool)
	s.old = make(map[string]bool)

	type step struct {
		unique string
		want   bool
	}
	steps := []step{
		{unique: "id1", want: true},
		{unique: "id1", want: false},
		{unique: "id2", want: true},
		{unique: "id2", want: false},
	}
	for _, st := range steps {
		got := s.UpsertSuccess(st.unique)
		if got != st.want {
			t.Errorf("UpsertSuccess(%q) = %v, want %v", st.unique, got, st.want)
		}
	}
}
// TestSuccess_UpsertSuccess_OldExists checks that an id already present in the
// old set is not inserted again.
func TestSuccess_UpsertSuccess_OldExists(t *testing.T) {
	known := map[string]bool{"id1": true}
	s := &Success{
		tabName:  "t",
		fileName: "f",
		new:      make(map[string]bool),
		old:      known,
	}
	inserted := s.UpsertSuccess("id1")
	if inserted {
		t.Error("UpsertSuccess when old exists want false")
	}
}
// TestSuccess_HasSuccess checks lookups for an id in the new set, an id in the
// old set, and an unknown id.
func TestSuccess_HasSuccess(t *testing.T) {
	s := &Success{
		tabName:  "t",
		fileName: "f",
		new:      map[string]bool{"n1": true},
		old:      map[string]bool{"o1": true},
	}
	type probe struct {
		unique string
		want   bool
	}
	probes := []probe{
		{unique: "n1", want: true},
		{unique: "o1", want: true},
		{unique: "x", want: false},
	}
	for _, p := range probes {
		got := s.HasSuccess(p.unique)
		if got != p.want {
			t.Errorf("HasSuccess(%q) = %v, want %v", p.unique, got, p.want)
		}
	}
}
// TestSuccess_DeleteSuccess checks that deleting an id held in the new set
// makes it unknown.
func TestSuccess_DeleteSuccess(t *testing.T) {
	pending := map[string]bool{"id1": true}
	s := &Success{
		tabName:  "t",
		fileName: "f",
		new:      pending,
		old:      make(map[string]bool),
	}
	s.DeleteSuccess("id1")
	if stillThere := s.HasSuccess("id1"); stillThere {
		t.Error("DeleteSuccess should remove from new")
	}
}
// TestSuccess_Flush_File checks that flushing two success ids with the "file"
// provider reports a count of 2 and leaves a file on disk.
func TestSuccess_Flush_File(t *testing.T) {
	root := t.TempDir()
	historyDir := filepath.Join(root, config.WorkRoot, config.HistoryTag)
	if mkErr := os.MkdirAll(historyDir, 0777); mkErr != nil {
		t.Fatalf("MkdirAll: %v", mkErr)
	}
	prevWD, _ := os.Getwd()
	os.Chdir(root)
	defer func() { os.Chdir(prevWD) }()

	target := filepath.Join(historyDir, "history__y__test")
	s := &Success{
		tabName:  util.FileNameReplace("history__y__test"),
		fileName: target,
		new:      map[string]bool{"a": true, "b": true},
		old:      make(map[string]bool),
	}

	res := s.flush("file")
	if res.IsErr() {
		t.Fatalf("flush: %v", res.UnwrapErr())
	}
	if n := res.Unwrap(); n != 2 {
		t.Errorf("flush count = %v, want 2", n)
	}
	if _, statErr := os.Stat(target); statErr != nil {
		t.Errorf("flush file: %v", statErr)
	}
}
// TestSuccess_Flush_Empty checks that flushing with nothing pending succeeds
// with a count of 0, even when fileName points at "/nonexistent".
func TestSuccess_Flush_Empty(t *testing.T) {
	s := &Success{tabName: "t", fileName: "/nonexistent"}
	s.new = make(map[string]bool)
	s.old = make(map[string]bool)

	res := s.flush("file")
	if res.IsErr() {
		t.Fatalf("flush empty: %v", res.UnwrapErr())
	}
	if n := res.Unwrap(); n != 0 {
		t.Errorf("flush count = %v, want 0", n)
	}
}
// TestSuccess_Flush_FileAppend flushes twice to the same file and checks that
// the second flush appends to, rather than replaces, the first: after
// rewriting the leading comma to '{' and adding '}', the file must decode to a
// JSON object containing the ids from both flushes.
func TestSuccess_Flush_FileAppend(t *testing.T) {
	tmp := t.TempDir()
	dir := filepath.Join(tmp, config.WorkRoot, config.HistoryTag)
	if err := os.MkdirAll(dir, 0777); err != nil {
		t.Fatalf("MkdirAll: %v", err)
	}
	orig, _ := os.Getwd()
	os.Chdir(tmp)
	defer os.Chdir(orig)

	fileName := filepath.Join(dir, "history__y__test")
	s := &Success{
		tabName:  util.FileNameReplace("history__y__test"),
		fileName: fileName,
		new:      map[string]bool{"c": true},
		old:      make(map[string]bool),
	}
	if r := s.flush("file"); r.IsErr() {
		t.Fatalf("first flush: %v", r.UnwrapErr())
	}

	// Second batch: must be appended after the first one.
	if !s.UpsertSuccess("d") {
		t.Fatal("UpsertSuccess(d) want true")
	}
	if r := s.flush("file"); r.IsErr() {
		t.Fatalf("second flush: %v", r.UnwrapErr())
	}

	data, err := os.ReadFile(fileName)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	if len(data) == 0 {
		t.Fatal("flush wrote an empty file")
	}
	var m map[string]bool
	if err := json.Unmarshal(append(append([]byte{'{'}, data[1:]...), '}'), &m); err != nil {
		t.Fatalf("unmarshal file: %v, content: %s", err, data)
	}
	if !m["c"] || !m["d"] {
		t.Errorf("expected c and d in file, got %v", m)
	}
}
================================================
FILE: app/aid/proxy/host.go
================================================
package proxy
import (
"sync"
"time"
)
// ProxyForHost holds the proxy IPs usable for one host together with their
// measured delays. It implements sort.Interface, ordering by ascending delay.
type ProxyForHost struct {
	curIndex  int // Index of current proxy IP
	proxys    []string
	timedelay []time.Duration
	isEcho    bool // Whether to print proxy switch info
	sync.Mutex
}

// Len returns the number of proxies; part of sort.Interface.
func (ph *ProxyForHost) Len() int {
	count := len(ph.proxys)
	return count
}

// Less reports whether proxy i has a smaller delay than proxy j; part of
// sort.Interface.
func (ph *ProxyForHost) Less(i, j int) bool {
	left, right := ph.timedelay[i], ph.timedelay[j]
	return left < right
}

// Swap exchanges entries i and j in both the proxy and the delay slices so
// they stay aligned; part of sort.Interface.
func (ph *ProxyForHost) Swap(i, j int) {
	hosts, delays := ph.proxys, ph.timedelay
	hosts[i], hosts[j] = hosts[j], hosts[i]
	delays[i], delays[j] = delays[j], delays[i]
}
================================================
FILE: app/aid/proxy/host_test.go
================================================
package proxy
import (
"testing"
"time"
)
// TestProxyForHost_Len checks Len for nil, empty, single-element and
// multi-element proxy slices.
func TestProxyForHost_Len(t *testing.T) {
	type testCase struct {
		proxys []string
		want   int
	}
	cases := []testCase{
		{proxys: nil, want: 0},
		{proxys: []string{}, want: 0},
		{proxys: []string{"a"}, want: 1},
		{proxys: []string{"a", "b", "c"}, want: 3},
	}
	for _, tc := range cases {
		ph := &ProxyForHost{proxys: tc.proxys}
		got := ph.Len()
		if got != tc.want {
			t.Errorf("Len() = %v, want %v", got, tc.want)
		}
	}
}
// TestProxyForHost_Less checks that Less orders entries by their delay
// (10ms, 5ms, 20ms at indexes 0, 1, 2).
func TestProxyForHost_Less(t *testing.T) {
	delays := []time.Duration{10 * time.Millisecond, 5 * time.Millisecond, 20 * time.Millisecond}
	ph := &ProxyForHost{proxys: []string{"a", "b", "c"}, timedelay: delays}

	type pair struct {
		i, j int
		want bool
	}
	pairs := []pair{
		{i: 0, j: 1, want: false},
		{i: 1, j: 0, want: true},
		{i: 1, j: 2, want: true},
		{i: 2, j: 1, want: false},
	}
	for _, pr := range pairs {
		got := ph.Less(pr.i, pr.j)
		if got != pr.want {
			t.Errorf("Less(%d,%d) = %v, want %v", pr.i, pr.j, got, pr.want)
		}
	}
}
// TestProxyForHost_Swap checks that Swap exchanges both the proxy and the
// delay at the two indexes.
func TestProxyForHost_Swap(t *testing.T) {
	const slow, fast = 10 * time.Millisecond, 5 * time.Millisecond
	ph := &ProxyForHost{
		proxys:    []string{"a", "b"},
		timedelay: []time.Duration{slow, fast},
	}

	ph.Swap(0, 1)

	if !(ph.proxys[0] == "b" && ph.proxys[1] == "a") {
		t.Errorf("Swap proxys = %v", ph.proxys)
	}
	if !(ph.timedelay[0] == fast && ph.timedelay[1] == slow) {
		t.Errorf("Swap timedelay = %v", ph.timedelay)
	}
}
================================================
FILE: app/aid/proxy/proxy.go
================================================
// Package proxy provides proxy IP pool management and online filtering.
package proxy
import (
"io"
"log"
"net/http"
"net/url"
"os"
"regexp"
"sort"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/andeya/gust/option"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/downloader/surfer"
"github.com/andeya/pholcus/common/ping"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
)
// Proxy manages a pool of proxy IPs with online filtering and per-host sorting.
type Proxy struct {
// ipRegexp extracts a dotted-quad IP address from a proxy string.
ipRegexp *regexp.Regexp
// proxyIPTypeRegexp matches proxy entries written as scheme://[user:pass@]ip:port.
proxyIPTypeRegexp *regexp.Regexp
// proxyUrlTypeRegexp matches proxy entries written as general URLs; Update
// reads the host from capture group 6.
proxyUrlTypeRegexp *regexp.Regexp
// allIps maps each proxy string to the host/IP that is pinged for it.
allIps map[string]string
// all maps each proxy string to whether the last ping found it alive.
all map[string]bool
// online is the number of proxies found alive by findOnline.
online int32
// usable holds the per-host proxy lists, keyed by the host key computed in GetOne.
usable map[string]*ProxyForHost
// ticker is created by UpdateTicker; GetOne advances to the next proxy when it fires.
ticker *time.Ticker
// tickMinute is the ticker period in minutes, as last passed to UpdateTicker.
tickMinute int64
// threadPool is a buffered channel used to bound the number of concurrent pings.
threadPool chan bool
// surf is the Surfer created by New; SetSurfForTest can replace it.
// NOTE(review): its use is outside this part of the file — presumably for test requests through a proxy.
surf surfer.Surfer
sync.Mutex
}
// Tuning constants for proxy testing.
const (
// CONN_TIMEOUT is the timeout value passed to ping.Ping in findOnline.
CONN_TIMEOUT = 4 //4s
// DAIL_TIMEOUT is not referenced in the visible part of this file.
// NOTE(review): presumably a dial timeout in seconds — confirm at its use site.
DAIL_TIMEOUT = 4 //4s
// TRY_TIMES is not referenced in the visible part of this file.
// NOTE(review): presumably a retry count — confirm at its use site.
TRY_TIMES = 3
// Max concurrency for IP speed testing
MAX_THREAD_NUM = 1000
)
// New allocates a Proxy with its regular expressions, empty pools, ping
// semaphore and default Surfer, then starts Update in a background goroutine
// before returning.
func New() *Proxy {
	p := new(Proxy)
	p.ipRegexp = regexp.MustCompile(`[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+`)
	p.proxyIPTypeRegexp = regexp.MustCompile(`https?://([\w]*:[\w]*@)?[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+`)
	p.proxyUrlTypeRegexp = regexp.MustCompile(`((https?|ftp):\/\/)?(([^:\n\r]+):([^@\n\r]+)@)?((www\.)?([^/\n\r:]+)):?([0-9]{1,5})?\/?([^?\n\r]+)?\??([^#\n\r]*)?#?([^\n\r]*)`)
	p.allIps = make(map[string]string)
	p.all = make(map[string]bool)
	p.usable = map[string]*ProxyForHost{}
	p.threadPool = make(chan bool, MAX_THREAD_NUM)
	p.surf = surfer.New()

	go p.Update()
	return p
}
// Count returns the number of online proxy IPs.
//
// The counter is updated with atomic operations while proxies are being
// pinged, so it is read atomically here as well.
func (p *Proxy) Count() int32 {
	return atomic.LoadInt32(&p.online)
}
// SetSurfForTest replaces the Proxy's Surfer with s. Intended for tests that
// need to substitute the downloader.
func (p *Proxy) SetSurfForTest(s surfer.Surfer) {
	p.surf = s
}
// Update refreshes the proxy IP list.
//
// It reads the file named by config.Conf().ProxyFile, registers every entry
// matched by either proxy pattern (initially marked offline) and then pings
// them all through findOnline. It returns an error result when the file cannot
// be opened or read.
func (p *Proxy) Update() result.VoidResult {
	f, err := os.Open(config.Conf().ProxyFile)
	if err != nil {
		return result.TryErrVoid(err)
	}
	b, err := io.ReadAll(f)
	f.Close()
	if err != nil {
		return result.TryErrVoid(err)
	}
	content := string(b)

	// The pool maps are also written under this mutex by findOnline's
	// workers, so populate them under the same lock.
	p.Lock()
	for _, proxy := range p.proxyIPTypeRegexp.FindAllString(content, -1) {
		p.allIps[proxy] = p.ipRegexp.FindString(proxy)
		p.all[proxy] = false
	}
	for _, proxy := range p.proxyUrlTypeRegexp.FindAllString(content, -1) {
		gvalue := p.proxyUrlTypeRegexp.FindStringSubmatch(proxy)
		if len(gvalue) <= 6 {
			// No host capture available; nothing to ping for this entry.
			continue
		}
		p.allIps[proxy] = gvalue[6]
		p.all[proxy] = false
	}
	total := len(p.all)
	p.Unlock()

	log.Printf(" * Read proxy IPs: %v\n", total)

	p.findOnline()

	return result.OkVoid()
}
// findOnline filters proxy IPs that are online.
//
// Every known proxy is pinged concurrently, with at most cap(p.threadPool)
// pings in flight. The alive flag of each proxy is stored in p.all and the
// number of alive proxies in p.online. It blocks until all pings have finished
// and returns p.
func (p *Proxy) findOnline() *Proxy {
	log.Printf(" * Filtering online proxy IPs...")

	// Snapshot the targets first: the workers below write to p.all, and
	// ranging over a map while it is being written is a data race.
	p.Lock()
	targets := make(map[string]string, len(p.all))
	for proxy := range p.all {
		targets[proxy] = p.allIps[proxy]
	}
	p.Unlock()

	atomic.StoreInt32(&p.online, 0)

	var wg sync.WaitGroup
	for proxy, ip := range targets {
		p.threadPool <- true // acquire a slot; blocks when the pool is full
		wg.Add(1)
		go func(proxy, ip string) {
			defer wg.Done()
			defer func() { <-p.threadPool }() // release the slot

			alive := ping.Ping(ip, CONN_TIMEOUT).IsOk()
			p.Lock()
			p.all[proxy] = alive
			p.Unlock()
			if alive {
				atomic.AddInt32(&p.online, 1)
			}
		}(proxy, ip)
	}
	wg.Wait()

	log.Printf(" * Online proxy IP filtering complete, total: %v\n", atomic.LoadInt32(&p.online))

	return p
}
// UpdateTicker updates the ticker.
//
// It replaces the rotation ticker with one firing every tickMinute minutes,
// stopping the previous ticker so it does not keep running, and advances every
// known host to its next proxy. time.NewTicker panics when tickMinute is not
// positive.
func (p *Proxy) UpdateTicker(tickMinute int64) {
	if p.ticker != nil {
		p.ticker.Stop()
	}
	p.tickMinute = tickMinute
	p.ticker = time.NewTicker(time.Duration(p.tickMinute) * time.Minute)
	for _, proxyForHost := range p.usable {
		proxyForHost.curIndex++
		proxyForHost.isEcho = true
	}
}
// GetOne returns the proxy currently selected for the host of u, or None when
// no proxy is online, u has no usable host, or no proxy works for that host.
//
// Proxies are tracked per host key: the host with its first label removed when
// it contains more than one dot (e.g. "www.example.com" -> "example.com").
// The first request for a key tests and sorts the online proxies. Whenever the
// rotation ticker has fired the selection advances to the next proxy, and the
// list is re-tested once the index runs past its end.
func (p *Proxy) GetOne(u string) option.Option[string] {
	if atomic.LoadInt32(&p.online) == 0 {
		return option.None[string]()
	}
	// url.Parse returns a nil URL together with the error, so check it before
	// touching u2.
	u2, err := url.Parse(u)
	if err != nil || u2.Host == "" {
		logs.Log().Informational(" * [%v] Failed to set proxy IP, invalid target URL\n", u)
		return option.None[string]()
	}
	key := u2.Host
	if strings.Count(key, ".") > 1 {
		key = key[strings.Index(key, ".")+1:]
	}
	testHost := u2.Scheme + "://" + u2.Host

	p.Lock()
	defer p.Unlock()

	// Poll the rotation ticker without blocking. A nil channel is never ready,
	// so a Proxy whose ticker has not been installed simply never rotates.
	var tick <-chan time.Time
	if p.ticker != nil {
		tick = p.ticker.C
	}
	ticked := false
	select {
	case <-tick:
		ticked = true
	default:
	}

	ok := true
	proxyForHost := p.usable[key]
	switch {
	case proxyForHost == nil:
		// First request for this host (with or without a pending tick):
		// create its entry and run the initial test.
		if p.usable == nil {
			p.usable = make(map[string]*ProxyForHost)
		}
		proxyForHost = &ProxyForHost{
			proxys:    []string{},
			timedelay: []time.Duration{},
			isEcho:    true,
		}
		p.usable[key] = proxyForHost
		_, ok = p.testAndSort(key, testHost)
	case ticked:
		proxyForHost.curIndex++
		if proxyForHost.curIndex >= proxyForHost.Len() {
			_, ok = p.testAndSort(key, testHost)
		}
		proxyForHost.isEcho = true
	case proxyForHost.Len() == 0:
		ok = false
	case proxyForHost.curIndex >= proxyForHost.Len():
		_, ok = p.testAndSort(key, testHost)
		proxyForHost.isEcho = true
	}
	if !ok {
		logs.Log().Informational(" * [%v] Failed to set proxy IP, no available proxy IPs\n", key)
		return option.None[string]()
	}

	curProxy := proxyForHost.proxys[proxyForHost.curIndex]
	if proxyForHost.isEcho {
		// Announce a newly selected proxy only once.
		logs.Log().Informational(" * Set proxy IP to [%v](%v)\n",
			curProxy,
			proxyForHost.timedelay[proxyForHost.curIndex],
		)
		proxyForHost.isEcho = false
	}
	return option.Some(curProxy)
}
// testAndSort rebuilds the proxy list of the host entry stored under key.
//
// The entry's lists and index are reset, every proxy marked online in p.all is
// probed against testHost (bounded by p.threadPool), and the proxies that
// passed are sorted via sort.Sort. It returns the entry and whether at least
// one proxy is usable. The entry for key must already exist in p.usable.
func (p *Proxy) testAndSort(key string, testHost string) (*ProxyForHost, bool) {
	logs.Log().Informational(" * [%v] Testing and sorting proxy IPs...", key)

	entry := p.usable[key]
	entry.proxys, entry.timedelay, entry.curIndex = []string{}, []time.Duration{}, 0

	for candidate, isOnline := range p.all {
		if isOnline {
			p.threadPool <- true // take a worker slot
			go func(addr string) {
				if usable, delay := p.findUsable(addr, testHost); usable {
					entry.Mutex.Lock()
					entry.proxys = append(entry.proxys, addr)
					entry.timedelay = append(entry.timedelay, delay)
					entry.Mutex.Unlock()
				}
				<-p.threadPool // give the slot back
			}(candidate)
		}
	}

	// Workers keep their slot until they finish; wait for the pool to drain.
	for len(p.threadPool) != 0 {
		time.Sleep(200 * time.Millisecond)
	}

	if entry.Len() == 0 {
		logs.Log().Informational(" * [%v] Testing and sorting proxy IPs complete, no available proxy IPs\n", key)
		return entry, false
	}
	sort.Sort(entry)
	logs.Log().Informational(" * [%v] Testing and sorting proxy IPs complete, available: %v\n", key, entry.Len())
	return entry, true
}
// findUsable checks whether testHost can be reached through proxy by issuing
// a HEAD request via p.surf.
//
// alive is true only when the download succeeds with status 200 OK; timedelay
// is then the elapsed time of the request, and zero otherwise.
func (p *Proxy) findUsable(proxy string, testHost string) (alive bool, timedelay time.Duration) {
	t0 := time.Now()
	req := &request.Request{
		URL:         testHost,
		Method:      "HEAD",
		Header:      make(http.Header),
		DialTimeout: time.Second * time.Duration(DAIL_TIMEOUT),
		ConnTimeout: time.Second * time.Duration(CONN_TIMEOUT),
		TryTimes:    TRY_TIMES,
	}
	req.SetProxy(proxy)
	r := p.surf.Download(req)
	if r.IsErr() {
		return false, 0
	}
	resp := r.Unwrap()
	if resp == nil {
		return false, 0
	}
	// Release the response body; stubbed responses may carry no body at all.
	if resp.Body != nil {
		defer resp.Body.Close()
	}
	if resp.StatusCode != http.StatusOK {
		return false, 0
	}
	return true, time.Since(t0)
}
================================================
FILE: app/aid/proxy/proxy_test.go
================================================
package proxy
import (
"net/http"
"os"
"path/filepath"
"regexp"
"testing"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader/surfer"
"github.com/andeya/pholcus/config"
)
// setupProxyDir creates a temporary working directory containing an empty
// "proxy.lib" under config.WorkRoot and makes it the current directory.
//
// The returned cleanup function switches back to the previous working
// directory; callers are expected to defer it.
func setupProxyDir(t *testing.T) (cleanup func()) {
	t.Helper() // report failures at the caller's line

	tmp := t.TempDir()
	configDir := filepath.Join(tmp, config.WorkRoot)
	if err := os.MkdirAll(configDir, 0777); err != nil {
		t.Fatalf("MkdirAll: %v", err)
	}
	proxyFile := filepath.Join(configDir, "proxy.lib")
	if err := os.WriteFile(proxyFile, []byte(""), 0644); err != nil {
		t.Fatalf("WriteFile: %v", err)
	}
	// Without a valid original directory the cleanup could not restore it.
	orig, err := os.Getwd()
	if err != nil {
		t.Fatalf("Getwd: %v", err)
	}
	if err := os.Chdir(tmp); err != nil {
		t.Fatalf("Chdir: %v", err)
	}
	return func() {
		if err := os.Chdir(orig); err != nil {
			t.Errorf("Chdir back to %q: %v", orig, err)
		}
	}
}
// newTestProxy builds a Proxy with compiled regexps, empty maps, a worker pool
// of MAX_THREAD_NUM slots and a default Surfer. Unlike New it does not start a
// background Update, so tests decide when the proxy file is read.
func newTestProxy() *Proxy {
	return &Proxy{
		ipRegexp:           regexp.MustCompile(`[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+`),
		proxyIPTypeRegexp:  regexp.MustCompile(`https?://([\w]*:[\w]*@)?[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:[0-9]+`),
		proxyUrlTypeRegexp: regexp.MustCompile(`((https?|ftp):\/\/)?(([^:\n\r]+):([^@\n\r]+)@)?((www\.)?([^/\n\r:]+)):?([0-9]{1,5})?\/?([^?\n\r]+)?\??([^#\n\r]*)?#?([^\n\r]*)`),
		allIps:             map[string]string{},
		all:                map[string]bool{},
		usable:             make(map[string]*ProxyForHost),
		threadPool:         make(chan bool, MAX_THREAD_NUM),
		surf:               surfer.New(),
	}
}
// TestProxy_Update_EmptyFile verifies that updating from an empty proxy file
// succeeds and leaves no proxy online.
func TestProxy_Update_EmptyFile(t *testing.T) {
	restore := setupProxyDir(t)
	defer restore()
	_ = config.Conf()

	px := newTestProxy()
	if res := px.Update(); res.IsErr() {
		t.Errorf("Update: %v", res.UnwrapErr())
	}
	if n := px.Count(); n != 0 {
		t.Errorf("Count = %v, want 0", n)
	}
}
// TestProxy_Update_WithIPs verifies that a proxy file containing a plain and a
// credentialed proxy URL is processed without an error.
func TestProxy_Update_WithIPs(t *testing.T) {
	restore := setupProxyDir(t)
	defer restore()
	_ = config.Conf()

	const content = "http://127.0.0.1:8080\nhttp://user:pass@127.0.0.1:9090"
	path := filepath.Join(config.WorkRoot, "proxy.lib")
	if err := os.WriteFile(path, []byte(content), 0644); err != nil {
		t.Fatalf("WriteFile: %v", err)
	}

	if res := newTestProxy().Update(); res.IsErr() {
		t.Errorf("Update: %v", res.UnwrapErr())
	}
}
// TestProxy_GetOne_NoOnline verifies that GetOne yields None while no proxy is
// online.
func TestProxy_GetOne_NoOnline(t *testing.T) {
	px := &Proxy{online: 0}
	got := px.GetOne("http://example.com")
	if got.IsSome() {
		t.Error("GetOne with online=0 want None")
	}
}
// TestProxy_GetOne_EmptyHost verifies that GetOne yields None for a URL that
// carries no host.
func TestProxy_GetOne_EmptyHost(t *testing.T) {
	px := &Proxy{online: 1}
	got := px.GetOne("http://")
	if got.IsSome() {
		t.Error("GetOne with empty host want None")
	}
}
func TestProxy_UpdateTicker(t *testing.T) {
p := &Proxy{
usable: make(map[string]*ProxyForHost),
}
p.usable["example.com"] = &ProxyForHost{curIndex: 0, isEcho: false}
p.UpdateTicker(5)
if p.ticker == nil {
t.Error("UpdateTicker should set ticker")
}
if p.tickMinute != 5 {
t.Errorf("tickMinute = %v, want 5", p.tickMinute)
}
}
// TestProxy_New verifies that a Proxy created by New reports no online proxy
// when the proxy file is empty. New refreshes in a background goroutine, so
// the test waits briefly before checking.
func TestProxy_New(t *testing.T) {
	restore := setupProxyDir(t)
	defer restore()
	_ = config.Conf()

	px := New()
	time.Sleep(100 * time.Millisecond)
	if n := px.Count(); n != 0 {
		t.Errorf("New with empty file Count = %v, want 0", n)
	}
}
func TestProxy_GetOne_WithUsable(t *testing.T) {
p := &Proxy{
online: 1,
ticker: time.NewTicker(time.Hour),
usable: map[string]*ProxyForHost{
"example.com": {
proxys: []string{"http://127.0.0.1:8080"},
timedelay: []time.Duration{time.Millisecond},
curIndex: 0,
isEcho: false,
},
},
}
got := p.GetOne("http://www.example.com/path")
if !got.IsSome() {
t.Fatal("GetOne want Some")
}
if got.Unwrap() != "http://127.0.0.1:8080" {
t.Errorf("GetOne = %v, want http://127.0.0.1:8080", got.Unwrap())
}
}
func TestProxy_GetOne_NoUsableForHost(t *testing.T) {
p := &Proxy{
online: 1,
ticker: time.NewTicker(time.Hour),
usable: map[string]*ProxyForHost{
"example.com": {
proxys: []string{},
timedelay: []time.Duration{},
curIndex: 0,
isEcho: false,
},
},
}
got := p.GetOne("http://www.example.com/path")
if got.IsSome() {
t.Error("GetOne with empty proxys want None")
}
}
// mockSurfer is a test stand-in for the Surfer used by Proxy: it answers every
// download with the configured response.
type mockSurfer struct {
	resp *http.Response
}

// Download ignores req. It returns the configured response, or an error result
// wrapping http.ErrHandlerTimeout when no response was configured.
func (m *mockSurfer) Download(req surfer.Request) result.Result[*http.Response] {
	if m.resp == nil {
		return result.TryErr[*http.Response](http.ErrHandlerTimeout)
	}
	return result.Ok(m.resp)
}
// TestProxy_GetOne_TriggersTestAndSort verifies that GetOne re-tests the
// proxies once the current index has run past the end of the host's list, and
// then hands out the freshly tested proxy.
func TestProxy_GetOne_TriggersTestAndSort(t *testing.T) {
	restore := setupProxyDir(t)
	defer restore()
	_ = config.Conf()

	const want = "http://127.0.0.1:8080"
	px := newTestProxy()
	px.SetSurfForTest(&mockSurfer{resp: &http.Response{StatusCode: http.StatusOK}})
	px.online = 1
	px.ticker = time.NewTicker(time.Hour)
	px.all = map[string]bool{"http://127.0.0.1:8080": true}
	px.allIps = map[string]string{"http://127.0.0.1:8080": "127.0.0.1"}
	px.usable = map[string]*ProxyForHost{
		"example.com": {
			proxys:    []string{"old"},
			timedelay: []time.Duration{time.Millisecond},
			curIndex:  1, // already past the single stale entry
			isEcho:    false,
		},
	}

	res := px.GetOne("http://www.example.com/path")
	if !res.IsSome() {
		t.Fatal("GetOne want Some")
	}
	if got := res.Unwrap(); got != want {
		t.Errorf("GetOne = %v, want http://127.0.0.1:8080", got)
	}
}
// TestProxy_Update_FileNotFound verifies that Update reports an error when it
// runs in a directory that contains no proxy file.
func TestProxy_Update_FileNotFound(t *testing.T) {
	tmp := t.TempDir()
	orig, err := os.Getwd()
	if err != nil {
		t.Fatalf("Getwd: %v", err)
	}
	// If the directory switch fails the test would run against the wrong
	// working directory, so stop right away.
	if err := os.Chdir(tmp); err != nil {
		t.Fatalf("Chdir: %v", err)
	}
	defer func() {
		if err := os.Chdir(orig); err != nil {
			t.Errorf("Chdir back to %q: %v", orig, err)
		}
	}()
	_ = config.Conf()
	p := newTestProxy()
	r := p.Update()
	if r.IsOk() {
		t.Error("Update with missing file want Err")
	}
}
================================================
FILE: app/app.go
================================================
// app interface for graphical user interface.
// Basic execution order: New() --> [SetLog(io.Writer) -->] Init() --> SpiderPrepare() --> Run()
// Package app provides the main entry and task scheduling for the crawler application.
package app
import (
"io"
"reflect"
"runtime/debug"
"strconv"
"sync"
"time"
"unicode"
"unicode/utf8"
"github.com/andeya/gust/option"
"github.com/andeya/pholcus/app/crawler"
"github.com/andeya/pholcus/app/distribute"
"github.com/andeya/pholcus/app/distribute/teleport"
"github.com/andeya/pholcus/app/downloader"
"github.com/andeya/pholcus/app/pipeline"
"github.com/andeya/pholcus/app/scheduler"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/cache"
"github.com/andeya/pholcus/runtime/status"
)
type (
	// App is the control surface of the crawler application: logging,
	// initialization, configuration, task preparation, run-state control and
	// access to the spider/output/task registries. It also embeds
	// distribute.Distributor.
	App interface {
		SetLog(io.Writer) App                                         // Set global log output to terminal
		LogGoOn() App                                                 // Resume log output
		LogRest() App                                                 // Pause log output
		Init(mode int, port int, master string, w ...io.Writer) App   // Must call Init before using App (except SetLog)
		ReInit(mode int, port int, master string, w ...io.Writer) App // Switch run mode and reset log output target
		GetAppConf(k ...string) interface{}                           // Get global config
		SetAppConf(k string, v interface{}) App                       // Set global config (not called in client mode)
		SpiderPrepare(original []*spider.Spider) App                  // Must call after setting global params and before Run() (not called in client mode)
		Run()                                                         // Block until task completes (call after all config is done)
		Stop()                                                        // Terminate task mid-run in Offline mode (blocks until current task stops)
		IsRunning() bool                                              // Check if task is running
		IsPaused() bool                                               // Check if task is paused
		IsStopped() bool                                              // Check if task has stopped
		PauseRecover()                                                // Pause or resume task in Offline mode
		Status() int                                                  // Return current status
		GetSpiderLib() []*spider.Spider                               // Get all spider species
		GetSpiderByName(string) option.Option[*spider.Spider]         // Get spider by name
		GetSpiderQueue() crawler.SpiderQueue                          // Get spider queue interface
		GetOutputLib() []string                                       // Get all output methods
		GetTaskJar() *distribute.TaskJar                              // Return task jar
		distribute.Distributor                                        // Implements distributed interface
	}
	// Logic is the App implementation returned by New. It bundles the global
	// config, spider registry, spider queue, task jar, crawler pool and the
	// teleport connection together with the bookkeeping of the current run.
	Logic struct {
		*cache.AppConf        // Global config
		*spider.SpiderSpecies // All spider species
		crawler.SpiderQueue   // Spider queue for current task
		*distribute.TaskJar   // Task storage passed between server and client
		crawler.CrawlerPool   // Crawler pool
		teleport.Teleport     // Socket duplex communication, JSON transport
		sum                   [2]uint64     // Execution count: [0] data items, [1] files
		takeTime              time.Duration // Execution duration
		status                int           // Run status
		finish                chan bool     // Closed when the current run is finished
		finishOnce            sync.Once     // Guards the single close of finish
		canSocketLog          bool          // Keeps the socketLog forwarding loop running while true
		sync.RWMutex                        // Used by Status/setStatus to guard status
	}
)
// LogicApp is the global singleton core interface instance, created via New
// when the package is initialized.
var LogicApp = New()
// New creates a fresh App backed by a *Logic in the STOPPED state.
func New() App {
	return newLogic()
}
// newLogic builds a Logic that shares the global task config (cache.Task) and
// spider registry (spider.Species) and owns a new teleport, task jar, spider
// queue and crawler pool. Its initial status is STOPPED.
func newLogic() *Logic {
	return &Logic{
		AppConf:       cache.Task,
		SpiderSpecies: spider.Species,
		status:        status.STOPPED,
		Teleport:      teleport.New(),
		TaskJar:       distribute.NewTaskJar(),
		SpiderQueue:   crawler.NewSpiderQueue(),
		CrawlerPool:   crawler.NewCrawlerPool(downloader.SurferDownloader),
	}
}
// SetLog redirects the global logger's output to w and returns l for
// chaining.
func (l *Logic) SetLog(w io.Writer) App {
	logs.Log().SetOutput(w)
	return l
}
// LogRest pauses the global logger's output and returns l for chaining.
func (l *Logic) LogRest() App {
	logs.Log().PauseOutput()
	return l
}
// LogGoOn resumes the global logger's output and returns l for chaining.
func (l *Logic) LogGoOn() App {
	logs.Log().GoOn()
	return l
}
// GetAppConf returns the whole global config when called without a key, or the
// value of the config field named by the first key otherwise. The key's first
// letter is upper-cased before the field lookup.
//
// A panic raised by the reflective lookup (for example for an unknown field
// name) is recovered and logged; nil is returned in that case.
func (l *Logic) GetAppConf(k ...string) interface{} {
	defer func() {
		if r := recover(); r != nil {
			logs.Log().Error("panic recovered: %v\n%s", r, debug.Stack())
		}
	}()
	if len(k) > 0 {
		conf := reflect.ValueOf(l.AppConf).Elem()
		return conf.FieldByName(titleCase(k[0])).Interface()
	}
	return l.AppConf
}
// SetAppConf sets the global config field named k to v and returns l for
// chaining. The key's first letter is upper-cased before the field lookup, so
// "limit" and "Limit" address the same field.
//
// Two values are clamped before being stored: a non-positive Limit becomes
// spider.LIMIT and a BatchCap below 1 becomes 1. Unknown or non-settable
// fields are ignored. A panic raised while storing (for example a value of the
// wrong type) is recovered and logged, leaving the config unchanged.
func (l *Logic) SetAppConf(k string, v interface{}) App {
	defer func() {
		if err := recover(); err != nil {
			logs.Log().Error("panic recovered: %v\n%s", err, debug.Stack())
		}
	}()
	// Normalize the key first so the clamping below also applies to
	// lower-case spellings of the field name.
	key := titleCase(k)
	switch key {
	case "Limit":
		if n, ok := v.(int64); ok && n <= 0 {
			v = int64(spider.LIMIT)
		}
	case "BatchCap":
		if n, ok := v.(int); ok && n < 1 {
			v = int(1)
		}
	}
	field := reflect.ValueOf(l.AppConf).Elem().FieldByName(key)
	if field.CanSet() {
		// Set panics on a type mismatch; the deferred recover reports it.
		field.Set(reflect.ValueOf(v))
	}
	return l
}
// Init prepares the app for the given run mode; it must be called before the
// app is used (SetLog excepted).
//
// It rebinds the global config, optionally redirects the log to w[0], stores
// mode/port/master, and replaces the teleport, task jar, spider queue and
// crawler pool with new instances. In SERVER mode it starts the teleport
// server when a port is configured; in CLIENT mode it connects to master and
// starts forwarding logs when master and port are configured. It returns l.
func (l *Logic) Init(mode int, port int, master string, w ...io.Writer) App {
	l.AppConf = cache.Task
	l.canSocketLog = false
	if len(w) > 0 {
		l.SetLog(w[0])
	}
	l.LogGoOn()

	conf := l.AppConf
	conf.Mode, conf.Port, conf.Master = mode, port, master

	l.Teleport = teleport.New()
	l.TaskJar = distribute.NewTaskJar()
	l.SpiderQueue = crawler.NewSpiderQueue()
	l.CrawlerPool = crawler.NewCrawlerPool(downloader.SurferDownloader)

	switch conf.Mode {
	case status.SERVER:
		logs.Log().EnableStealOne(false)
		if l.checkPort() {
			logs.Log().Informational(" !! Current run mode: [ SERVER ] !!")
			l.Teleport.SetAPI(distribute.MasterAPI(l)).Server(":" + strconv.Itoa(conf.Port))
		}
	case status.CLIENT:
		if l.checkAll() {
			logs.Log().Informational(" !! Current run mode: [ CLIENT ] !!")
			l.Teleport.SetAPI(distribute.SlaveAPI(l)).Client(conf.Master, ":"+strconv.Itoa(conf.Port))
			// Enable inter-node log forwarding
			l.canSocketLog = true
			logs.Log().EnableStealOne(true)
			go l.socketLog()
		}
	case status.OFFLINE:
		logs.Log().EnableStealOne(false)
		logs.Log().Informational(" !! Current run mode: [ OFFLINE ] !!")
	default:
		logs.Log().Warning(" * -- Please specify a valid run mode! --")
	}
	return l
}
// ReInit shuts the current instance down and returns a NEW App for the given
// mode; use it when changing the run mode.
//
// A running task is stopped, log output is paused and the teleport is closed.
// For status.UNSET the returned instance is only created and marked UNSET;
// otherwise it is initialized via Init. The receiver itself is not
// re-initialized, so callers must continue with the returned value.
func (l *Logic) ReInit(mode int, port int, master string, w ...io.Writer) App {
	if !l.IsStopped() {
		l.Stop()
	}
	l.LogRest()
	if l.Teleport != nil {
		l.Teleport.Close()
	}

	fresh := newLogic()
	if mode == status.UNSET {
		// Shutdown only: hand back an uninitialized instance.
		fresh.AppConf.Mode = status.UNSET
		return fresh
	}
	// Restart in the requested mode.
	return fresh.Init(mode, port, master, w...)
}
// SpiderPrepare fills the spider queue for the next run; call it after the
// global parameters are set and right before Run(). It is not called in
// client mode.
//
// The queue is reset, then a copy of every spider in original is added with
// the global pause time. Spiders whose limit equals spider.LIMIT receive the
// global limit, all others its negation. Finally the global keyins are applied
// to the queue. It returns l.
func (l *Logic) SpiderPrepare(original []*spider.Spider) App {
	queue := l.SpiderQueue
	queue.Reset()
	for idx := range original {
		clone := original[idx].Copy()
		clone.SetPausetime(l.AppConf.Pausetime)

		limit := l.AppConf.Limit
		if clone.GetLimit() != spider.LIMIT {
			limit = -1 * limit
		}
		clone.SetLimit(limit)

		queue.Add(clone)
	}
	queue.AddKeyins(l.AppConf.Keyins)
	return l
}
// GetOutputLib returns the output methods reported by pipeline.GetOutputLib.
func (l *Logic) GetOutputLib() []string {
	return pipeline.GetOutputLib()
}
// GetSpiderLib returns all spiders held by the embedded spider registry.
func (l *Logic) GetSpiderLib() []*spider.Spider {
	return l.SpiderSpecies.Get()
}
// GetSpiderByName looks the spider called name up in the embedded spider
// registry and returns the registry's Option result.
func (l *Logic) GetSpiderByName(name string) option.Option[*spider.Spider] {
	return l.SpiderSpecies.GetByNameOpt(name)
}
// GetMode returns the run mode stored in the global config.
func (l *Logic) GetMode() int {
	return l.AppConf.Mode
}
// GetTaskJar returns the task jar owned by this instance.
func (l *Logic) GetTaskJar() *distribute.TaskJar {
	return l.TaskJar
}
// CountNodes returns the node count reported by the teleport connection
// (used in server/client mode).
func (l *Logic) CountNodes() int {
	return l.Teleport.CountNodes()
}
// GetSpiderQueue returns the spider queue of the current task.
func (l *Logic) GetSpiderQueue() crawler.SpiderQueue {
	return l.SpiderQueue
}
// Run executes the prepared task and blocks until it has finished.
//
// Outside client mode an empty spider queue is rejected with a warning. For a
// valid run the counters are reset, the status is RUN for the duration of the
// call and STOPPED afterwards. The work is dispatched by run mode, and Run
// then waits for the finish channel to be closed. An unknown mode returns
// immediately.
func (l *Logic) Run() {
	l.LogGoOn()
	mode := l.AppConf.Mode
	if mode != status.CLIENT && l.SpiderQueue.Len() == 0 {
		logs.Log().Warning(" * -- Task list cannot be empty --")
		l.LogRest()
		return
	}

	// Fresh per-run state.
	l.finish = make(chan bool)
	l.finishOnce = sync.Once{}
	l.sum = [2]uint64{}
	l.takeTime = 0

	l.setStatus(status.RUN)
	defer l.setStatus(status.STOPPED)

	switch mode {
	case status.OFFLINE:
		l.offline()
	case status.SERVER:
		l.server()
	case status.CLIENT:
		l.client()
	default:
		return
	}
	<-l.finish
}
// PauseRecover toggles the task between RUN and PAUSE (Offline mode) and then
// toggles the scheduler as well. In any other status only the scheduler call
// is made.
func (l *Logic) PauseRecover() {
	if current := l.Status(); current == status.PAUSE {
		l.setStatus(status.RUN)
	} else if current == status.RUN {
		l.setStatus(status.PAUSE)
	}
	scheduler.PauseRecover()
}
// Stop terminates the task mid-run in Offline mode and blocks until the
// status has become STOPPED.
//
// It returns immediately when the task is already stopped. When a stop is
// already in progress it only waits. The status is read through Status so the
// reads are synchronized with setStatus.
func (l *Logic) Stop() {
	switch l.Status() {
	case status.STOPPED:
		return
	case status.STOP:
		// A stop is already under way; just wait for it below.
	default:
		// Stop order must not be reversed
		l.setStatus(status.STOP)
		scheduler.Stop()
		l.CrawlerPool.Stop()
	}
	for l.Status() != status.STOPPED {
		time.Sleep(time.Second)
	}
}
// IsRunning reports whether the task is running. The status is read through
// Status so the read is synchronized with setStatus.
func (l *Logic) IsRunning() bool {
	return l.Status() == status.RUN
}
// IsPaused reports whether the task is paused. The status is read through
// Status so the read is synchronized with setStatus.
func (l *Logic) IsPaused() bool {
	return l.Status() == status.PAUSE
}
// IsStopped reports whether the task has stopped. The status is read through
// Status so the read is synchronized with setStatus.
func (l *Logic) IsStopped() bool {
	return l.Status() == status.STOPPED
}
// Status returns the current run status, read under the read lock.
func (l *Logic) Status() int {
	l.RWMutex.RLock()
	defer l.RWMutex.RUnlock()
	return l.status
}
// setStatus stores the new run status under the write lock.
// Note that the parameter shadows the imported status package inside this
// method.
func (l *Logic) setStatus(status int) {
	l.RWMutex.Lock()
	defer l.RWMutex.Unlock()
	l.status = status
}
// --- Private methods ---
// offline runs the task in offline mode by delegating to exec.
func (l *Logic) offline() {
	l.exec()
}
// server runs in server mode: it turns the prepared spider queue into tasks in
// the task jar and logs a summary. It must be called after SpiderPrepare();
// all generated tasks share the same global config.
//
// The finish channel is closed on return, so Run unblocks right after the
// tasks have been queued. Nothing is logged when no task was generated.
func (l *Logic) server() {
	defer l.finishOnce.Do(func() { close(l.finish) })

	tasks, spiders := l.addNewTask()
	if tasks == 0 {
		return
	}

	logs.Log().Informational(" * ")
	logs.Log().Informational(` *********************************************************************************************************************************** `)
	logs.Log().Informational(" * ")
	logs.Log().Informational(" * -- Successfully added %v tasks, %v spider rules in total --", tasks, spiders)
	logs.Log().Informational(" * ")
	logs.Log().Informational(` *********************************************************************************************************************************** `)
}
// addNewTask splits the spider queue into tasks and pushes them into the task
// jar (server mode). It returns the number of tasks pushed and the number of
// spiders distributed over them.
//
// All tasks carry the current global config. When the queue holds more than
// ten spiders a task is cut whenever the spider index is a positive multiple
// of ten; the remaining spiders form the last task.
func (l *Logic) addNewTask() (tasksNum, spidersNum int) {
	var (
		total = l.SpiderQueue.Len()
		task  distribute.Task
	)
	l.setTask(&task)

	// push stores a copy of the task built so far in the jar.
	push := func() {
		snapshot := task
		l.TaskJar.Push(&snapshot)
		tasksNum++
	}

	for idx, sp := range l.SpiderQueue.GetAll() {
		entry := map[string]string{"name": sp.GetName(), "keyin": sp.GetKeyin()}
		task.Spiders = append(task.Spiders, entry)
		spidersNum++
		if idx > 0 && idx%10 == 0 && total > 10 {
			push()
			// Start the next batch with its own slice.
			task.Spiders = []map[string]string{}
		}
	}
	if len(task.Spiders) != 0 {
		push()
	}
	return tasksNum, spidersNum
}
// client runs in client mode: it keeps fetching tasks and executing them until
// the status becomes STOP or STOPPED. The finish channel is closed on return.
func (l *Logic) client() {
	defer l.finishOnce.Do(func() { close(l.finish) })

	for {
		task := l.downTask()
		switch l.Status() {
		case status.STOP, status.STOPPED:
			return
		}
		l.taskToRun(task)

		// Reset the per-task counters before executing.
		l.sum = [2]uint64{}
		l.takeTime = 0
		l.exec()
	}
}
// downTask blocks until a task can be pulled from the task jar (client mode).
// It returns nil when the status becomes STOP or STOPPED while waiting.
func (l *Logic) downTask() *distribute.Task {
	for {
		if l.Status() == status.STOP || l.Status() == status.STOPPED {
			return nil
		}
		// No connected node and nothing queued locally: poll again in a second.
		if l.CountNodes() == 0 && l.TaskJar.Len() == 0 {
			time.Sleep(time.Second)
			continue
		}
		if l.TaskJar.Len() == 0 {
			// Send a "task" request, then wait for the jar to fill up.
			l.Request(nil, "task", "")
			for l.TaskJar.Len() == 0 {
				// Stop waiting once no node is connected any more.
				if l.CountNodes() == 0 {
					break
				}
				time.Sleep(time.Second)
			}
			// Still empty (nodes went away): start over from the top.
			if l.TaskJar.Len() == 0 {
				continue
			}
		}
		return l.TaskJar.Pull()
	}
}
// taskToRun prepares the run conditions from a received task (client mode).
//
// The spider queue is reset and the task's settings are copied into the global
// config. Every spider named in the task that exists in the local registry is
// copied with the task's pause time, a limit derived from the task's limit
// (negated when the spider's own limit is not positive) and, if present, the
// task's keyin, and is then queued. Unknown spider names are skipped.
func (l *Logic) taskToRun(t *distribute.Task) {
	l.SpiderQueue.Reset()
	l.setAppConf(t)
	for _, entry := range t.Spiders {
		found := l.SpiderSpecies.GetByNameOpt(entry["name"])
		if !found.IsNone() {
			clone := found.Unwrap().Copy()
			clone.SetPausetime(t.Pausetime)

			limit := t.Limit
			if clone.GetLimit() <= 0 {
				limit = -1 * limit
			}
			clone.SetLimit(limit)

			if keyin, has := entry["keyin"]; has {
				clone.SetKeyin(keyin)
			}
			l.SpiderQueue.Add(clone)
		}
	}
}
// exec starts executing the queued spiders.
//
// It resets the page counters, refreshes the output, initializes the scheduler
// and crawler pool from the global config, logs the run parameters and records
// the start time. In OFFLINE mode the run loop is started in its own
// goroutine; in every other mode it runs synchronously.
func (l *Logic) exec() {
	total := l.SpiderQueue.Len()
	conf := l.AppConf

	cache.ResetPageCount()
	pipeline.RefreshOutput()
	scheduler.Init(conf.ThreadNum, conf.ProxyMinute)
	l.CrawlerPool.SetPipelineConfig(conf.OutType, conf.BatchCap)
	poolCap := l.CrawlerPool.Reset(total)

	logs.Log().Informational(" * Total tasks (tasks * custom configs): %v\n", total)
	logs.Log().Informational(" * Crawler pool capacity: %v\n", poolCap)
	logs.Log().Informational(" * Max concurrent goroutines: %v\n", conf.ThreadNum)
	logs.Log().Informational(" * Default random pause: %v~%v ms\n", conf.Pausetime/2, conf.Pausetime*2)
	logs.Log().App(" * -- Starting crawl, please wait --")
	logs.Log().Informational(` *********************************************************************************************************************************** `)

	cache.StartTime = time.Now()

	if conf.Mode != status.OFFLINE {
		l.goRun(total)
		return
	}
	go l.goRun(total)
}
// goRun runs up to count spiders from the queue on crawlers taken from the
// pool, collects one report per started iteration from cache.ReportChan, and
// logs per-task subtotals followed by the overall totals.
//
// In OFFLINE mode it finally pauses log output and closes the finish channel.
func (l *Logic) goRun(count int) {
	// i is declared outside the loop because the collection loop below needs
	// the number of iterations that were actually performed.
	var i int
	for i = 0; i < count && l.Status() != status.STOP; i++ {
		// Hold back new crawlers while the task is paused.
		for l.IsPaused() {
			time.Sleep(time.Second)
		}
		// NOTE(review): when UseOpt yields None no crawler is started for this
		// index, yet the loop below still waits for i reports — confirm this
		// cannot stall.
		if opt := l.CrawlerPool.UseOpt(); opt.IsSome() {
			c := opt.Unwrap()
			go func(i int, c crawler.Crawler) {
				c.Init(l.SpiderQueue.GetByIndex(i)).Run()
				// Return the crawler to the pool unless a stop was requested.
				l.RWMutex.RLock()
				if l.status != status.STOP {
					l.CrawlerPool.Free(c)
				}
				l.RWMutex.RUnlock()
			}(i, c)
		}
	}
	// Collect one report per iteration and accumulate the totals.
	for ii := 0; ii < i; ii++ {
		s := <-cache.ReportChan
		if (s.DataNum == 0) && (s.FileNum == 0) {
			logs.Log().App(" * [Task subtotal: %s | KEYIN: %s] No results, duration %v\n", s.SpiderName, s.Keyin, s.Time)
			continue
		}
		logs.Log().Informational(" * ")
		switch {
		case s.DataNum > 0 && s.FileNum == 0:
			logs.Log().App(" * [Task subtotal: %s | KEYIN: %s] Collected %v data items, duration %v\n",
				s.SpiderName, s.Keyin, s.DataNum, s.Time)
		case s.DataNum == 0 && s.FileNum > 0:
			logs.Log().App(" * [Task subtotal: %s | KEYIN: %s] Downloaded %v files, duration %v\n",
				s.SpiderName, s.Keyin, s.FileNum, s.Time)
		default:
			logs.Log().App(" * [Task subtotal: %s | KEYIN: %s] Collected %v data items + %v files, duration %v\n",
				s.SpiderName, s.Keyin, s.DataNum, s.FileNum, s.Time)
		}
		l.sum[0] += s.DataNum
		l.sum[1] += s.FileNum
	}
	l.takeTime = time.Since(cache.StartTime)
	// The summary prefix tells a cancelled run apart from a completed one.
	var prefix = func() string {
		if l.Status() == status.STOP {
			return "Task cancelled: "
		}
		return "This run: "
	}()
	logs.Log().Informational(" * ")
	logs.Log().Informational(` *********************************************************************************************************************************** `)
	logs.Log().Informational(" * ")
	switch {
	case l.sum[0] > 0 && l.sum[1] == 0:
		logs.Log().App(" * -- %sTotal collected [%v data items], crawled [success %v URL + fail %v URL = total %v URL], duration [%v] --",
			prefix, l.sum[0], cache.GetPageCount(1), cache.GetPageCount(-1), cache.GetPageCount(0), l.takeTime)
	case l.sum[0] == 0 && l.sum[1] > 0:
		logs.Log().App(" * -- %sTotal collected [%v files], crawled [success %v URL + fail %v URL = total %v URL], duration [%v] --",
			prefix, l.sum[1], cache.GetPageCount(1), cache.GetPageCount(-1), cache.GetPageCount(0), l.takeTime)
	case l.sum[0] == 0 && l.sum[1] == 0:
		logs.Log().App(" * -- %sNo results, crawled [success %v URL + fail %v URL = total %v URL], duration [%v] --",
			prefix, cache.GetPageCount(1), cache.GetPageCount(-1), cache.GetPageCount(0), l.takeTime)
	default:
		logs.Log().App(" * -- %sTotal collected [%v data items + %v files], crawled [success %v URL + fail %v URL = total %v URL], duration [%v] --",
			prefix, l.sum[0], l.sum[1], cache.GetPageCount(1), cache.GetPageCount(-1), cache.GetPageCount(0), l.takeTime)
	}
	logs.Log().Informational(" * ")
	logs.Log().Informational(` *********************************************************************************************************************************** `)
	// Only the offline run loop signals completion itself; server() and
	// client() close the finish channel in their own deferred calls.
	if l.AppConf.Mode == status.OFFLINE {
		l.LogRest()
		l.finishOnce.Do(func() { close(l.finish) })
	}
}
// socketLog forwards client logs to the server: while canSocketLog is set it
// takes one message at a time from the logger and sends it as a "log" request
// over the teleport connection.
//
// The loop ends when StealOne reports that no message could be taken.
// Messages taken while no node is connected are not sent.
func (l *Logic) socketLog() {
	for l.canSocketLog {
		_, msg, ok := logs.Log().StealOne()
		if !ok {
			return
		}
		if l.Teleport.CountNodes() == 0 {
			continue
		}
		l.Teleport.Request(msg, "log", "")
	}
}
// checkPort reports whether a distributed port is configured, logging a
// warning when it is not.
func (l *Logic) checkPort() bool {
	if l.AppConf.Port != 0 {
		return true
	}
	logs.Log().Warning(" * -- Distributed port cannot be empty --")
	return false
}
// checkAll reports whether both the master address and the distributed port
// are configured. Each missing setting is reported with its own warning: the
// address warning is logged here, the port warning by checkPort.
func (l *Logic) checkAll() bool {
	if l.AppConf.Master == "" {
		logs.Log().Warning(" * -- Server address cannot be empty --")
		return false
	}
	// A missing port must not be reported as a missing server address.
	return l.checkPort()
}
// setAppConf copies the settings carried by task into the global runtime
// config. It is the inverse of setTask.
func (l *Logic) setAppConf(task *distribute.Task) {
	l.AppConf.ThreadNum = task.ThreadNum
	l.AppConf.Pausetime = task.Pausetime
	l.AppConf.OutType = task.OutType
	l.AppConf.BatchCap = task.BatchCap
	l.AppConf.SuccessInherit = task.SuccessInherit
	l.AppConf.FailureInherit = task.FailureInherit
	l.AppConf.Limit = task.Limit
	l.AppConf.ProxyMinute = task.ProxyMinute
	l.AppConf.Keyins = task.Keyins
}
// setTask copies the current global runtime config into task. It is the
// inverse of setAppConf.
func (l *Logic) setTask(task *distribute.Task) {
	task.ThreadNum = l.AppConf.ThreadNum
	task.Pausetime = l.AppConf.Pausetime
	task.OutType = l.AppConf.OutType
	task.BatchCap = l.AppConf.BatchCap
	task.SuccessInherit = l.AppConf.SuccessInherit
	task.FailureInherit = l.AppConf.FailureInherit
	task.Limit = l.AppConf.Limit
	task.ProxyMinute = l.AppConf.ProxyMinute
	task.Keyins = l.AppConf.Keyins
}
// titleCase returns s with its first rune converted to upper case; the rest of
// the string is left untouched. The empty string is returned as is.
func titleCase(s string) string {
	if len(s) == 0 {
		return s
	}
	first, width := utf8.DecodeRuneInString(s)
	head := string(unicode.ToUpper(first))
	return head + s[width:]
}
================================================
FILE: app/app_test.go
================================================
package app
import (
"bytes"
"testing"
"time"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/runtime/status"
)
// TestNew verifies that New yields a non-nil App.
func TestNew(t *testing.T) {
	if got := New(); got == nil {
		t.Fatal("New returned nil")
	}
}
// TestLogic_SetLog_LogGoOn_LogRest exercises the log control methods; it only
// checks that they can be called in sequence.
func TestLogic_SetLog_LogGoOn_LogRest(t *testing.T) {
	var sink bytes.Buffer
	app := New()
	app.SetLog(&sink)
	app.LogGoOn()
	app.LogRest()
}
// TestLogic_GetAppConf calls GetAppConf without a key and with several known
// field names; it only checks that the calls return.
func TestLogic_GetAppConf(t *testing.T) {
	app := New().(*Logic)
	keySets := [][]string{
		nil,
		{"Mode"},
		{"ThreadNum"},
		{"Limit"},
	}
	for i := range keySets {
		_ = app.GetAppConf(keySets[i]...)
	}
}
// TestLogic_SetAppConf calls SetAppConf with regular and boundary values; it
// only checks that the calls return.
func TestLogic_SetAppConf(t *testing.T) {
	type setting struct {
		key   string
		value interface{}
	}
	app := New().(*Logic)
	for _, s := range []setting{
		{"Limit", int64(100)},
		{"Limit", int64(0)},
		{"BatchCap", 50},
		{"BatchCap", 0},
		{"ThreadNum", 10},
	} {
		app.SetAppConf(s.key, s.value)
	}
}
// TestLogic_GetSpiderLib verifies that the spider library is not nil.
func TestLogic_GetSpiderLib(t *testing.T) {
	if lib := New().GetSpiderLib(); lib == nil {
		t.Error("GetSpiderLib returned nil")
	}
}
// TestLogic_GetSpiderByName verifies that an unknown spider name yields None.
func TestLogic_GetSpiderByName(t *testing.T) {
	found := New().GetSpiderByName("nonexistent")
	if found.IsSome() {
		t.Error("GetSpiderByName(nonexistent) should return None")
	}
}
// TestLogic_GetSpiderQueue verifies that a new app has a non-nil, empty spider
// queue.
func TestLogic_GetSpiderQueue(t *testing.T) {
	queue := New().GetSpiderQueue()
	if queue == nil {
		t.Fatal("GetSpiderQueue returned nil")
	}
	if n := queue.Len(); n != 0 {
		t.Errorf("new queue Len() = %d, want 0", n)
	}
}
// TestLogic_GetOutputLib verifies that at least one output method is listed.
func TestLogic_GetOutputLib(t *testing.T) {
	if outputs := New().GetOutputLib(); len(outputs) == 0 {
		t.Error("GetOutputLib returned empty")
	}
}
// TestLogic_GetTaskJar verifies that a new app exposes a non-nil task jar.
func TestLogic_GetTaskJar(t *testing.T) {
	if jar := New().GetTaskJar(); jar == nil {
		t.Fatal("GetTaskJar returned nil")
	}
}
// TestLogic_Status_IsRunning_IsPaused_IsStopped verifies that a new app
// reports the STOPPED state through all status accessors.
func TestLogic_Status_IsRunning_IsPaused_IsStopped(t *testing.T) {
	app := New().(*Logic)
	if got := app.Status(); got != status.STOPPED {
		t.Errorf("Status() = %d, want STOPPED", got)
	}
	if running := app.IsRunning(); running {
		t.Error("IsRunning() = true, want false")
	}
	if paused := app.IsPaused(); paused {
		t.Error("IsPaused() = true, want false")
	}
	if stopped := app.IsStopped(); !stopped {
		t.Error("IsStopped() = false, want true")
	}
}
// TestLogic_Init_Offline verifies that Init returns an App in offline mode.
func TestLogic_Init_Offline(t *testing.T) {
	if got := New().Init(status.OFFLINE, 2015, "", nil); got == nil {
		t.Fatal("Init returned nil")
	}
}
// TestLogic_Init_Server_invalidPort verifies that Init still returns an App in
// server mode when the port is zero.
func TestLogic_Init_Server_invalidPort(t *testing.T) {
	if got := New().Init(status.SERVER, 0, "", nil); got == nil {
		t.Fatal("Init returned nil")
	}
}
// TestLogic_Init_Server_validPort verifies that Init returns an App in server
// mode with a non-zero port.
func TestLogic_Init_Server_validPort(t *testing.T) {
	if got := New().Init(status.SERVER, 2016, "", nil); got == nil {
		t.Fatal("Init returned nil")
	}
}
// TestLogic_Init_Client_invalidMaster verifies that Init still returns an App
// in client mode when no master address is given.
func TestLogic_Init_Client_invalidMaster(t *testing.T) {
	if got := New().Init(status.CLIENT, 2015, "", nil); got == nil {
		t.Fatal("Init returned nil")
	}
}
// TestLogic_Init_invalidMode verifies that Init still returns an App for an
// unknown run mode.
func TestLogic_Init_invalidMode(t *testing.T) {
	if got := New().Init(999, 2015, "", nil); got == nil {
		t.Fatal("Init returned nil")
	}
}
// TestLogic_GetMode verifies that GetMode reflects the mode passed to Init.
func TestLogic_GetMode(t *testing.T) {
	app := New().(*Logic)
	app.Init(status.OFFLINE, 2015, "", nil)
	if mode := app.GetMode(); mode != status.OFFLINE {
		t.Errorf("GetMode() = %d, want OFFLINE", mode)
	}
}
// TestLogic_ReInit verifies that ReInit with the UNSET mode returns an App.
func TestLogic_ReInit(t *testing.T) {
	app := New().(*Logic)
	app.Init(status.OFFLINE, 2015, "", nil)
	if got := app.ReInit(status.UNSET, 0, ""); got == nil {
		t.Fatal("ReInit returned nil")
	}
}
// TestLogic_GetAppConf_titleCase verifies that a lower-case key is resolved to
// the matching config field.
func TestLogic_GetAppConf_titleCase(t *testing.T) {
	app := New().(*Logic)
	app.SetAppConf("limit", int64(50))
	if got := app.GetAppConf("limit"); got == nil {
		t.Fatal("GetAppConf(limit) returned nil")
	}
}
// TestLogic_SpiderPrepare verifies that SpiderPrepare queues the given spider.
func TestLogic_SpiderPrepare(t *testing.T) {
	app := New().(*Logic)
	app.Init(status.OFFLINE, 2015, "", nil)

	testSpider := &spider.Spider{
		Name:      "TestSpider",
		RuleTree:  &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:     spider.LIMIT,
		Pausetime: 100,
	}
	testSpider.Register()

	if got := app.SpiderPrepare([]*spider.Spider{testSpider}); got == nil {
		t.Fatal("SpiderPrepare returned nil")
	}
	if n := app.GetSpiderQueue().Len(); n < 1 {
		t.Errorf("SpiderPrepare Len() = %d, want >= 1", n)
	}
}
// TestLogic_Run_emptyQueue checks that Run returns when the spider queue is
// empty in offline mode (Run rejects an empty queue outside client mode).
func TestLogic_Run_emptyQueue(t *testing.T) {
	a := New().(*Logic)
	a.Init(status.OFFLINE, 2015, "", nil)
	a.Run()
}
// TestLogic_Stop_whenStopped checks that Stop returns on an app that is
// already in the STOPPED state.
func TestLogic_Stop_whenStopped(t *testing.T) {
	a := New().(*Logic)
	a.Stop()
}
// TestLogic_PauseRecover checks that PauseRecover can be called on an
// initialized app that is not running.
func TestLogic_PauseRecover(t *testing.T) {
	a := New().(*Logic)
	a.Init(status.OFFLINE, 2015, "", nil)
	a.PauseRecover()
}
// TestLogic_Run_offline_withSpiders runs a spider with an empty root rule in
// offline mode; a background goroutine calls Stop after three seconds.
func TestLogic_Run_offline_withSpiders(t *testing.T) {
	testSpider := &spider.Spider{
		Name: "AppTestSpider",
		RuleTree: &spider.RuleTree{
			Root:  func(_ *spider.Context) {},
			Trunk: map[string]*spider.Rule{},
		},
		Limit:     spider.LIMIT,
		Pausetime: 100,
	}
	testSpider.Register()

	app := New().(*Logic)
	app.Init(status.OFFLINE, 2015, "", nil)
	app.SpiderPrepare([]*spider.Spider{testSpider})

	go func() {
		time.Sleep(3 * time.Second)
		app.Stop()
	}()
	app.Run()
}
// TestLogic_Run_server_withSpiders runs a spider with an empty root rule in
// server mode; a background goroutine calls Stop after two seconds.
func TestLogic_Run_server_withSpiders(t *testing.T) {
	testSpider := &spider.Spider{
		Name: "AppTestSpiderServer",
		RuleTree: &spider.RuleTree{
			Root:  func(_ *spider.Context) {},
			Trunk: map[string]*spider.Rule{},
		},
		Limit:     spider.LIMIT,
		Pausetime: 100,
	}
	testSpider.Register()

	app := New().(*Logic)
	app.Init(status.SERVER, 2018, "", nil)
	app.SpiderPrepare([]*spider.Spider{testSpider})

	go func() {
		time.Sleep(2 * time.Second)
		app.Stop()
	}()
	app.Run()
}
================================================
FILE: app/crawler/crawler.go
================================================
// Package crawler provides the core crawler engine for request scheduling and page downloading.
package crawler
import (
"bytes"
"math/rand"
"runtime"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/pipeline"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/cache"
)
type (
	// Crawler is the core crawler engine: it is bound to one spider via Init,
	// executes it via Run and can be terminated via Stop.
	Crawler interface {
		Init(*spider.Spider) Crawler // Init initializes the crawler engine
		Run()                        // Run executes the task
		Stop()                       // Stop terminates the crawler
		CanStop() bool               // CanStop reports whether the crawler can be stopped
		GetID() int                  // GetID returns the engine ID
	}
	// crawler is the Crawler implementation created by New. The spider and
	// pipeline are assigned in Init; the downloader and pipeline settings are
	// fixed at construction.
	crawler struct {
		*spider.Spider        // spider rule being executed
		downloader.Downloader // shared downloader
		pipeline.Pipeline     // result collection and output pipeline
		id                    int      // engine ID
		outType               string   // output type for pipeline
		batchCap              int      // batch output capacity for pipeline
		pause                 [2]int64 // [min request interval ms, max additional interval ms]
	}
)
// New builds a crawler engine identified by id that downloads through dl.
// outType and batchCap are stored and handed to the pipeline when Init runs.
// The returned engine has no spider or pipeline yet.
func New(id int, dl downloader.Downloader, outType string, batchCap int) Crawler {
	c := new(crawler)
	c.id = id
	c.Downloader = dl
	c.outType = outType
	c.batchCap = batchCap
	return c
}
// Init binds the crawler to sp, creates its pipeline and derives the request
// pause window from sp.Pausetime: the fixed part is Pausetime/2 and the
// random part is three times that (or 1 when the fixed part is not positive,
// so the random bound is always > 0). It returns the receiver.
func (c *crawler) Init(sp *spider.Spider) Crawler {
	c.Spider = sp.ReqmatrixInit()
	c.Pipeline = pipeline.New(sp, c.outType, c.batchCap)

	base := sp.Pausetime / 2
	jitter := int64(1)
	if base > 0 {
		jitter = base * 3
	}
	c.pause[0], c.pause[1] = base, jitter
	return c
}
// Run starts the pipeline, launches the request loop in a goroutine, starts
// the spider, waits for the request loop to finish and then stops the pipeline.
func (c *crawler) Run() {
	c.Pipeline.Start()

	finished := make(chan struct{})
	go func() {
		c.run()
		close(finished)
	}()

	c.Spider.Start()
	<-finished // wait until the request loop has drained

	c.Pipeline.Stop()
}
// Stop terminates the spider and then the pipeline.
//
// A crawler built by New has neither until Init is called, and the pool's
// Stop invokes this method on every crawler it ever created, so both fields
// are checked for nil to avoid a nil-dereference panic on an engine that was
// acquired but not yet initialised.
func (c *crawler) Stop() {
	if c.Spider != nil {
		c.Spider.Stop()
	}
	if c.Pipeline != nil {
		c.Pipeline.Stop()
	}
}
// run is the request loop: it pulls requests one at a time, processes each in
// its own goroutine (holding one scheduler slot for the duration) and pauses
// between dispatches. When no request is available it polls every 20ms until
// the spider reports it can stop, then runs the spider's deferred hook.
func (c *crawler) run() {
	for {
		req := c.GetOne()
		if req != nil {
			c.UseOne()
			go func(r *request.Request) {
				defer c.FreeOne()
				logs.Log().Debug(" * Start: %v", r.GetURL())
				c.Process(r)
			}(req)
			c.sleep()
			continue
		}

		if c.Spider.CanStop() {
			break
		}
		time.Sleep(20 * time.Millisecond)
	}
	c.Spider.Defer()
}
// Process downloads req, runs the parse rule named by the request and sends
// the collected files and items to the pipeline.
//
// A download or parse error marks the request as failed in the history and is
// logged. A panic raised anywhere in the function is recovered: unless the
// spider is stopping, the request is marked as failed and a trimmed stack
// trace is logged. On success the request is recorded as successful and the
// context is handed back via spider.PutContext.
func (c *crawler) Process(req *request.Request) {
	var (
		downUrl = req.GetURL()
		sp      = c.Spider
	)
	defer func() {
		p := recover()
		if p == nil {
			return
		}
		if sp.IsStopping() {
			return
		}
		if sp.DoHistory(req, false) {
			cache.PageFailCount()
		}
		stack := make([]byte, 4<<10)
		stack = stack[:runtime.Stack(stack, true)]
		logs.Log().Error(" * Panic [process][%s]: %s\r\n[TRACE]\r\n%s", downUrl, p, trimPanicStack(stack))
	}()

	var ctx = c.Downloader.Download(sp, req) // download page
	if r := result.TryErrVoid(ctx.GetError()); r.IsErr() {
		if sp.DoHistory(req, false) {
			cache.PageFailCount()
		}
		logs.Log().Error(" * Fail [download][%v]: %v\n", downUrl, r.UnwrapErr())
		return
	}

	ctx.Parse(req.GetRuleName())
	if parseErr := ctx.GetError(); parseErr != nil {
		if sp.DoHistory(req, false) {
			cache.PageFailCount()
		}
		logs.Log().Error(" * Fail [parse][%v]: %v\n", downUrl, parseErr)
		return
	}

	// Stop forwarding as soon as the pipeline rejects an entry.
	for _, f := range ctx.PullFiles() {
		if c.Pipeline.CollectFile(f).IsErr() {
			break
		}
	}
	for _, item := range ctx.PullItems() {
		if c.Pipeline.CollectData(item).IsErr() {
			break
		}
	}

	sp.DoHistory(req, true)
	cache.PageSuccCount()
	logs.Log().Informational(" * Success: %v\n", downUrl)
	spider.PutContext(ctx)
}

// trimPanicStack reduces a runtime.Stack dump to the frames of the panicking
// goroutine that follow the runtime panic frame and converts line endings to
// CRLF. When the panic.go marker is absent (for example because the dump was
// truncated or built with trimmed paths) the dump is kept from its start
// instead of slicing with a negative index.
func trimPanicStack(stack []byte) []byte {
	if start := bytes.Index(stack, []byte("/src/runtime/panic.go")); start != -1 {
		stack = stack[start:]
		// Drop the remainder of the marker line itself.
		stack = stack[bytes.Index(stack, []byte("\n"))+1:]
	}
	if end := bytes.Index(stack, []byte("\ngoroutine ")); end != -1 {
		stack = stack[:end]
	}
	return bytes.Replace(stack, []byte("\n"), []byte("\r\n"), -1)
}
// sleep pauses for pause[0] plus a random value in [0, pause[1]) milliseconds.
// rand.Int63n panics on a non-positive bound; Init always stores a positive
// pause[1].
func (c *crawler) sleep() {
	fixed, jitter := c.pause[0], rand.Int63n(c.pause[1])
	time.Sleep(time.Duration(fixed+jitter) * time.Millisecond)
}
// GetOne asks the spider for the next pending request. The request loop in
// run treats a nil result as "nothing to do right now".
func (c *crawler) GetOne() *request.Request {
	next := c.Spider.RequestPull()
	return next
}
// UseOne reserves one resource slot through the spider before a request is
// processed; FreeOne is its counterpart.
func (c *crawler) UseOne() {
	sp := c.Spider
	sp.RequestUse()
}
// FreeOne gives back the resource slot reserved by UseOne.
func (c *crawler) FreeOne() {
	sp := c.Spider
	sp.RequestFree()
}
// SetID overwrites the engine ID assigned at construction time.
func (c *crawler) SetID(id int) {
	(*c).id = id
}
// GetID reports the engine ID given to New or last set with SetID.
func (c *crawler) GetID() int {
	id := c.id
	return id
}
================================================
FILE: app/crawler/crawler_test.go
================================================
package crawler
import (
"fmt"
"net/http"
"testing"
"time"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/scheduler"
"github.com/andeya/pholcus/app/spider"
)
// TestNew checks that New returns a non-nil crawler carrying the given ID.
func TestNew(t *testing.T) {
	c := New(1, &mockDownloader{}, "csv", 10)
	if c == nil {
		t.Fatal("New returned nil")
	}
	if id := c.GetID(); id != 1 {
		t.Errorf("GetID() = %d, want 1", id)
	}
}
// TestCrawler_GetID checks that GetID echoes the ID passed to New.
func TestCrawler_GetID(t *testing.T) {
	const want = 42
	got := New(want, &mockDownloader{}, "csv", 10).GetID()
	if got != want {
		t.Errorf("GetID() = %d, want 42", got)
	}
}
// TestCrawler_Init checks that Init returns the crawler it was called on.
func TestCrawler_Init(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &mockDownloader{}, "csv", 10)
	target := &spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	}
	if engine.Init(target) != engine {
		t.Error("Init should return self")
	}
}
// TestCrawler_Init_zeroPause exercises Init with Pausetime 0; it only has to
// complete without panicking.
func TestCrawler_Init_zeroPause(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &mockDownloader{}, "csv", 10)
	engine.Init(&spider.Spider{
		Name:      "TestSpider",
		RuleTree:  &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:     -5,
		Pausetime: 0,
	})
}
// TestCrawler_GetOne_UseOne_FreeOne checks that an initialised crawler with
// no queued requests yields nil from GetOne, and that UseOne/FreeOne can be
// called back to back.
func TestCrawler_GetOne_UseOne_FreeOne(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &mockDownloader{}, "csv", 10).(*crawler)
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	})

	if engine.GetOne() != nil {
		t.Error("GetOne on empty matrix should return nil")
	}
	engine.UseOne()
	engine.FreeOne()
}
// TestCrawler_CanStop checks that a freshly initialised crawler with nothing
// queued reports that it can stop.
func TestCrawler_CanStop(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &mockDownloader{}, "csv", 10)
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	})
	if stoppable := engine.CanStop(); !stoppable {
		t.Error("CanStop on empty matrix should be true")
	}
}
// TestCrawler_Stop checks that Stop on an initialised, idle crawler returns
// without panicking.
func TestCrawler_Stop(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &mockDownloader{}, "csv", 10)
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	})
	engine.Stop()
}
// TestCrawler_SetID checks that SetID replaces the ID reported by GetID.
func TestCrawler_SetID(t *testing.T) {
	engine := New(0, &mockDownloader{}, "csv", 10).(*crawler)
	engine.SetID(99)
	if id := engine.GetID(); id != 99 {
		t.Errorf("GetID() = %d, want 99", id)
	}
}
// errorDownloader is a test downloader whose every download fails.
type errorDownloader struct{}

// Download returns a context for (sp, req) that carries a "download failed"
// error.
func (d *errorDownloader) Download(sp *spider.Spider, req *request.Request) *spider.Context {
	failed := spider.GetContext(sp, req)
	failed.SetError(fmt.Errorf("download failed"))
	return failed
}
// TestCrawler_Process_downloadError drives Process through the download
// failure branch; it only has to return without panicking.
func TestCrawler_Process_downloadError(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &errorDownloader{}, "csv", 10).(*crawler)
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	})

	r := &request.Request{URL: "http://example.com", Rule: "r"}
	r.Prepare()
	engine.Process(r)
}
// TestCrawler_Run runs a full crawl whose root rule waits 50ms and then
// queues a single request handled by a no-op parse rule; Run must return.
func TestCrawler_Run(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &successDownloader{}, "csv", 10).(*crawler)

	root := func(ctx *spider.Context) {
		time.Sleep(50 * time.Millisecond)
		r := &request.Request{URL: "http://example.com", Rule: "r"}
		r.Prepare()
		ctx.AddQueue(r)
	}
	sp := &spider.Spider{
		Name: "CrawlerRunTestSpider",
		RuleTree: &spider.RuleTree{
			Root:  root,
			Trunk: map[string]*spider.Rule{"r": {ParseFunc: func(_ *spider.Context) {}}},
		},
		Limit: -5,
	}
	sp.Register()

	engine.Init(sp)
	engine.Run()
}
// successDownloader is a test downloader that always answers with HTTP 200.
type successDownloader struct{}

// Download returns a context for (sp, req) whose response has status 200.
func (d *successDownloader) Download(sp *spider.Spider, req *request.Request) *spider.Context {
	ok := spider.GetContext(sp, req)
	resp := &http.Response{StatusCode: 200}
	ok.SetResponse(resp)
	return ok
}
// TestCrawler_Process_success drives Process through the success path with a
// no-op parse rule; it only has to return without panicking.
func TestCrawler_Process_success(t *testing.T) {
	scheduler.Init(4, 0)
	engine := New(0, &successDownloader{}, "csv", 10).(*crawler)

	tree := &spider.RuleTree{
		Root:  func(_ *spider.Context) {},
		Trunk: map[string]*spider.Rule{"r": {ParseFunc: func(_ *spider.Context) {}}},
	}
	engine.Init(&spider.Spider{Name: "TestSpider", RuleTree: tree, Limit: -5})

	r := &request.Request{URL: "http://example.com", Rule: "r"}
	r.Prepare()
	engine.Process(r)
}
================================================
FILE: app/crawler/crawlerpool.go
================================================
package crawler
import (
"sync"
"time"
"github.com/andeya/gust/option"
"github.com/andeya/pholcus/app/downloader"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/runtime/status"
)
// CrawlerPool manages a pool of crawler engines.
type (
// CrawlerPool is the pool contract. Reset must be called before Use/UseOpt
// can hand out crawlers, because the constructor leaves the capacity at 0.
CrawlerPool interface {
// Reset sizes the pool for spiderNum spiders and returns the chosen capacity.
Reset(spiderNum int) int
// SetPipelineConfig sets the pipeline settings given to crawlers created later.
SetPipelineConfig(outType string, batchCap int)
// Use returns a crawler, or nil once the pool is stopped.
Use() Crawler
// UseOpt returns a crawler, or None once the pool is stopped.
UseOpt() option.Option[Crawler]
// Free hands a crawler back to the pool.
Free(Crawler)
// Stop stops the pool and every crawler it created.
Stop()
}
// cq is the CrawlerPool implementation; the embedded RWMutex guards its fields.
cq struct {
capacity int // maximum number of crawlers for the current run
count int // crawlers handed to the current run so far
usable chan Crawler // idle crawlers; nil until Reset and again after Stop
all []Crawler // every crawler ever created by this pool
dl downloader.Downloader // downloader passed to new crawlers
outType string // pipeline output type passed to new crawlers
batchCap int // pipeline batch capacity passed to new crawlers
status int // status.RUN or status.STOP
sync.RWMutex
}
)
// NewCrawlerPool returns a running pool whose crawlers will download through
// dl. Storage for the configured CrawlsCap crawlers is reserved up front; the
// pool has zero capacity until Reset is called.
func NewCrawlerPool(dl downloader.Downloader) CrawlerPool {
	pool := new(cq)
	pool.status = status.RUN
	pool.dl = dl
	pool.all = make([]Crawler, 0, config.Conf().CrawlsCap)
	return pool
}
// SetPipelineConfig records, under the write lock, the output type and batch
// capacity that UseOpt passes to crawlers it creates afterwards.
func (cq *cq) SetPipelineConfig(outType string, batchCap int) {
	cq.Lock()
	cq.outType, cq.batchCap = outType, batchCap
	cq.Unlock()
}
// Reset prepares the pool for a run of spiderNum spiders and returns the
// resulting capacity: spiderNum capped at the configured CrawlsCap, and never
// less than 1. Crawlers created in earlier runs are re-queued as idle, up to
// the new capacity, and the pool is marked as running again.
func (cq *cq) Reset(spiderNum int) int {
	cq.Lock()
	defer cq.Unlock()

	size := spiderNum
	if limit := config.Conf().CrawlsCap; size >= limit {
		size = limit
	}
	if size <= 0 {
		size = 1
	}

	cq.capacity = size
	cq.count = 0
	cq.usable = make(chan Crawler, size)
	for _, c := range cq.all {
		if cq.count >= cq.capacity {
			break // channel is full; the remaining crawlers stay parked in cq.all
		}
		cq.usable <- c
		cq.count++
	}
	cq.status = status.RUN
	return size
}
// Use is the nil-returning form of UseOpt: it yields a crawler, or nil once
// the pool has been stopped.
func (cq *cq) Use() Crawler {
	acquired := cq.UseOpt()
	return acquired.UnwrapOr(nil)
}
// UseOpt hands out a crawler. It prefers an idle one, creates a new one while
// fewer than capacity have been handed to the current run, and otherwise
// retries once per second. It returns None as soon as it observes the pool in
// the stopped state.
func (cq *cq) UseOpt() option.Option[Crawler] {
	// attempt makes one acquisition try under the write lock; done is false
	// only when the pool is exhausted and the caller should retry later.
	attempt := func() (opt option.Option[Crawler], done bool) {
		cq.Lock()
		defer cq.Unlock()

		if cq.status == status.STOP {
			return option.None[Crawler](), true
		}
		select {
		case idle := <-cq.usable:
			return option.Some(idle), true
		default:
		}
		if cq.count >= cq.capacity {
			return option.None[Crawler](), false
		}
		fresh := New(cq.count, cq.dl, cq.outType, cq.batchCap)
		cq.all = append(cq.all, fresh)
		cq.count++
		return option.Some(fresh), true
	}

	for {
		if opt, done := attempt(); done {
			return opt
		}
		time.Sleep(time.Second)
	}
}
// Free returns a crawler to the idle queue. Nothing happens when the crawler
// is nil, the pool is stopped, or the crawler reports it cannot stop yet.
//
// The hand-back is non-blocking. A blocking send here would wait while
// holding the read lock, so a full queue (a crawler freed twice, or one that
// Reset already re-queued) or a nil queue (Reset never called) would block
// forever and stall every caller that needs the write lock. In those cases
// the crawler is simply not queued; it remains in cq.all.
func (cq *cq) Free(crawler Crawler) {
	cq.RLock()
	defer cq.RUnlock()
	if crawler == nil || cq.status == status.STOP || !crawler.CanStop() {
		return
	}
	select {
	case cq.usable <- crawler:
	default:
	}
}
// Stop marks the pool as stopped, closes the idle queue and stops every
// crawler the pool created. Calling it again is a no-op.
//
// The idle queue is nil until Reset has run; closing a nil channel panics, so
// it is only closed when present. The crawler list is captured under the
// lock and the crawlers are stopped after releasing it.
func (cq *cq) Stop() {
	cq.Lock()
	if cq.status == status.STOP {
		cq.Unlock()
		return
	}
	cq.status = status.STOP
	if cq.usable != nil {
		close(cq.usable)
		cq.usable = nil
	}
	crawlers := cq.all
	cq.Unlock()

	for _, c := range crawlers {
		c.Stop()
	}
}
================================================
FILE: app/crawler/crawlerpool_test.go
================================================
package crawler
import (
"testing"
"github.com/andeya/pholcus/app/downloader"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/scheduler"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/config"
)
// mockDownloader is a test downloader that never produces a context.
type mockDownloader struct{}

// Download ignores its arguments and returns a nil context.
func (d *mockDownloader) Download(_ *spider.Spider, _ *request.Request) *spider.Context {
	var none *spider.Context
	return none
}
func TestNewCrawlerPool(t *testing.T) {
dl := &mockDownloader{}
pool := NewCrawlerPool(dl)
if pool == nil {
t.Fatal("NewCrawlerPool returned nil")
}
}
// TestCrawlerPool_SetPipelineConfig checks that SetPipelineConfig can be
// called on a fresh pool without panicking.
func TestCrawlerPool_SetPipelineConfig(t *testing.T) {
	NewCrawlerPool(&mockDownloader{}).SetPipelineConfig("csv", 100)
}
// TestCrawlerPool_Reset checks, for several spider counts, that Reset returns
// at least the expected minimum capacity.
func TestCrawlerPool_Reset(t *testing.T) {
	_ = config.Conf()
	pool := NewCrawlerPool(&mockDownloader{})
	pool.SetPipelineConfig("csv", 10)

	type resetCase struct {
		name       string
		spiderNum  int
		wantMinNum int
	}
	cases := []resetCase{
		{name: "one", spiderNum: 1, wantMinNum: 1},
		{name: "five", spiderNum: 5, wantMinNum: 5},
		{name: "over_cap", spiderNum: 999, wantMinNum: 1},
		{name: "zero", spiderNum: 0, wantMinNum: 1},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := pool.Reset(tc.spiderNum); got < tc.wantMinNum {
				t.Errorf("Reset(%d) = %d, want >= %d", tc.spiderNum, got, tc.wantMinNum)
			}
		})
	}
}
// TestCrawlerPool_Use_UseOpt_Free acquires a crawler with UseOpt, initialises
// and frees it, then acquires and frees one more through Use.
func TestCrawlerPool_Use_UseOpt_Free(t *testing.T) {
	scheduler.Init(4, 0)
	pool := NewCrawlerPool(downloader.SurferDownloader)
	pool.SetPipelineConfig("csv", 10)
	pool.Reset(2)

	first := pool.UseOpt()
	if !first.IsSome() {
		t.Fatal("UseOpt returned None")
	}
	engine := first.Unwrap()
	if engine == nil {
		t.Fatal("UseOpt returned nil crawler")
	}
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -10,
	})
	pool.Free(engine)

	second := pool.Use()
	if second == nil {
		t.Fatal("Use returned nil")
	}
	pool.Free(second)
}
// TestCrawlerPool_UseOpt_returnsSome checks that a reset pool yields a
// crawler with a non-negative ID.
func TestCrawlerPool_UseOpt_returnsSome(t *testing.T) {
	scheduler.Init(4, 0)
	pool := NewCrawlerPool(&mockDownloader{})
	pool.SetPipelineConfig("csv", 10)
	pool.Reset(2)

	acquired := pool.UseOpt()
	if !acquired.IsSome() {
		t.Fatal("UseOpt returned None")
	}
	if engine := acquired.Unwrap(); engine.GetID() < 0 {
		t.Errorf("GetID() = %d, want >= 0", engine.GetID())
	}
}
// TestCrawlerPool_Stop checks that a stopped pool no longer hands out
// crawlers.
func TestCrawlerPool_Stop(t *testing.T) {
	pool := NewCrawlerPool(&mockDownloader{})
	pool.SetPipelineConfig("csv", 10)
	pool.Reset(1)
	pool.Stop()

	if pool.UseOpt().IsSome() {
		t.Error("UseOpt after Stop should return None")
	}
}
// TestCrawlerPool_Reset_reuse uses and frees two crawlers, then checks that
// resetting the same pool to three spiders reports a capacity of three.
func TestCrawlerPool_Reset_reuse(t *testing.T) {
	scheduler.Init(4, 0)
	_ = config.Conf()
	pool := NewCrawlerPool(&mockDownloader{})
	pool.SetPipelineConfig("csv", 10)
	pool.Reset(2)

	first, second := pool.Use(), pool.Use()
	shared := &spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	}
	first.Init(shared)
	second.Init(shared)
	pool.Free(first)
	pool.Free(second)

	if got := pool.Reset(3); got != 3 {
		t.Errorf("Reset(3) = %d, want 3", got)
	}
}
// TestCrawlerPool_Stop_idempotent checks that Stop may be called twice in a
// row without panicking.
func TestCrawlerPool_Stop_idempotent(t *testing.T) {
	pool := NewCrawlerPool(&mockDownloader{})
	pool.Reset(1)
	for i := 0; i < 2; i++ {
		pool.Stop()
	}
}
// TestCrawlerPool_Free_whenStopped checks that freeing a crawler after the
// pool has been stopped returns without panicking or blocking.
func TestCrawlerPool_Free_whenStopped(t *testing.T) {
	scheduler.Init(4, 0)
	pool := NewCrawlerPool(downloader.SurferDownloader)
	pool.SetPipelineConfig("csv", 10)
	pool.Reset(1)

	engine := pool.Use()
	engine.Init(&spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
		Limit:    -5,
	})

	pool.Stop()
	pool.Free(engine)
}
================================================
FILE: app/crawler/spiderqueue.go
================================================
package crawler
import (
"github.com/andeya/gust/option"
spider "github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/logs"
)
// SpiderQueue holds the spider rule queue for the crawler engine.
type (
// SpiderQueue is the queue contract. The plain getters return nil where the
// Opt variants return None.
SpiderQueue interface {
Reset() // Reset clears the queue
Add(*spider.Spider)
AddAll([]*spider.Spider)
AddKeyins(string) // AddKeyins assigns Keyin to queue members that have not been assigned yet
GetByIndex(int) *spider.Spider
GetByIndexOpt(int) option.Option[*spider.Spider]
GetByName(string) *spider.Spider
GetByNameOpt(string) option.Option[*spider.Spider]
GetAll() []*spider.Spider
Len() int // Len returns the queue length
}
// sq is the slice-backed SpiderQueue implementation; it does no locking.
sq struct {
list []*spider.Spider
}
)
// NewSpiderQueue returns an empty queue backed by a non-nil, zero-length
// slice.
func NewSpiderQueue() SpiderQueue {
	q := new(sq)
	q.list = []*spider.Spider{}
	return q
}
// Reset drops every queued spider by swapping in a fresh empty slice; slices
// previously obtained from GetAll are left untouched.
func (sq *sq) Reset() {
	empty := []*spider.Spider{}
	sq.list = empty
}
// Add assigns sp the ID equal to its position (the current queue length) and
// appends it to the queue.
func (sq *sq) Add(sp *spider.Spider) {
	position := sq.Len()
	sp.SetID(position)
	sq.list = append(sq.list, sp)
}
// AddAll adds each spider of list in order, giving every one its queue
// position as ID via Add.
func (sq *sq) AddAll(list []*spider.Spider) {
	for i := range list {
		sq.Add(list[i])
	}
}
// AddKeyins expands the queue with the custom inputs parsed from keyins.
//
// Spiders whose Keyin equals spider.KEYIN accept custom input: for every
// parsed keyin, each of them gets that Keyin assigned and a copy is queued.
// Spiders with any other Keyin are queued once, unchanged, after all copies.
// The queue is left as it is when keyins parses to nothing or when no spider
// accepts custom input (a warning is logged in the latter case).
//
// The former "queue still empty after expansion" fallback was removed: both
// the keyin list and the configurable list are non-empty at that point, so
// the expansion always queues at least one spider and the branch could never
// run.
func (sq *sq) AddKeyins(keyins string) {
	keyinSlice := util.KeyinsParse(keyins)
	if len(keyinSlice) == 0 {
		return
	}

	var (
		fixed        []*spider.Spider // spiders that cannot receive custom config
		configurable []*spider.Spider // spiders that can receive custom config
	)
	for _, sp := range sq.GetAll() {
		if sp.GetKeyin() == spider.KEYIN {
			configurable = append(configurable, sp)
		} else {
			fixed = append(fixed, sp)
		}
	}
	if len(configurable) == 0 {
		logs.Log().Warning("This batch of tasks does not require custom configuration.\n")
		return
	}

	sq.Reset()
	for _, keyin := range keyinSlice {
		for _, sp := range configurable {
			// The Keyin is set on the source spider before copying, so the
			// source ends up holding the last keyin.
			sp.Keyin = keyin
			sq.Add(sp.Copy())
		}
	}
	sq.AddAll(fixed)
}
// GetByIndex is the nil-returning form of GetByIndexOpt: it yields the spider
// at idx, or nil when idx is out of range.
func (sq *sq) GetByIndex(idx int) *spider.Spider {
	found := sq.GetByIndexOpt(idx)
	return found.UnwrapOr(nil)
}
// GetByIndexOpt yields Some(spider) for 0 <= idx < Len() and None otherwise.
func (sq *sq) GetByIndexOpt(idx int) option.Option[*spider.Spider] {
	if idx < 0 || idx >= len(sq.list) {
		return option.None[*spider.Spider]()
	}
	return option.Some(sq.list[idx])
}
// GetByName is the nil-returning form of GetByNameOpt: it yields the first
// spider named n, or nil when there is none.
func (sq *sq) GetByName(n string) *spider.Spider {
	found := sq.GetByNameOpt(n)
	return found.UnwrapOr(nil)
}
// GetByNameOpt scans the queue in order and yields Some(first spider whose
// GetName equals n), or None when no spider matches.
func (sq *sq) GetByNameOpt(n string) option.Option[*spider.Spider] {
	for i := 0; i < len(sq.list); i++ {
		if candidate := sq.list[i]; candidate.GetName() == n {
			return option.Some(candidate)
		}
	}
	return option.None[*spider.Spider]()
}
// GetAll exposes the queue's backing slice itself, not a copy.
func (sq *sq) GetAll() []*spider.Spider {
	all := sq.list
	return all
}
// Len reports how many spiders are currently queued.
func (sq *sq) Len() int {
	n := len(sq.list)
	return n
}
================================================
FILE: app/crawler/spiderqueue_test.go
================================================
package crawler
import (
"testing"
spider "github.com/andeya/pholcus/app/spider"
)
// makeSpider builds a minimal spider with the given name and keyin and an
// empty rule trunk, for use in queue tests.
func makeSpider(name string, keyin string) *spider.Spider {
	tree := &spider.RuleTree{Trunk: map[string]*spider.Rule{}}
	sp := &spider.Spider{Name: name, Keyin: keyin, RuleTree: tree}
	return sp
}
// TestNewSpiderQueue checks that a new queue is non-nil and empty.
func TestNewSpiderQueue(t *testing.T) {
	queue := NewSpiderQueue()
	if queue == nil {
		t.Fatal("NewSpiderQueue returned nil")
	}
	if n := queue.Len(); n != 0 {
		t.Errorf("Len() = %d, want 0", n)
	}
}
// TestSpiderQueue_Add_Len_Reset checks that Len tracks the number of Add
// calls and that Reset empties the queue again.
func TestSpiderQueue_Add_Len_Reset(t *testing.T) {
	type addCase struct {
		name    string
		adds    []*spider.Spider
		wantLen int
	}
	cases := []addCase{
		{name: "empty", adds: nil, wantLen: 0},
		{name: "one", adds: []*spider.Spider{makeSpider("a", "")}, wantLen: 1},
		{name: "two", adds: []*spider.Spider{makeSpider("a", ""), makeSpider("b", "")}, wantLen: 2},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			queue := NewSpiderQueue()
			for i := range tc.adds {
				queue.Add(tc.adds[i])
			}
			if got := queue.Len(); got != tc.wantLen {
				t.Errorf("Len() = %d, want %d", got, tc.wantLen)
			}
			queue.Reset()
			if after := queue.Len(); after != 0 {
				t.Errorf("after Reset Len() = %d, want 0", after)
			}
		})
	}
}
// TestSpiderQueue_AddAll checks that AddAll queues every spider and keeps
// the input order.
func TestSpiderQueue_AddAll(t *testing.T) {
	input := []*spider.Spider{makeSpider("a", ""), makeSpider("b", ""), makeSpider("c", "")}

	queue := NewSpiderQueue()
	queue.AddAll(input)
	if got := queue.Len(); got != 3 {
		t.Errorf("AddAll Len() = %d, want 3", got)
	}

	queued := queue.GetAll()
	for i, want := range input {
		if queued[i].GetName() != want.GetName() {
			t.Errorf("GetAll()[%d].GetName() = %q, want %q", i, queued[i].GetName(), want.GetName())
		}
	}
}
// TestSpiderQueue_GetByIndex_GetByIndexOpt checks both index lookups against
// valid, negative and too-large indexes.
func TestSpiderQueue_GetByIndex_GetByIndexOpt(t *testing.T) {
	first, second := makeSpider("s1", ""), makeSpider("s2", "")
	queue := NewSpiderQueue()
	queue.Add(first)
	queue.Add(second)

	cases := []struct {
		idx     int
		want    *spider.Spider
		optSome bool
	}{
		{idx: 0, want: first, optSome: true},
		{idx: 1, want: second, optSome: true},
		{idx: -1, want: nil, optSome: false},
		{idx: 2, want: nil, optSome: false},
		{idx: 10, want: nil, optSome: false},
	}
	for _, tc := range cases {
		if got := queue.GetByIndex(tc.idx); got != tc.want {
			t.Errorf("GetByIndex(%d) = %v, want %v", tc.idx, got, tc.want)
		}
		opt := queue.GetByIndexOpt(tc.idx)
		if opt.IsSome() != tc.optSome {
			t.Errorf("GetByIndexOpt(%d).IsSome() = %v, want %v", tc.idx, opt.IsSome(), tc.optSome)
		}
		if opt.IsSome() && opt.Unwrap() != tc.want {
			t.Errorf("GetByIndexOpt(%d).Unwrap() = %v, want %v", tc.idx, opt.Unwrap(), tc.want)
		}
	}
}
// TestSpiderQueue_GetByName_GetByNameOpt checks both name lookups against
// known, unknown and empty names.
func TestSpiderQueue_GetByName_GetByNameOpt(t *testing.T) {
	alpha, beta := makeSpider("alpha", ""), makeSpider("beta", "")
	queue := NewSpiderQueue()
	queue.Add(alpha)
	queue.Add(beta)

	cases := []struct {
		name    string
		want    *spider.Spider
		optSome bool
	}{
		{name: "alpha", want: alpha, optSome: true},
		{name: "beta", want: beta, optSome: true},
		{name: "nonexistent", want: nil, optSome: false},
		{name: "", want: nil, optSome: false},
	}
	for _, tc := range cases {
		if got := queue.GetByName(tc.name); got != tc.want {
			t.Errorf("GetByName(%q) = %v, want %v", tc.name, got, tc.want)
		}
		opt := queue.GetByNameOpt(tc.name)
		if opt.IsSome() != tc.optSome {
			t.Errorf("GetByNameOpt(%q).IsSome() = %v, want %v", tc.name, opt.IsSome(), tc.optSome)
		}
		if opt.IsSome() && opt.Unwrap() != tc.want {
			t.Errorf("GetByNameOpt(%q).Unwrap() = %v, want %v", tc.name, opt.Unwrap(), tc.want)
		}
	}
}
// TestSpiderQueue_AddKeyins checks the queue length after AddKeyins for
// spiders with and without the spider.KEYIN placeholder.
//
// NOTE(review): every case passes an empty keyins string, yet
// "with_keyin_spiders" expects one spider to expand to two entries. AddKeyins
// returns early when util.KeyinsParse yields nothing, so confirm the keyins
// literals here are what was intended.
func TestSpiderQueue_AddKeyins(t *testing.T) {
	type keyinCase struct {
		name    string
		spiders []*spider.Spider
		keyins  string
		wantLen int
	}
	cases := []keyinCase{
		{name: "empty_keyins", spiders: []*spider.Spider{makeSpider("a", "")}, keyins: "", wantLen: 1},
		{name: "no_keyin_spiders", spiders: []*spider.Spider{makeSpider("a", "x")}, keyins: "", wantLen: 1},
		{name: "with_keyin_spiders", spiders: []*spider.Spider{makeSpider("a", spider.KEYIN)}, keyins: "", wantLen: 2},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			queue := NewSpiderQueue()
			for i := range tc.spiders {
				queue.Add(tc.spiders[i])
			}
			queue.AddKeyins(tc.keyins)
			if got := queue.Len(); got != tc.wantLen {
				t.Errorf("AddKeyins Len() = %d, want %d", got, tc.wantLen)
			}
		})
	}
}
// TestSpiderQueue_Add_setsID checks that Add gives each spider its queue
// position as ID.
func TestSpiderQueue_Add_setsID(t *testing.T) {
	queue := NewSpiderQueue()
	first, second := makeSpider("a", ""), makeSpider("b", "")
	queue.Add(first)
	queue.Add(second)

	if id := first.GetID(); id != 0 {
		t.Errorf("first Add ID = %d, want 0", id)
	}
	if id := second.GetID(); id != 1 {
		t.Errorf("second Add ID = %d, want 1", id)
	}
}
// TestSpiderQueue_GetByIndexOpt checks that index 0 of an empty queue is
// None.
func TestSpiderQueue_GetByIndexOpt(t *testing.T) {
	if NewSpiderQueue().GetByIndexOpt(0).IsSome() {
		t.Error("GetByIndexOpt(0) on empty queue should be None")
	}
}
// TestSpiderQueue_AddKeyins_emptyUnit2 checks that AddKeyins leaves a queue
// holding only a fixed-keyin spider unchanged.
//
// NOTE(review): with an empty keyins string AddKeyins presumably returns
// before it partitions the spiders, so the "no configurable spider" branch the
// test name refers to may not be reached — confirm the intended keyins value.
func TestSpiderQueue_AddKeyins_emptyUnit2(t *testing.T) {
	queue := NewSpiderQueue()
	queue.Add(makeSpider("a", "fixed"))
	queue.AddKeyins("")
	if n := queue.Len(); n != 1 {
		t.Errorf("AddKeyins with no KEYIN spiders Len() = %d, want 1", n)
	}
}
================================================
FILE: app/distribute/integration_test.go
================================================
package distribute
import (
"net"
"strconv"
"sync"
"testing"
"time"
"github.com/andeya/pholcus/app/distribute/teleport"
)
// freePort asks the OS for an unused TCP port on 127.0.0.1 by listening on
// port 0, and returns the assigned port number as a decimal string. The
// listener is closed before returning. The test is aborted when listening
// fails.
func freePort(t *testing.T) string {
	ln, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("freePort: %v", err)
	}
	defer ln.Close()

	addr := ln.Addr().(*net.TCPAddr)
	return strconv.Itoa(addr.Port)
}
// TestTP_ServerClient_Request is a loopback smoke test: it starts a teleport
// server exposing the master API, connects a client exposing the slave API,
// issues one "task" request from the client and then closes both ends. It
// makes no assertions; it only has to complete. Skipped in -short mode.
func TestTP_ServerClient_Request(t *testing.T) {
if testing.Short() {
t.Skip("skipping network test in short mode")
}
port := freePort(t)
portStr := ":" + port
// Seed the master's jar so the "task" handler has something to send.
tj := NewTaskJar()
tj.Push(&Task{ID: 0, Limit: 100})
serverTP := teleport.New().SetUID("server").SetAPI(MasterAPI(tj)).SetTimeout(100 * time.Millisecond)
serverTP.Server(portStr)
// Fixed sleeps give the server and client time to come up.
time.Sleep(50 * time.Millisecond)
clientTP := teleport.New().SetUID("client").SetAPI(SlaveAPI(NewTaskJar())).SetTimeout(100 * time.Millisecond)
clientTP.Client("127.0.0.1", portStr)
time.Sleep(100 * time.Millisecond)
var wg sync.WaitGroup
wg.Add(1)
// The request is issued from its own goroutine and joined after both ends are closed.
go func() {
defer wg.Done()
clientTP.Request("", "task", "", "")
}()
time.Sleep(200 * time.Millisecond)
serverTP.Close()
clientTP.Close()
wg.Wait()
}
================================================
FILE: app/distribute/interface.go
================================================
package distribute
// Distributor is the distributed interface. The master API handler calls
// Send(CountNodes()) to obtain the task it returns to a client; the slave API
// handler calls Receive with each task decoded from the master.
type Distributor interface {
// Send sends a task from the master to the jar.
Send(clientNum int) Task
// Receive receives a task into the jar on a slave node.
Receive(task *Task)
// CountNodes returns the number of connected nodes.
CountNodes() int
}
================================================
FILE: app/distribute/master_api.go
================================================
package distribute
import (
"encoding/json"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/distribute/teleport"
"github.com/andeya/pholcus/logs"
)
// MasterAPI builds the operation table served by the master node: "task"
// hands out tasks taken from n, "log" prints log lines sent by slaves.
func MasterAPI(n Distributor) teleport.API {
	api := teleport.API{}
	api["task"] = &masterTaskHandle{n}
	api["log"] = &masterLogHandle{}
	return api
}
// masterTaskHandle serves the "task" operation on the master by delegating
// to the embedded Distributor.
type masterTaskHandle struct {
	Distributor
}

// Process takes the next task from the distributor, JSON-encodes it and
// returns it as the response body. A marshalling failure is answered with a
// FAILURE response addressed to the sender.
func (mth *masterTaskHandle) Process(receive *teleport.NetData) *teleport.NetData {
	task := mth.Send(mth.CountNodes())
	encoded := result.Ret(json.Marshal(task))
	if !encoded.IsErr() {
		return teleport.ReturnData(string(encoded.Unwrap()))
	}
	return teleport.ReturnError(receive, teleport.FAILURE, "marshal error: "+encoded.UnwrapErr().Error(), receive.From)
}
// masterLogHandle serves the "log" operation on the master.
type masterLogHandle struct{}

// Process logs the sender and body of the message at informational level,
// framed by separator lines, and sends no response.
func (*masterLogHandle) Process(receive *teleport.NetData) *teleport.NetData {
	const frame = " * "
	logs.Log().Informational(frame)
	logs.Log().Informational(" * [ %s ] %s", receive.From, receive.Body)
	logs.Log().Informational(frame)
	return nil
}
================================================
FILE: app/distribute/master_api_test.go
================================================
package distribute
import (
"encoding/json"
"testing"
"github.com/andeya/pholcus/app/distribute/teleport"
)
// mockDistributor is a Distributor test double that returns canned values and
// records how it was called.
type mockDistributor struct {
	sendCount   int   // number of Send calls so far
	countNodes  int   // value returned by CountNodes
	sendTask    Task  // value returned by Send
	receiveTask *Task // last task passed to Receive
}

// Send counts the call and returns the canned task; clientNum is ignored.
func (m *mockDistributor) Send(clientNum int) Task {
	m.sendCount += 1
	return m.sendTask
}

// Receive remembers the task it was given.
func (m *mockDistributor) Receive(task *Task) {
	m.receiveTask = task
}

// CountNodes returns the canned node count.
func (m *mockDistributor) CountNodes() int {
	return m.countNodes
}
func TestMasterAPI(t *testing.T) {
d := &mockDistributor{countNodes: 2, sendTask: Task{ID: 1, Limit: 100}}
api := MasterAPI(d)
if api == nil {
t.Fatal("MasterAPI returned nil")
}
if _, ok := api["task"]; !ok {
t.Error("API missing task handler")
}
if _, ok := api["log"]; !ok {
t.Error("API missing log handler")
}
}
// TestMasterTaskHandle_Process checks that the task handler answers with a
// SUCCESS response whose string body decodes back to the distributor's task.
func TestMasterTaskHandle_Process(t *testing.T) {
	want := Task{ID: 1, Limit: 50, OutType: "mgo"}
	handler := &masterTaskHandle{&mockDistributor{countNodes: 1, sendTask: want}}

	resp := handler.Process(&teleport.NetData{From: "client1", To: "server", Operation: "task", Body: ""})
	if resp == nil {
		t.Fatal("Process returned nil")
	}
	if resp.Status != teleport.SUCCESS {
		t.Errorf("Status = %d, want SUCCESS", resp.Status)
	}

	payload, isString := resp.Body.(string)
	if !isString {
		t.Fatalf("Body type = %T, want string", resp.Body)
	}
	var got Task
	if err := json.Unmarshal([]byte(payload), &got); err != nil {
		t.Fatalf("json.Unmarshal: %v", err)
	}
	if got.ID != want.ID || got.Limit != want.Limit {
		t.Errorf("got Task %+v, want %+v", got, want)
	}
}
// TestMasterLogHandle_Process checks that the log handler sends no response.
func TestMasterLogHandle_Process(t *testing.T) {
	handler := &masterLogHandle{}
	msg := &teleport.NetData{From: "slave1", Body: "test log message"}
	if resp := handler.Process(msg); resp != nil {
		t.Errorf("Process returned %v, want nil", resp)
	}
}
================================================
FILE: app/distribute/slave_api.go
================================================
package distribute
import (
"encoding/json"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/distribute/teleport"
"github.com/andeya/pholcus/logs"
)
// SlaveAPI builds the operation table served by a slave node; its only
// operation, "task", stores tasks received from the master into n.
func SlaveAPI(n Distributor) teleport.API {
	api := teleport.API{}
	api["task"] = &slaveTaskHandle{n}
	return api
}
// slaveTaskHandle receives tasks from the master and adds them to the task jar.
type slaveTaskHandle struct {
	Distributor
}

// Process decodes the JSON task carried in receive.Body and passes it to the
// embedded Distributor's Receive. It never sends a response.
//
// The body arrives over the network, so its dynamic type is checked instead
// of asserted: a body that is not a string, or not valid JSON for a Task, is
// logged and dropped rather than panicking the handler.
func (sth *slaveTaskHandle) Process(receive *teleport.NetData) *teleport.NetData {
	body, ok := receive.Body.(string)
	if !ok {
		logs.Log().Error("JSON decode failed: %v", receive.Body)
		return nil
	}
	t := &Task{}
	if r := result.RetVoid(json.Unmarshal([]byte(body), t)); r.IsErr() {
		logs.Log().Error("JSON decode failed: %v", receive.Body)
		return nil
	}
	sth.Receive(t)
	return nil
}
================================================
FILE: app/distribute/slave_api_test.go
================================================
package distribute
import (
"encoding/json"
"testing"
"github.com/andeya/pholcus/app/distribute/teleport"
)
func TestSlaveAPI(t *testing.T) {
tj := NewTaskJar()
api := SlaveAPI(tj)
if api == nil {
t.Fatal("SlaveAPI returned nil")
}
if _, ok := api["task"]; !ok {
t.Error("API missing task handler")
}
}
// TestSlaveTaskHandle_Process checks that a valid JSON task body ends up in
// the jar and that the handler sends no response.
func TestSlaveTaskHandle_Process(t *testing.T) {
	jar := NewTaskJar()
	want := Task{ID: 2, Limit: 200, OutType: "csv"}
	encoded, _ := json.Marshal(want)

	handler := &slaveTaskHandle{jar}
	if resp := handler.Process(&teleport.NetData{From: "master", Body: string(encoded)}); resp != nil {
		t.Errorf("Process returned %v, want nil", resp)
	}

	got := jar.Pull()
	if got.ID != want.ID || got.Limit != want.Limit {
		t.Errorf("got Task %+v, want %+v", got, want)
	}
}
// TestSlaveTaskHandle_Process_InvalidJSON checks that a malformed body is
// dropped: no response is sent and nothing is stored in the jar.
func TestSlaveTaskHandle_Process_InvalidJSON(t *testing.T) {
	jar := NewTaskJar()
	handler := &slaveTaskHandle{jar}
	bad := &teleport.NetData{From: "master", Body: "invalid json {"}

	if resp := handler.Process(bad); resp != nil {
		t.Errorf("Process returned %v, want nil", resp)
	}
	if n := jar.Len(); n != 0 {
		t.Errorf("Len() = %d, want 0", n)
	}
}
================================================
FILE: app/distribute/task.go
================================================
// Package distribute provides distributed task scheduling and master-slave node communication.
package distribute
// Task is used for distributed task dispatch. It is exchanged between master
// and slave as JSON; the fields carry no struct tags, so the JSON keys are the
// Go field names.
type Task struct {
ID int // Task identifier; TaskJar.Push overwrites it with the jar length at push time
Spiders []map[string]string // Spider rule name and keyin, format: map[string]string{"name":"baidu","keyin":"henry"}
ThreadNum int // Global max concurrency
Pausetime int64 // Pause duration in ms (random: Pausetime/2 ~ Pausetime*2)
OutType string // Output method
BatchCap int // Batch output capacity per flush
BatchQueueCap int // Batch output pool capacity, >= 2
SuccessInherit bool // Inherit historical success records
FailureInherit bool // Inherit historical failure records
Limit int64 // Collection limit, 0=unlimited; if rule sets LIMIT then custom limit
ProxyMinute int64 // Proxy IP rotation interval in minutes
Keyins string // Custom input, later split into Keyin config for multiple tasks
}
================================================
FILE: app/distribute/task_test.go
================================================
package distribute
import (
"testing"
)
// TestTask_Fields checks that Task literals keep the ID, Limit and OutType
// they were built with, for both the zero value and a populated task.
func TestTask_Fields(t *testing.T) {
	type fieldCase struct {
		name        string
		task        Task
		wantID      int
		wantLimit   int64
		wantOutType string
	}
	populated := Task{ID: 1, Limit: 100, OutType: "mgo", ThreadNum: 10}
	cases := []fieldCase{
		{name: "zero", task: Task{}, wantID: 0, wantLimit: 0, wantOutType: ""},
		{name: "with_values", task: populated, wantID: 1, wantLimit: 100, wantOutType: "mgo"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.task
			if got.ID != tc.wantID {
				t.Errorf("ID = %d, want %d", got.ID, tc.wantID)
			}
			if got.Limit != tc.wantLimit {
				t.Errorf("Limit = %d, want %d", got.Limit, tc.wantLimit)
			}
			if got.OutType != tc.wantOutType {
				t.Errorf("OutType = %q, want %q", got.OutType, tc.wantOutType)
			}
		})
	}
}
================================================
FILE: app/distribute/taskjar.go
================================================
package distribute
// TaskJar is the task storage. It is a FIFO backed by a buffered channel, so
// adding to a full jar and taking from an empty jar both block.
type TaskJar struct {
// Tasks is the underlying queue; NewTaskJar gives it a capacity of 1024.
Tasks chan *Task
}
// NewTaskJar returns an empty jar that can buffer up to 1024 tasks.
func NewTaskJar() *TaskJar {
	jar := new(TaskJar)
	jar.Tasks = make(chan *Task, 1024)
	return jar
}
// Push stamps task with an ID equal to the number of tasks currently waiting
// in the jar and then enqueues it (server side). Reading the length and
// enqueueing are separate steps, so IDs are not guaranteed unique under
// concurrent pushes. It blocks while the jar is full.
func (tj *TaskJar) Push(task *Task) {
	task.ID = len(tj.Tasks)
	tj.Tasks <- task
}
// Pull dequeues the oldest task from the local jar (client side), blocking
// until one is available.
func (tj *TaskJar) Pull() *Task {
	next := <-tj.Tasks
	return next
}
// Len reports how many tasks are currently waiting in the jar.
func (tj *TaskJar) Len() int {
	waiting := len(tj.Tasks)
	return waiting
}
// Send dequeues the oldest task and returns it by value (master side),
// blocking until one is available. clientNum is not used.
func (tj *TaskJar) Send(clientNum int) Task {
	next := <-tj.Tasks
	return *next
}
// Receive enqueues task as-is (slave side); unlike Push it leaves the task ID
// untouched. It blocks while the jar is full.
func (tj *TaskJar) Receive(task *Task) {
	queue := tj.Tasks
	queue <- task
}
// CountNodes always reports 0: a TaskJar has no knowledge of connected nodes.
func (tj *TaskJar) CountNodes() int {
	const nodes = 0
	return nodes
}
================================================
FILE: app/distribute/taskjar_test.go
================================================
package distribute
import (
"sync"
"testing"
)
// TestNewTaskJar checks that a new jar has a task channel of capacity 1024.
func TestNewTaskJar(t *testing.T) {
	jar := NewTaskJar()
	if jar == nil {
		t.Fatal("NewTaskJar() returned nil")
	}
	if jar.Tasks == nil {
		t.Fatal("Tasks channel is nil")
	}
	if got := cap(jar.Tasks); got != 1024 {
		t.Errorf("cap(Tasks) = %d, want 1024", got)
	}
}
// TestTaskJar_PushPull checks that a pushed task comes back from Pull as the
// same pointer and that Len follows the jar contents.
func TestTaskJar_PushPull(t *testing.T) {
	jar := NewTaskJar()
	pushed := &Task{ID: 0, Limit: 10}

	jar.Push(pushed)
	if n := jar.Len(); n != 1 {
		t.Errorf("Len() = %d, want 1", n)
	}

	if pulled := jar.Pull(); pulled != pushed {
		t.Errorf("Pull() = %p, want %p", pulled, pushed)
	}
	if n := jar.Len(); n != 0 {
		t.Errorf("Len() after Pull = %d, want 0", n)
	}
}
// TestTaskJar_SendReceive checks that a task stored with Receive is returned
// by Send, for different client counts.
func TestTaskJar_SendReceive(t *testing.T) {
	cases := []struct {
		name      string
		clientNum int
	}{
		{name: "single", clientNum: 1},
		{name: "multi", clientNum: 3},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			jar := NewTaskJar()
			jar.Receive(&Task{ID: 0, Limit: 5})
			if n := jar.Len(); n != 1 {
				t.Errorf("Len() = %d, want 1", n)
			}
			if sent := jar.Send(tc.clientNum); sent.Limit != 5 {
				t.Errorf("Send() Limit = %d, want 5", sent.Limit)
			}
		})
	}
}
// TestTaskJar_PushAssignsID verifies that Push numbers tasks by queue position.
func TestTaskJar_PushAssignsID(t *testing.T) {
	jar := NewTaskJar()
	jar.Push(&Task{Limit: 1})
	jar.Push(&Task{Limit: 2})

	first := jar.Pull()
	second := jar.Pull()
	if first.ID != 0 {
		t.Errorf("first task ID = %d, want 0", first.ID)
	}
	if second.ID != 1 {
		t.Errorf("second task ID = %d, want 1", second.ID)
	}
}
// TestTaskJar_CountNodes verifies that a TaskJar reports zero nodes.
func TestTaskJar_CountNodes(t *testing.T) {
	jar := NewTaskJar()
	got := jar.CountNodes()
	if got != 0 {
		t.Errorf("CountNodes() = %d, want 0", got)
	}
}
// TestTaskJar_Concurrent pushes from ten goroutines at once and checks
// that every task ended up in the jar before draining it.
func TestTaskJar_Concurrent(t *testing.T) {
	const pushers = 10
	jar := NewTaskJar()

	var wg sync.WaitGroup
	wg.Add(pushers)
	for i := 0; i < pushers; i++ {
		go func(id int) {
			defer wg.Done()
			jar.Push(&Task{ID: id, Limit: int64(id)})
		}(i)
	}
	wg.Wait()

	if n := jar.Len(); n != pushers {
		t.Errorf("Len() = %d, want 10", n)
	}
	for jar.Len() > 0 {
		_ = jar.Pull()
	}
}
================================================
FILE: app/distribute/teleport/client.go
================================================
package teleport
import (
"log"
"net"
"time"
"github.com/andeya/gust/result"
)
// tpClient holds client-only state.
type tpClient struct {
// short selects short-connection mode; Client sets it when isShort[0] is true.
short bool
// mustClose is set by Close in client mode; the dial loop in client() stops
// (and clears the flag) on the next failed dial while it is set.
mustClose bool
// serverUID is the connPool key used for the server connection;
// Client defaults it to DEFAULT_SERVER_UID when empty.
serverUID string
}
// Client switches the instance into client mode and starts the API dispatcher
// and the connection loop in background goroutines.
//
// serverAddr is the host part of the dial address; port (e.g. ":8080") is
// appended to it and falls back to DEFAULT_PORT when empty. Passing
// isShort[0] == true selects short-connection mode; otherwise an unset
// timeout falls back to DEFAULT_TIMEOUT_C.
func (tp *TP) Client(serverAddr string, port string, isShort ...bool) {
	wantShort := len(isShort) > 0 && isShort[0]
	switch {
	case wantShort:
		tp.tpClient.short = true
	case tp.timeout == 0:
		tp.timeout = DEFAULT_TIMEOUT_C
	}

	if tp.tpClient.serverUID == "" {
		tp.tpClient.serverUID = DEFAULT_SERVER_UID
	}

	tp.reserveAPI()
	tp.mode = CLIENT

	dialPort := port
	if dialPort == "" {
		dialPort = DEFAULT_PORT
	}
	tp.port = dialPort
	tp.serverAddr = serverAddr
	tp.tpClient.mustClose = false

	go tp.apiHandle()
	go tp.client()
}
// --- Client implementation ---

// client dials the server, retrying every LOOP_TIMEOUT until it succeeds or
// Close has requested a stop (mustClose). In long-connection mode it then
// waits until no usable connection remains and dials again, as long as the
// server entry is still present in connPool.
func (tp *TP) client() {
	if !tp.short {
		log.Println(" * -- Connecting to server... --")
	}
	for {
		dialRes := result.Ret(net.Dial("tcp", tp.serverAddr+tp.port))
		if dialRes.IsErr() {
			if tp.tpClient.mustClose {
				tp.tpClient.mustClose = false
				return
			}
			time.Sleep(LOOP_TIMEOUT)
			continue
		}

		conn := dialRes.Unwrap()
		debugPrintf("Debug: connected to server: %v", conn.RemoteAddr().String())
		tp.cGoConn(conn)

		// Short connections are one-shot: no supervision, no reconnect.
		if tp.short {
			return
		}
		for tp.CountNodes() > 0 {
			time.Sleep(LOOP_TIMEOUT)
		}
		// A missing pool entry means the connection was removed for good.
		if _, present := tp.connPool[tp.tpClient.serverUID]; !present {
			return
		}
	}
}
// cGoConn registers the freshly dialed connection under the server UID,
// announces this node (long connections only), marks the connection usable
// and starts its reader and writer goroutines.
func (tp *TP) cGoConn(conn net.Conn) {
	serverUID := tp.tpClient.serverUID
	remoteAddr, connect := NewConnect(conn, tp.connBufferLen, tp.connWChanCap)
	tp.connPool[serverUID] = connect

	// Without an explicit UID the local socket address identifies this node.
	if tp.uid == "" {
		tp.uid = conn.LocalAddr().String()
	}

	if tp.short {
		connect.Short = true
	} else {
		tp.send(NewNetData(tp.uid, serverUID, IDENTITY, "", nil))
		log.Printf(" * -- Connected to server: %v --", remoteAddr)
	}

	tp.connPool[serverUID].Usable = true
	go tp.cReader(serverUID)
	go tp.cWriter(serverUID)
}
// cReader reads data on the client side until a read fails, then closes the
// connection in reconnect mode (the pool entry is kept as nil).
//
// If the pool entry for nodeuid is already gone or nil when the goroutine
// starts (for example Close ran first), it returns immediately instead of
// dereferencing a nil connection, matching the guards in cWriter, sReader
// and sWriter.
func (tp *TP) cReader(nodeuid string) {
	defer tp.closeConn(nodeuid, true)

	conn := tp.getConn(nodeuid)
	if conn == nil {
		return
	}
	for tp.read(conn) {
	}
}
// cWriter sends data on the client side. In short-connection mode it simply
// forwards queued packets; in long-connection mode it sends a HEARTBEAT
// whenever nothing was queued for tp.timeout.
func (tp *TP) cWriter(nodeuid string) {
	defer tp.closeConn(nodeuid, true)

	conn := tp.getConn(nodeuid)
	if conn == nil {
		return
	}
	for {
		if tp.short {
			tp.send(<-conn.WriteChan)
			continue
		}
		var data *NetData
		select {
		case data = <-conn.WriteChan:
		case <-time.After(tp.timeout):
			data = NewNetData(tp.uid, nodeuid, HEARTBEAT, "", nil)
		}
		tp.send(data)
	}
}
================================================
FILE: app/distribute/teleport/conn.go
================================================
package teleport
import (
"net"
)
// Connect wraps a network connection together with its per-connection
// read buffers and outbound queue.
type Connect struct {
// Conn is the underlying socket; its methods are promoted onto Connect.
net.Conn
// Usable is set once the connection has been registered; CountNodes and
// Request only consider usable connections.
Usable bool
// Short marks a short (one-shot) connection; false means long connection.
Short bool
// WriteChan queues outbound packets for the connection's writer goroutine.
WriteChan chan *NetData
// Buffer is the fixed-size scratch buffer handed to Read.
Buffer []byte
// TmpBuffer accumulates received bytes until Unpack can extract whole frames.
TmpBuffer []byte
}
// NewConnect wraps conn in a Connect configured as a long connection
// (Short=false, Usable=false). bufferLen sizes the read buffer and wChanCap
// the write queue. The returned key k is the remote address of conn.
func NewConnect(conn net.Conn, bufferLen int, wChanCap int) (k string, v *Connect) {
	v = new(Connect)
	v.Conn = conn
	v.Buffer = make([]byte, bufferLen)
	v.TmpBuffer = []byte{}
	v.WriteChan = make(chan *NetData, wChanCap)
	return conn.RemoteAddr().String(), v
}
// Addr returns the remote node address in string form.
func (conn *Connect) Addr() string {
	remote := conn.Conn.RemoteAddr()
	return remote.String()
}
================================================
FILE: app/distribute/teleport/conn_test.go
================================================
package teleport
import (
"net"
"testing"
)
// TestNewConnect checks the key, buffer size and write-queue capacity
// produced by NewConnect.
func TestNewConnect(t *testing.T) {
	client, server := net.Pipe()
	defer client.Close()
	defer server.Close()

	key, connect := NewConnect(client, 1024, 256)
	if want := client.RemoteAddr().String(); key != want {
		t.Errorf("key = %q, want %q", key, want)
	}
	if connect == nil {
		t.Fatal("Connect is nil")
	}
	if connect.WriteChan == nil {
		t.Error("WriteChan is nil")
	}
	if n := len(connect.Buffer); n != 1024 {
		t.Errorf("Buffer len = %d, want 1024", n)
	}
	if c := cap(connect.WriteChan); c != 256 {
		t.Errorf("WriteChan cap = %d, want 256", c)
	}
}
// TestConnect_Addr checks that Addr reports the remote address of the wrapped conn.
func TestConnect_Addr(t *testing.T) {
	client, server := net.Pipe()
	defer client.Close()
	defer server.Close()

	_, connect := NewConnect(client, 64, 16)
	got, want := connect.Addr(), client.RemoteAddr().String()
	if got != want {
		t.Errorf("Addr() = %q, want %q", got, want)
	}
}
================================================
FILE: app/distribute/teleport/debug.go
================================================
package teleport
import (
"log"
)
// Debug switches on the package's debug logging helpers (debugPrintf,
// debugPrintln, debugFatal); they do nothing while it is false.
var Debug bool
// debugPrintf forwards to log.Printf, but only while Debug is enabled.
func debugPrintf(format string, v ...interface{}) {
	if Debug {
		log.Printf(format, v...)
	}
}
// debugPrintln forwards to log.Println, but only while Debug is enabled.
func debugPrintln(v ...interface{}) {
	if Debug {
		log.Println(v...)
	}
}
// debugFatal forwards to log.Fatal (which logs and exits the process),
// but only while Debug is enabled.
func debugFatal(v ...interface{}) {
	if Debug {
		log.Fatal(v...)
	}
}
================================================
FILE: app/distribute/teleport/netdata.go
================================================
package teleport
// Status codes carried in NetData.Status.
const (
// SUCCESS is the status NewNetData and ReturnData assign to new packets.
SUCCESS = 0
// FAILURE is a generic error status.
FAILURE = -1
// LLLEGAL is the status used when a request names an operation with no
// registered API handler. (The spelling is part of the exported name.)
LLLEGAL = -2
)
// NetData is the data transfer structure; it is JSON-encoded on the wire.
type NetData struct {
// Body is the payload (or the error message for error responses).
Body interface{}
// Operation names the API handler the packet is addressed to.
Operation string
// From is the sender's node UID.
From string
// To is the receiver's node UID.
To string
// Status is one of SUCCESS, FAILURE or LLLEGAL.
Status int
// Flag is an opaque marker copied from request to response when the response leaves it empty.
Flag string
}
// NewNetData builds a packet from the given routing fields and payload,
// with Status preset to SUCCESS.
func NewNetData(from, to, operation string, flag string, body interface{}) *NetData {
	data := new(NetData)
	data.From, data.To = from, to
	data.Operation, data.Flag = operation, flag
	data.Body = body
	data.Status = SUCCESS
	return data
}
================================================
FILE: app/distribute/teleport/netdata_test.go
================================================
package teleport
import (
"testing"
)
// TestNewNetData checks that every argument lands in the matching field
// and that the status defaults to SUCCESS.
func TestNewNetData(t *testing.T) {
	type testCase struct {
		from, to, op, flag string
		body               interface{}
	}
	cases := []testCase{
		{from: "a", to: "b", op: "task", flag: "", body: "body"},
		{from: "", to: "", op: "heartbeat", flag: "f", body: nil},
	}
	for _, tc := range cases {
		t.Run("", func(t *testing.T) {
			got := NewNetData(tc.from, tc.to, tc.op, tc.flag, tc.body)
			if got == nil {
				t.Fatal("NewNetData returned nil")
			}
			if got.From != tc.from {
				t.Errorf("From = %q, want %q", got.From, tc.from)
			}
			if got.To != tc.to {
				t.Errorf("To = %q, want %q", got.To, tc.to)
			}
			if got.Operation != tc.op {
				t.Errorf("Operation = %q, want %q", got.Operation, tc.op)
			}
			if got.Flag != tc.flag {
				t.Errorf("Flag = %q, want %q", got.Flag, tc.flag)
			}
			if got.Status != SUCCESS {
				t.Errorf("Status = %d, want SUCCESS", got.Status)
			}
		})
	}
}
================================================
FILE: app/distribute/teleport/protocol.go
================================================
package teleport
import (
"bytes"
"encoding/binary"
)
const (
// DataLengthOfLenth is the size in bytes of the message-length field that
// follows the packet header in every frame.
DataLengthOfLenth = 4
)
// Protocol handles packet framing (pack/unpack).
// A frame is: header bytes, a 4-byte length, then the message.
type Protocol struct {
// header is the marker that starts every frame.
header string
// headerLen caches the byte length of header.
headerLen int
}
// NewProtocol returns a Protocol that frames packets with packetHeader
// as the leading marker.
func NewProtocol(packetHeader string) *Protocol {
	p := new(Protocol)
	p.header = packetHeader
	p.headerLen = len(packetHeader)
	return p
}
// ReSet replaces the packet header and its cached byte length.
func (p *Protocol) ReSet(header string) {
	p.header, p.headerLen = header, len(header)
}
// Packet frames a message for transmission: header, then the message length
// as encoded by IntToBytes, then the message bytes.
func (p *Protocol) Packet(message []byte) []byte {
	framed := []byte(p.header)
	framed = append(framed, IntToBytes(len(message))...)
	framed = append(framed, message...)
	return framed
}
// Unpack extracts every complete frame from buffer.
//
// It scans for the header, skipping bytes that do not start a frame.
// readerSlice holds the message bodies of the complete frames found, in
// order; the bodies alias buffer. bufferOver is the unconsumed tail (an
// incomplete frame, or too few bytes to decide) to be prepended to the next
// read; it is an empty non-nil slice when everything was consumed.
//
// A frame whose length field decodes to a negative number is corrupt or
// hostile. The header match is treated as garbage and scanning continues at
// the next byte, instead of slicing with an inverted range and panicking.
func (p *Protocol) Unpack(buffer []byte) (readerSlice [][]byte, bufferOver []byte) {
	prefixLen := p.headerLen + DataLengthOfLenth
	total := len(buffer)

	pos := 0
	for pos < total {
		// Not enough bytes left for header + length field: wait for more data.
		if total-pos < prefixLen {
			break
		}
		if string(buffer[pos:pos+p.headerLen]) != p.header {
			pos++
			continue
		}
		bodyLen := BytesToInt(buffer[pos+p.headerLen : pos+prefixLen])
		if bodyLen < 0 {
			pos++
			continue
		}
		bodyStart := pos + prefixLen
		// Compare against the remaining byte count so a huge length cannot overflow.
		if bodyLen > total-bodyStart {
			break
		}
		readerSlice = append(readerSlice, buffer[bodyStart:bodyStart+bodyLen])
		pos = bodyStart + bodyLen
	}

	if pos == total {
		return readerSlice, make([]byte, 0)
	}
	return readerSlice, buffer[pos:]
}
// IntToBytes encodes n as a 4-byte little-endian two's-complement value.
// n is truncated to 32 bits, so values outside the int32 range wrap.
//
// The bytes are written directly with binary.LittleEndian instead of the
// reflection-based binary.Write, whose error return was silently dropped.
func IntToBytes(n int) []byte {
	out := make([]byte, 4) // same width as the frame length field
	binary.LittleEndian.PutUint32(out, uint32(int32(n)))
	return out
}
// BytesToInt decodes the first four bytes of b as a little-endian int32.
// When b holds fewer than four bytes the read fails and 0 is returned.
func BytesToInt(b []byte) int {
	var decoded int32
	// A failed read leaves decoded at its zero value, which is the result we want.
	_ = binary.Read(bytes.NewReader(b), binary.LittleEndian, &decoded)
	return int(decoded)
}
================================================
FILE: app/distribute/teleport/protocol_test.go
================================================
package teleport
import (
"bytes"
"testing"
)
// TestNewProtocol checks that the constructor stores the header and its byte length.
func TestNewProtocol(t *testing.T) {
	for _, header := range []string{"", "andeya", "custom-header"} {
		t.Run(header, func(t *testing.T) {
			p := NewProtocol(header)
			if p == nil {
				t.Fatal("NewProtocol returned nil")
			}
			if p.header != header {
				t.Errorf("header = %q, want %q", p.header, header)
			}
			if wantLen := len([]byte(header)); p.headerLen != wantLen {
				t.Errorf("headerLen = %d, want %d", p.headerLen, wantLen)
			}
		})
	}
}
// TestProtocol_ReSet checks that ReSet swaps both the header and its cached length.
func TestProtocol_ReSet(t *testing.T) {
	proto := NewProtocol("old")
	proto.ReSet("new")

	if got := proto.header; got != "new" {
		t.Errorf("header = %q, want new", got)
	}
	if got := proto.headerLen; got != 3 {
		t.Errorf("headerLen = %d, want 3", got)
	}
}
func TestProtocol_Packet(t *testing.T) {
p := NewProtocol("andeya")
msg := []byte("hello")
got := p.Packet(msg)
want := append(append([]byte("andeya"), IntToBytes(len(msg))...), msg...)
if !bytes.Equal(got, want) {
t.Errorf("Packet() = %v, want %v", got, want)
}
}
// TestProtocol_Unpack checks that a single packed frame round-trips with no leftover.
func TestProtocol_Unpack(t *testing.T) {
	proto := NewProtocol("andeya")
	payload := []byte("hello")

	frames, leftover := proto.Unpack(proto.Packet(payload))
	if n := len(frames); n != 1 {
		t.Fatalf("len(slice) = %d, want 1", n)
	}
	if !bytes.Equal(frames[0], payload) {
		t.Errorf("Unpack()[0] = %v, want %v", frames[0], payload)
	}
	if len(leftover) != 0 {
		t.Errorf("rest = %v, want empty", leftover)
	}
}
// TestProtocol_Unpack_Multiple checks that two back-to-back frames are both extracted in order.
func TestProtocol_Unpack_Multiple(t *testing.T) {
	proto := NewProtocol("ab")
	first, second := []byte("x"), []byte("yz")

	stream := proto.Packet(first)
	stream = append(stream, proto.Packet(second)...)

	frames, leftover := proto.Unpack(stream)
	if n := len(frames); n != 2 {
		t.Fatalf("len(slice) = %d, want 2", n)
	}
	if !bytes.Equal(frames[0], first) {
		t.Errorf("slice[0] = %v, want %v", frames[0], first)
	}
	if !bytes.Equal(frames[1], second) {
		t.Errorf("slice[1] = %v, want %v", frames[1], second)
	}
	if n := len(leftover); n != 0 {
		t.Errorf("rest len = %d, want 0", n)
	}
}
// TestProtocol_Unpack_Partial checks that a truncated frame yields no message
// and is handed back untouched as the leftover.
func TestProtocol_Unpack_Partial(t *testing.T) {
	proto := NewProtocol("ab")
	whole := proto.Packet([]byte("full"))
	truncated := whole[:len(whole)-2]

	frames, leftover := proto.Unpack(truncated)
	if n := len(frames); n != 0 {
		t.Errorf("len(slice) = %d, want 0", n)
	}
	if !bytes.Equal(leftover, truncated) {
		t.Errorf("rest = %v, want %v", leftover, truncated)
	}
}
// TestProtocol_Unpack_GarbageBeforeHeader checks that bytes preceding the
// header are skipped and the frame behind them is still extracted.
func TestProtocol_Unpack_GarbageBeforeHeader(t *testing.T) {
	proto := NewProtocol("ab")
	payload := []byte("x")

	stream := []byte("xx")
	stream = append(stream, proto.Packet(payload)...)

	frames, leftover := proto.Unpack(stream)
	if n := len(frames); n != 1 {
		t.Fatalf("len(slice) = %d, want 1", n)
	}
	if !bytes.Equal(frames[0], payload) {
		t.Errorf("slice[0] = %v, want %v", frames[0], payload)
	}
	if n := len(leftover); n != 0 {
		t.Errorf("rest len = %d, want 0", n)
	}
}
// TestProtocol_Unpack_EmptyBuffer checks that an empty input yields nothing at all.
func TestProtocol_Unpack_EmptyBuffer(t *testing.T) {
	frames, leftover := NewProtocol("ab").Unpack([]byte{})
	if n := len(frames); n != 0 {
		t.Errorf("len(slice) = %d, want 0", n)
	}
	if len(leftover) != 0 {
		t.Errorf("rest = %v, want empty", leftover)
	}
}
// TestProtocol_Unpack_TooShort checks that input shorter than a frame prefix
// is returned unchanged as leftover.
func TestProtocol_Unpack_TooShort(t *testing.T) {
	input := []byte("and")
	frames, leftover := NewProtocol("andeya").Unpack(input)
	if n := len(frames); n != 0 {
		t.Errorf("len(slice) = %d, want 0", n)
	}
	if !bytes.Equal(leftover, []byte("and")) {
		t.Errorf("rest = %v", leftover)
	}
}
func TestIntToBytes_BytesToInt(t *testing.T) {
tests := []int{0, 1, 42, 1024, -1}
for _, n := range tests {
b := IntToBytes(n)
got := BytesToInt(b)
if got != n {
t.Errorf("BytesToInt(IntToBytes(%d)) = %d", n, got)
}
}
}
================================================
FILE: app/distribute/teleport/return_func.go
================================================
package teleport
// ReturnData builds a SUCCESS response carrying body.
// The optional strings are, in order: operation, receiver (To) and sender
// (From). Any that are omitted stay empty in the returned packet.
func ReturnData(body interface{}, OpAndToAndFrom ...string) *NetData {
	data := &NetData{Status: SUCCESS, Body: body}
	for idx, value := range OpAndToAndFrom {
		switch idx {
		case 0:
			data.Operation = value
		case 1:
			data.To = value
		case 2:
			data.From = value
		}
	}
	return data
}
// ReturnError turns receive into an error response in place and returns it.
// Status and Body are overwritten with status and msg, From is cleared, and
// To becomes nodeuid[0] when given, otherwise empty. receive should be the
// original *NetData.
func ReturnError(receive *NetData, status int, msg string, nodeuid ...string) *NetData {
	target := ""
	if len(nodeuid) > 0 {
		target = nodeuid[0]
	}
	receive.Status = status
	receive.Body = msg
	receive.From, receive.To = "", target
	return receive
}
================================================
FILE: app/distribute/teleport/return_func_test.go
================================================
package teleport
import (
"testing"
)
func TestReturnData(t *testing.T) {
tests := []struct {
body string
args []string
wantOp, wantTo, wantFrom string
}{
{"ok", nil, "", "", ""},
{"x", []string{"op1"}, "op1", "", ""},
{"y", []string{"op2", "to2"}, "op2", "to2", ""},
{"z", []string{"op3", "to3", "from3"}, "op3", "to3", "from3"},
}
for _, tt := range tests {
t.Run(tt.body, func(t *testing.T) {
var d *NetData
if len(tt.args) == 0 {
d = ReturnData(tt.body)
} else {
d = ReturnData(tt.body, tt.args...)
}
if d == nil {
t.Fatal("ReturnData returned nil")
}
if d.Status != SUCCESS {
t.Errorf("Status = %d, want SUCCESS", d.Status)
}
if d.Body != tt.body {
t.Errorf("Body = %v, want %v", d.Body, tt.body)
}
if d.Operation != tt.wantOp {
t.Errorf("Operation = %q, want %q", d.Operation, tt.wantOp)
}
if d.To != tt.wantTo {
t.Errorf("To = %q, want %q", d.To, tt.wantTo)
}
if d.From != tt.wantFrom {
t.Errorf("From = %q, want %q", d.From, tt.wantFrom)
}
})
}
}
// TestReturnError checks that the request is rewritten in place into an error response.
func TestReturnError(t *testing.T) {
	req := &NetData{From: "a", To: "b", Operation: "task", Body: "orig"}

	if resp := ReturnError(req, FAILURE, "err msg", "target"); resp != req {
		t.Error("ReturnError should return same pointer")
	}
	if req.Status != FAILURE {
		t.Errorf("Status = %d, want FAILURE", req.Status)
	}
	if req.Body != "err msg" {
		t.Errorf("Body = %q, want err msg", req.Body)
	}
	if req.From != "" {
		t.Errorf("From = %q, want empty", req.From)
	}
	if req.To != "target" {
		t.Errorf("To = %q, want target", req.To)
	}
}
// TestReturnError_NoNodeUID checks that To is cleared when no target UID is supplied.
func TestReturnError_NoNodeUID(t *testing.T) {
	req := &NetData{To: "x"}
	ReturnError(req, LLLEGAL, "bad")
	if got := req.To; got != "" {
		t.Errorf("To = %q, want empty", got)
	}
}
================================================
FILE: app/distribute/teleport/server.go
================================================
package teleport
import (
"encoding/json"
"log"
"net"
"time"
"github.com/andeya/gust/result"
)
// tpServer holds server-only state.
type tpServer struct {
// listener is the TCP listener opened by server(); Close closes it in server mode.
listener net.Listener
}
// Server switches the instance into server mode and starts the API dispatcher
// and the accept loop in background goroutines. port[0] (e.g. ":8080") is the
// listen address and defaults to DEFAULT_PORT; an unset UID or timeout falls
// back to DEFAULT_SERVER_UID / DEFAULT_TIMEOUT_S.
func (tp *TP) Server(port ...string) {
	tp.reserveAPI()
	tp.mode = SERVER

	var listenPort string = DEFAULT_PORT
	if len(port) > 0 {
		listenPort = port[0]
	}
	tp.port = listenPort

	if tp.uid == "" {
		tp.uid = DEFAULT_SERVER_UID
	}
	if tp.timeout == 0 {
		tp.timeout = DEFAULT_TIMEOUT_S
	}

	go tp.apiHandle()
	go tp.server()
}
// --- Server implementation ---

// server opens the TCP listener, retrying every LOOP_TIMEOUT until it
// succeeds, then accepts connections until Accept fails (for example because
// Close closed the listener). Each accepted connection is initialised inline
// by sGoConn.
func (tp *TP) server() {
	for {
		listenRes := result.Ret(net.Listen("tcp", tp.port))
		if listenRes.IsErr() {
			debugPrintf("Debug: listen port error: %v", listenRes.UnwrapErr())
			time.Sleep(LOOP_TIMEOUT)
			continue
		}
		tp.listener = listenRes.Unwrap()
		break
	}
	log.Printf(" * -- Server listening (port %v) --", tp.port)

	for tp.listener != nil {
		acceptRes := result.Ret(tp.listener.Accept())
		if acceptRes.IsErr() {
			return
		}
		conn := acceptRes.Unwrap()
		debugPrintf("Debug: client %v connected, identity not yet verified", conn.RemoteAddr().String())
		tp.sGoConn(conn)
	}
}
// sGoConn initialises an accepted connection and, when that succeeds, starts
// its reader and writer goroutines. A connection that fails initialisation
// is closed.
func (tp *TP) sGoConn(conn net.Conn) {
	remoteAddr, connect := NewConnect(conn, tp.connBufferLen, tp.connWChanCap)
	if nodeuid, ok := tp.sInitConn(connect, remoteAddr); ok {
		go tp.sReader(nodeuid)
		go tp.sWriter(nodeuid)
		return
	}
	conn.Close()
}
// sInitConn performs the first read on an accepted connection and binds the
// node to it.
//
// The first decoded packet identifies the peer: its From field (or
// remoteAddr when empty) becomes the connPool key, and its To field must
// match this server's UID (checkRights). A first packet other than IDENTITY
// marks the connection as short. Every decoded packet, including the first,
// is forwarded to the API dispatcher.
//
// It returns the node UID and usable=true on success. usable=false means the
// caller must close the connection: the read failed, the first packet could
// not be decoded or was rejected, or the first read contained no complete
// frame. In that last case the peer cannot be identified, and reporting
// success with an empty UID would leave the socket open but never read,
// written or closed.
func (tp *TP) sInitConn(conn *Connect, remoteAddr string) (nodeuid string, usable bool) {
	readLen, err := conn.Read(conn.Buffer)
	if result.TryErrVoid(err).IsErr() || readLen == 0 {
		return "", false
	}
	conn.TmpBuffer = append(conn.TmpBuffer, conn.Buffer[:readLen]...)

	var dataSlice [][]byte
	dataSlice, conn.TmpBuffer = tp.Unpack(conn.TmpBuffer)
	if len(dataSlice) == 0 {
		return "", false
	}

	for i, data := range dataSlice {
		debugPrintln("Debug: received data batch 1 before decode: ", string(data))
		d := new(NetData)
		if result.RetVoid(json.Unmarshal(data, d)).IsErr() {
			// An undecodable first packet makes the peer unidentifiable;
			// later bad packets are simply skipped.
			if i == 0 {
				return "", false
			}
			continue
		}
		if d.From == "" {
			d.From = remoteAddr
		}
		if i == 0 {
			debugPrintf("Debug: received data item 1 NetData: %+v", d)
			if !tp.checkRights(d, remoteAddr) {
				return "", false
			}
			nodeuid = d.From
			tp.connPool[nodeuid] = conn
			if d.Operation != IDENTITY {
				conn.Short = true
			} else {
				log.Printf(" * -- Client %v (%v) connected --", nodeuid, remoteAddr)
			}
			conn.Usable = true
		}
		tp.apiReadChan <- d
	}
	return nodeuid, true
}
// sReader reads data on the server side until a read fails, then removes and
// closes the connection. Long connections get a fresh read deadline of
// tp.timeout before every read.
func (tp *TP) sReader(nodeuid string) {
	defer tp.closeConn(nodeuid, false)

	conn := tp.getConn(nodeuid)
	if conn == nil {
		return
	}
	for {
		if !conn.Short {
			conn.SetReadDeadline(time.Now().Add(tp.timeout))
		}
		if !tp.read(conn) {
			return
		}
	}
}
// sWriter sends queued packets on the server side. A short connection is
// done after its first packet; the connection is removed and closed on exit.
func (tp *TP) sWriter(nodeuid string) {
	defer tp.closeConn(nodeuid, false)

	conn := tp.getConn(nodeuid)
	if conn == nil {
		return
	}
	for {
		tp.send(<-conn.WriteChan)
		if conn.Short {
			return
		}
	}
}
================================================
FILE: app/distribute/teleport/teleport.go
================================================
// Package teleport provides a high-concurrency API framework for distributed systems.
// It uses socket duplex communication for peer-to-peer S/C, supports long and short connections,
// auto-reconnect after disconnect, and JSON for data transport.
package teleport
import (
"encoding/json"
"log"
"time"
"github.com/andeya/gust/result"
)
// Run mode constants, stored in TP.mode and reported by GetMode.
// The zero value (neither constant) means no mode has been started yet.
const (
// SERVER is set by Server.
SERVER = iota + 1
// CLIENT is set by Client.
CLIENT
)
// Reserved operation names for API handlers, followed by package defaults.
const (
// IDENTITY is the operation a long-connection client sends first to announce itself.
IDENTITY = "+identity+"
// HEARTBEAT is the operation the client writer sends when nothing was queued for one timeout period.
HEARTBEAT = "+heartbeat+"
// DEFAULT_PACK_HEADER is the packet header New configures the protocol with.
DEFAULT_PACK_HEADER = "andeya"
// DEFAULT_SERVER_UID is the UID used for the server when none was set.
DEFAULT_SERVER_UID = "server"
// DEFAULT_PORT is the port suffix used when none was given.
DEFAULT_PORT = ":8080"
// DEFAULT_TIMEOUT_S is the server timeout used when none was set (20e9 ns = 20 s as a time.Duration).
DEFAULT_TIMEOUT_S = 20e9
// DEFAULT_TIMEOUT_C is the long-connection client timeout used when none was set (15 s).
DEFAULT_TIMEOUT_C = 15e9
// LOOP_TIMEOUT is the sleep between retries in the polling loops (1 s).
LOOP_TIMEOUT = 1e9
)
// Teleport is the public API of a teleport node; New returns the *TP implementation.
// The Set* methods return the receiver so calls can be chained.
type Teleport interface {
// Server starts server mode; port defaults to DEFAULT_PORT.
Server(port ...string)
// Client starts client mode and connects to serverAddr+port; isShort[0] selects short connections.
Client(serverAddr string, port string, isShort ...bool)
// Request queues a packet for nodeuid[0], or for some usable node when nodeuid is omitted.
Request(body interface{}, operation string, flag string, nodeuid ...string)
// SetAPI sets the application API handlers.
SetAPI(api API) Teleport
// Close disconnects the given nodes, or all nodes when none are given.
Close(nodeuid ...string)
// SetUID sets this node's UID and, optionally, the server's UID.
SetUID(mine string, server ...string) Teleport
// SetPackHeader sets the packet header string.
SetPackHeader(string) Teleport
// SetApiRChan sets the capacity of the global receive channel.
SetApiRChan(int) Teleport
// SetConnWChan sets the capacity of each connection's write channel.
SetConnWChan(int) Teleport
// SetConnBuffer sets the size of each connection's receive buffer.
SetConnBuffer(int) Teleport
// SetTimeout sets the connection timeout (heartbeat interval).
SetTimeout(time.Duration) Teleport
// GetMode returns the run mode (SERVER, CLIENT, or 0 before start).
GetMode() int
// CountNodes returns the number of usable connections.
CountNodes() int
}
// TP is the Teleport implementation shared by server and client mode.
type TP struct {
// uid is this node's identifier.
uid string
// mode is SERVER or CLIENT once started.
mode int
// port is the port suffix, e.g. ":8080".
port string
// serverAddr is the server host dialed in client mode.
serverAddr string
// connPool maps node UID to its connection; closeConn stores nil for an
// entry that is waiting to be reconnected.
connPool map[string]*Connect
// timeout is the server read deadline and the client heartbeat interval.
timeout time.Duration
// Protocol provides packet framing (Packet/Unpack).
*Protocol
// apiReadChan carries every decoded inbound packet to apiHandle.
apiReadChan chan *NetData
// connWChanCap is the capacity given to each new connection's WriteChan.
connWChanCap int
// connBufferLen is the size given to each new connection's read Buffer.
connBufferLen int
// api maps operation names to handlers.
api API
*tpServer
*tpClient
}
// API maps an operation name to the Handle that processes packets for it.
type API map[string]Handle
// Handle processes requests.
type Handle interface {
// Process handles one inbound packet and returns the response to send back.
// A nil return means no response is sent.
Process(*NetData) *NetData
}
// New creates a Teleport instance with the default packet header, an empty
// API table, a 4096-slot receive channel, 2048-slot per-connection write
// queues and 1024-byte per-connection read buffers.
func New() Teleport {
	tp := new(TP)
	tp.connPool = make(map[string]*Connect)
	tp.api = API{}
	tp.Protocol = NewProtocol(DEFAULT_PACK_HEADER)
	tp.apiReadChan = make(chan *NetData, 4096)
	tp.connWChanCap = 2048
	tp.connBufferLen = 1024
	tp.tpServer = new(tpServer)
	tp.tpClient = new(tpClient)
	return tp
}
// --- Interface implementation ---

// SetUID sets this node's UID to mine and, when server[0] is given, the UID
// under which the server is addressed in client mode.
func (tp *TP) SetUID(mine string, server ...string) Teleport {
	tp.uid = mine
	if len(server) > 0 {
		tp.tpClient.serverUID = server[0]
	}
	return tp
}
// SetAPI replaces the application API table with api and returns the
// receiver for chaining. The map is stored as given, not copied.
func (tp *TP) SetAPI(api API) Teleport {
tp.api = api
return tp
}
// Request queues a packet for a node.
//
// With an explicit nodeuid[0] the packet goes to that node; the call blocks,
// polling every LOOP_TIMEOUT, until that node has a usable connection.
// Without nodeuid the call blocks until any usable connection exists and
// picks one of them (map iteration order, so effectively arbitrary).
//
// Pool entries may be nil while a connection is waiting to be reconnected.
// They are skipped during selection, and selection is retried until a node
// is actually found, so a connection that drops at the wrong moment can no
// longer cause a nil dereference or an index-out-of-range panic.
func (tp *TP) Request(body interface{}, operation string, flag string, nodeuid ...string) {
	var target string
	if len(nodeuid) > 0 {
		target = nodeuid[0]
	} else {
	pick:
		for {
			for uid, candidate := range tp.connPool {
				if candidate != nil && candidate.Usable {
					target = uid
					break pick
				}
			}
			time.Sleep(LOOP_TIMEOUT)
		}
	}

	conn := tp.getConn(target)
	for conn == nil || !conn.Usable {
		time.Sleep(LOOP_TIMEOUT)
		conn = tp.getConn(target)
	}
	conn.WriteChan <- NewNetData(tp.uid, target, operation, flag, body)
}
// Close disconnects the given nodes, or every node in the pool when none are
// given. In client mode it also asks the dial loop to stop; in server mode it
// also closes the listener.
func (tp *TP) Close(nodeuid ...string) {
	switch {
	case tp.mode == CLIENT:
		tp.tpClient.mustClose = true
	case tp.mode == SERVER && tp.tpServer.listener != nil:
		tp.tpServer.listener.Close()
		log.Printf(" * -- Server stopped listening on %v --", tp.port)
	}

	targets := nodeuid
	if len(targets) == 0 {
		// Snapshot the keys first so the pool is not modified while ranging over it.
		targets = make([]string, 0, len(tp.connPool))
		for uid := range tp.connPool {
			targets = append(targets, uid)
		}
	}

	for _, uid := range targets {
		conn := tp.connPool[uid]
		delete(tp.connPool, uid)
		if conn == nil {
			continue
		}
		conn.Close()
		tp.closeMsg(uid, conn.Addr(), conn.Short)
	}
}
// SetPackHeader changes the header string used to frame packets and returns
// the receiver for chaining.
func (tp *TP) SetPackHeader(header string) Teleport {
	proto := tp.Protocol
	proto.ReSet(header)
	return tp
}
// SetApiRChan replaces the global receive channel with a new one of the
// given capacity and returns the receiver for chaining.
func (tp *TP) SetApiRChan(length int) Teleport {
	inbound := make(chan *NetData, length)
	tp.apiReadChan = inbound
	return tp
}
// SetConnWChan sets the write-channel capacity used for connections created
// from now on and returns the receiver for chaining.
func (tp *TP) SetConnWChan(length int) Teleport {
tp.connWChanCap = length
return tp
}
// SetConnBuffer sets the receive-buffer size used for connections created
// from now on and returns the receiver for chaining.
func (tp *TP) SetConnBuffer(length int) Teleport {
tp.connBufferLen = length
return tp
}
// SetTimeout sets the connection timeout: the read deadline on the server
// side and the heartbeat interval on the client side. It returns the
// receiver for chaining.
func (tp *TP) SetTimeout(long time.Duration) Teleport {
tp.timeout = long
return tp
}
// GetMode returns the run mode: SERVER, CLIENT, or 0 when neither Server nor
// Client has been called yet.
func (tp *TP) GetMode() int {
return tp.mode
}
// CountNodes returns the number of pool entries that hold a usable connection.
func (tp *TP) CountNodes() int {
	var usable int
	for _, conn := range tp.connPool {
		if conn == nil || !conn.Usable {
			continue
		}
		usable++
	}
	return usable
}
// read performs one Read on conn, appends the received bytes to the
// connection's pending buffer and decodes whatever complete frames are
// available. It returns false when the read failed or returned no data.
func (tp *TP) read(conn *Connect) bool {
	n, err := conn.Read(conn.Buffer)
	if failed := result.TryErrVoid(err).IsErr(); failed || n == 0 {
		return false
	}
	conn.TmpBuffer = append(conn.TmpBuffer, conn.Buffer[:n]...)
	tp.save(conn)
	return true
}
// getConn looks up the connection registered for nodeuid; it returns nil when
// there is no entry or the entry is nil.
func (tp *TP) getConn(nodeuid string) *Connect {
	conn := tp.connPool[nodeuid]
	return conn
}
// getConnAddr returns the remote address of the connection registered for
// nodeuid, or "" when there is none.
func (tp *TP) getConnAddr(nodeuid string) string {
	if conn := tp.getConn(nodeuid); conn != nil {
		return conn.Addr()
	}
	return ""
}
// closeConn closes the connection registered for nodeuid, if any.
// With reconnect the pool entry is kept as nil so the client loop knows to
// dial again; without it the entry is removed. Calling it for an unknown or
// already-closed node is a no-op.
func (tp *TP) closeConn(nodeuid string, reconnect bool) {
	conn, found := tp.connPool[nodeuid]
	if !found {
		return
	}

	if reconnect {
		tp.connPool[nodeuid] = nil
	} else {
		delete(tp.connPool, nodeuid)
	}

	if conn != nil {
		conn.Close()
		tp.closeMsg(nodeuid, conn.Addr(), conn.Short)
	}
}
// closeMsg logs that a long connection was closed; short connections are
// closed silently.
func (tp *TP) closeMsg(uid, addr string, short bool) {
	if short {
		return
	}
	if tp.mode == SERVER {
		log.Printf(" * -- Disconnected from client %v (%v) --", uid, addr)
	} else if tp.mode == CLIENT {
		log.Printf(" * -- Disconnected from server %v --", addr)
	}
}
// send JSON-encodes data, frames it and writes it to the connection
// registered for data.To. An empty From is filled with this node's UID.
//
// Encoding errors, a missing connection and write errors are reported
// through the debug log only; the packet is dropped in each case. The write
// error is now checked so that a failed write is no longer logged as success.
func (tp *TP) send(data *NetData) {
	if data.From == "" {
		data.From = tp.uid
	}

	payload := result.Ret(json.Marshal(*data)).UnwrapOrElse(func(err error) []byte {
		debugPrintln("Debug: send data encode error", err)
		return nil
	})
	if payload == nil {
		return
	}

	conn := tp.getConn(data.To)
	if conn == nil {
		debugPrintf("Debug: send data connection closed: %+v", data)
		return
	}

	if _, err := conn.Write(tp.Packet(payload)); err != nil {
		debugPrintf("Debug: send data write error: %v: %+v", err, data)
		return
	}
	debugPrintf("Debug: send data success: %+v", data)
}
// save extracts every complete frame from the connection's pending buffer,
// decodes each one as NetData and forwards it to the API dispatcher.
// Frames that fail to decode are skipped; an empty From is filled with the
// connection's remote address.
func (tp *TP) save(conn *Connect) {
	debugPrintf("Debug: received data bytes: %v", conn.TmpBuffer)

	var frames [][]byte
	frames, conn.TmpBuffer = tp.Unpack(conn.TmpBuffer)

	for _, frame := range frames {
		debugPrintf("Debug: received data before decode: %v", string(frame))
		packet := new(NetData)
		decodeRes := result.RetVoid(json.Unmarshal(frame, packet))
		if decodeRes.IsErr() {
			debugPrintf("Debug: received data decode error: %v", decodeRes.UnwrapErr())
			continue
		}
		if packet.From == "" {
			packet.From = conn.Addr()
		}
		tp.apiReadChan <- packet
		debugPrintf("Debug: received data NetData: %+v", packet)
	}
}
// apiHandle is the dispatcher loop: it takes every inbound packet from
// apiReadChan and processes it in its own goroutine.
//
// For a packet whose operation has a registered handler, the handler's
// response (if any) is completed with defaults taken from the request
// (To = requester, Operation, From = this node as addressed by the requester,
// Flag) and queued on the target connection. A nil response on a short
// connection closes that connection.
//
// For an unknown operation an LLLEGAL error is sent back to the requester.
// The requester is req.From. The earlier code looked the connection up under
// req.To (this node's own UID), so the reply was never delivered and the log
// named the wrong node. A packet that is itself an LLLEGAL reply is only
// logged, never answered, so two peers that both lack a handler cannot
// bounce errors back and forth forever.
func (tp *TP) apiHandle() {
	for {
		req := <-tp.apiReadChan
		go func(req *NetData) {
			// self is this node as addressed by the peer; peer is the sender.
			// Captured up front because handlers and ReturnError mutate req.
			operation, self, peer, flag := req.Operation, req.To, req.From, req.Flag
			wasIllegalReply := req.Status == LLLEGAL

			handle, ok := tp.api[operation]
			if !ok {
				localAddr := ""
				if peerConn := tp.getConn(peer); peerConn != nil {
					localAddr = peerConn.LocalAddr().String()
				}
				if tp.mode == SERVER {
					if !wasIllegalReply {
						tp.autoErrorHandle(req, LLLEGAL, "Server ("+localAddr+") has no API: "+operation, peer)
					}
					log.Printf("Client %v (%v) requesting non-existent API: %v", peer, tp.getConnAddr(peer), operation)
				} else {
					if !wasIllegalReply {
						tp.autoErrorHandle(req, LLLEGAL, "Client "+self+" ("+localAddr+") has no API: "+operation, peer)
					}
					log.Printf("Server (%v) requesting non-existent API: %v", tp.getConnAddr(peer), operation)
				}
				return
			}

			resp := handle.Process(req)
			if resp == nil {
				// Nothing to send back: a short connection has served its purpose.
				if conn := tp.getConn(peer); conn != nil && conn.Short {
					tp.closeConn(peer, false)
				}
				return
			}

			if resp.To == "" {
				resp.To = peer
			}
			conn := tp.getConn(resp.To)
			if conn == nil {
				tp.autoErrorHandle(req, FAILURE, "", peer)
				return
			}
			if resp.Operation == "" {
				resp.Operation = operation
			}
			if resp.From == "" {
				resp.From = self
			}
			if resp.Flag == "" {
				resp.Flag = flag
			}
			conn.WriteChan <- resp
		}(req)
	}
}
// autoErrorHandle rewrites data into an error response (status, msg) from
// this node to reqFrom and queues it on reqFrom's connection. It returns
// false, leaving data untouched, when reqFrom has no connection.
func (tp *TP) autoErrorHandle(data *NetData, status int, msg string, reqFrom string) bool {
	if oldConn := tp.getConn(reqFrom); oldConn != nil {
		respErr := ReturnError(data, status, msg)
		respErr.From, respErr.To = tp.uid, reqFrom
		oldConn.WriteChan <- respErr
		return true
	}
	return false
}
// checkRights accepts a connection only when its first packet is addressed
// to this node's UID; a rejection is logged with the remote address.
func (tp *TP) checkRights(data *NetData, addr string) bool {
	if data.To == tp.uid {
		return true
	}
	log.Printf("Unknown connection (%v) provided wrong server identifier, request rejected", addr)
	return false
}
// reserveAPI registers the system-reserved IDENTITY and HEARTBEAT handlers,
// overriding any application handlers stored under those names.
//
// The API table is created first when it is nil (for example after
// SetAPI(nil)), because writing to a nil map would panic.
func (tp *TP) reserveAPI() {
	if tp.api == nil {
		tp.api = API{}
	}
	tp.api[IDENTITY] = identi
	tp.api[HEARTBEAT] = beat
}
// identi and beat are the shared handler instances that reserveAPI registers
// for the IDENTITY and HEARTBEAT operations.
var identi, beat = new(identity), new(heartbeat)
// identity is the handler for the reserved IDENTITY operation.
type identity struct{}
// Process ignores the packet and returns nil, so no response is sent.
func (*identity) Process(receive *NetData) *NetData {
return nil
}
// heartbeat is the handler for the reserved HEARTBEAT operation.
type heartbeat struct{}
// Process ignores the packet and returns nil, so no response is sent.
func (*heartbeat) Process(receive *NetData) *NetData {
return nil
}
================================================
FILE: app/distribute/teleport/teleport_test.go
================================================
package teleport
import (
"encoding/json"
"net"
"strconv"
"sync"
"testing"
"time"
)
// freePort asks the OS for an unused loopback TCP port, releases it and
// returns the port number as a decimal string.
func freePort(t *testing.T) string {
	listener, err := net.Listen("tcp", "127.0.0.1:0")
	if err != nil {
		t.Fatalf("freePort: %v", err)
	}
	defer listener.Close()

	tcpAddr := listener.Addr().(*net.TCPAddr)
	return strconv.Itoa(tcpAddr.Port)
}
// TestNew checks that a fresh instance has no mode and no connected nodes.
func TestNew(t *testing.T) {
	tp := New()
	if tp == nil {
		t.Fatal("New returned nil")
	}
	if mode := tp.GetMode(); mode != 0 {
		t.Errorf("GetMode = %d, want 0", mode)
	}
	if nodes := tp.CountNodes(); nodes != 0 {
		t.Errorf("CountNodes = %d, want 0", nodes)
	}
}
// TestTP_SetUID checks that SetUID stores the node UID and the optional server UID.
func TestTP_SetUID(t *testing.T) {
	tp := New().(*TP)

	tp.SetUID("mine")
	if got := tp.uid; got != "mine" {
		t.Errorf("uid = %q, want mine", got)
	}

	tp.SetUID("client", "server")
	if got := tp.tpClient.serverUID; got != "server" {
		t.Errorf("serverUID = %q, want server", got)
	}
}
// TestTP_SetPackHeader checks that the protocol header is replaced.
func TestTP_SetPackHeader(t *testing.T) {
	tp := New().(*TP)
	tp.SetPackHeader("custom")
	if got := tp.Protocol.header; got != "custom" {
		t.Errorf("header = %q, want custom", got)
	}
}
// TestTP_SetApiRChan checks that the receive channel gets the requested capacity.
func TestTP_SetApiRChan(t *testing.T) {
	tp := New().(*TP)
	tp.SetApiRChan(100)
	if got := cap(tp.apiReadChan); got != 100 {
		t.Errorf("apiReadChan cap = %d, want 100", got)
	}
}
// TestTP_SetConnWChan checks that the per-connection write-queue capacity is stored.
func TestTP_SetConnWChan(t *testing.T) {
	tp := New().(*TP)
	tp.SetConnWChan(512)
	if got := tp.connWChanCap; got != 512 {
		t.Errorf("connWChanCap = %d, want 512", got)
	}
}
// TestTP_SetConnBuffer checks that the per-connection buffer size is stored.
func TestTP_SetConnBuffer(t *testing.T) {
	tp := New().(*TP)
	tp.SetConnBuffer(2048)
	if got := tp.connBufferLen; got != 2048 {
		t.Errorf("connBufferLen = %d, want 2048", got)
	}
}
// TestTP_SetTimeout checks that the timeout is stored as given.
func TestTP_SetTimeout(t *testing.T) {
	const want = 5 * time.Second
	tp := New().(*TP)
	tp.SetTimeout(want)
	if got := tp.timeout; got != want {
		t.Errorf("timeout = %v, want %v", got, time.Duration(want))
	}
}
// TestTP_SetAPI checks that handlers passed to SetAPI become reachable through tp.api.
func TestTP_SetAPI(t *testing.T) {
	tp := New().(*TP)
	tp.SetAPI(API{"test": &identity{}})
	if handler := tp.api["test"]; handler == nil {
		t.Error("SetAPI did not set handler")
	}
}
// TestTP_ServerClient_Pipe starts a server and a long-connection client on a
// free loopback port, sends one echo request from the client and then shuts
// both sides down. It only verifies that none of this blocks or panics.
func TestTP_ServerClient_Pipe(t *testing.T) {
	listenPort := ":" + freePort(t)

	srv := New().(*TP)
	srv.SetUID("server").SetTimeout(100 * time.Millisecond)
	srv.api["echo"] = &echoHandle{}
	srv.Server(listenPort)
	time.Sleep(50 * time.Millisecond)

	cli := New().(*TP)
	cli.SetUID("client1").SetTimeout(100 * time.Millisecond)
	cli.api["echo"] = &echoHandle{}
	cli.Client("127.0.0.1", listenPort)
	time.Sleep(100 * time.Millisecond)

	var pending sync.WaitGroup
	pending.Add(1)
	go func() {
		defer pending.Done()
		cli.Request("hello", "echo", "", "server")
	}()

	time.Sleep(200 * time.Millisecond)
	srv.Close()
	cli.Close()
	pending.Wait()
}
// echoHandle is a test handler that sends the request body straight back to its sender.
type echoHandle struct{}

// Process answers with the same body and operation, swapping sender and receiver.
func (*echoHandle) Process(receive *NetData) *NetData {
	operation, replyTo, replyFrom := receive.Operation, receive.From, receive.To
	return ReturnData(receive.Body, operation, replyTo, replyFrom)
}
// TestTP_CloseSpecificNode connects a client to a server and closes each side
// by naming the peer's UID. It only verifies that this does not panic.
func TestTP_CloseSpecificNode(t *testing.T) {
	listenPort := ":" + freePort(t)

	srv := New().(*TP)
	srv.SetUID("server").SetTimeout(100 * time.Millisecond)
	srv.api["echo"] = &echoHandle{}
	srv.Server(listenPort)
	time.Sleep(50 * time.Millisecond)

	cli := New().(*TP)
	cli.SetUID("client1").SetTimeout(100 * time.Millisecond)
	cli.api["echo"] = &echoHandle{}
	cli.Client("127.0.0.1", listenPort)
	time.Sleep(100 * time.Millisecond)

	srv.Close("client1")
	cli.Close("server")
}
// TestConnect_Close checks that Close on a wrapped pipe end reports no error.
func TestConnect_Close(t *testing.T) {
	client, server := net.Pipe()
	defer server.Close()

	_, connect := NewConnect(client, 64, 16)
	err := connect.Close()
	if err != nil {
		t.Errorf("Close() = %v", err)
	}
}
// TestTP_CheckRightsReject dials the server directly and sends an identity
// packet addressed to the wrong server UID. It only verifies that the server
// copes with the rejected connection without blocking or panicking.
func TestTP_CheckRightsReject(t *testing.T) {
	listenPort := ":" + freePort(t)

	srv := New().(*TP)
	srv.SetUID("server").SetTimeout(100 * time.Millisecond)
	srv.api["echo"] = &echoHandle{}
	srv.Server(listenPort)
	time.Sleep(50 * time.Millisecond)

	conn, err := net.Dial("tcp", "127.0.0.1"+listenPort)
	if err != nil {
		t.Fatalf("Dial: %v", err)
	}
	defer conn.Close()

	payload, _ := json.Marshal(&NetData{From: "evil", To: "wrongserver", Operation: IDENTITY, Body: nil})
	frame := NewProtocol(DEFAULT_PACK_HEADER).Packet(payload)
	conn.Write(frame)
	conn.Close()

	time.Sleep(100 * time.Millisecond)
	srv.Close()
}
// TestDebugPrint exercises the debug helpers with Debug enabled and switches
// Debug off again afterwards.
func TestDebugPrint(t *testing.T) {
	defer func() { Debug = false }()
	Debug = true

	debugPrintf("test %v", 1)
	debugPrintln("test")
}
// TestTP_GetConnAddr checks that an unknown node yields "" and a registered
// node yields a non-empty address.
func TestTP_GetConnAddr(t *testing.T) {
	tp := New().(*TP)
	if addr := tp.getConnAddr("x"); addr != "" {
		t.Errorf("getConnAddr(\"x\") = %q, want empty", addr)
	}

	client, server := net.Pipe()
	defer client.Close()
	defer server.Close()

	_, connect := NewConnect(client, 64, 16)
	connect.Usable = true
	tp.connPool["node1"] = connect

	if addr := tp.getConnAddr("node1"); addr == "" {
		t.Error("getConnAddr(\"node1\") = empty")
	}
}
================================================
FILE: app/distribute/teleport/util.go
================================================
package teleport
import (
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"hash/crc32"
"hash/fnv"
"strconv"
)
// MakeHash returns the CRC-32 (IEEE polynomial) checksum of s as lowercase
// hexadecimal without zero padding, e.g. "0" for the empty string.
func MakeHash(s string) string {
	sum := crc32.ChecksumIEEE([]byte(s))
	return fmt.Sprintf("%x", sum)
}
// HashString returns the 64-bit FNV-1 hash of the bytes of encode.
func HashString(encode string) uint64 {
	hasher := fnv.New64()
	data := []byte(encode)
	hasher.Write(data)
	return hasher.Sum64()
}
// MakeUnique fingerprints obj by hashing its JSON encoding with HashString and
// rendering the result as a base-10 string. The JSON encoding error is
// discarded, so an unencodable obj is hashed as empty input.
func MakeUnique(obj interface{}) string {
	encoded, _ := json.Marshal(obj)
	sum := HashString(string(encoded))
	return strconv.FormatUint(sum, 10)
}
// MakeMd5 fingerprints obj by taking the MD5 digest of its JSON encoding and
// returning the first length characters of the lowercase hex digest.
//
// length is clamped to the range [0, 32]: values above 32 return the full
// digest, and negative values return "" instead of panicking on the slice.
// The JSON encoding error is discarded, so an unencodable obj is hashed as
// empty input.
func MakeMd5(obj interface{}, length int) string {
	switch {
	case length > 32:
		length = 32
	case length < 0:
		length = 0
	}
	baseString, _ := json.Marshal(obj)
	sum := md5.Sum(baseString)
	return hex.EncodeToString(sum[:])[:length]
}
================================================
FILE: app/distribute/teleport/util_test.go
================================================
package teleport
import (
"testing"
)
// TestMakeHash pins MakeHash against known hex digests.
func TestMakeHash(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{input: "", expected: "0"},
		{input: "a", expected: "e8b7be43"},
		{input: "hello", expected: "3610a686"},
	}
	for i := range cases {
		tc := cases[i]
		if actual := MakeHash(tc.input); actual != tc.expected {
			t.Errorf("MakeHash(%q) = %q, want %q", tc.input, actual, tc.expected)
		}
	}
}
// TestHashString checks that HashString never yields 0 for the non-empty
// sample inputs; the empty input is hashed but not checked.
func TestHashString(t *testing.T) {
	inputs := []string{"", "x", "hello world"}
	for _, in := range inputs {
		sum := HashString(in)
		if in == "" {
			continue
		}
		if sum == 0 {
			t.Errorf("HashString(%q) = 0", in)
		}
	}
}
// TestMakeUnique checks that MakeUnique returns a non-empty fingerprint for a
// nil value, a string, and a map.
func TestMakeUnique(t *testing.T) {
	samples := []interface{}{
		nil,
		"s",
		map[string]int{"a": 1},
	}
	for _, sample := range samples {
		if fingerprint := MakeUnique(sample); fingerprint == "" {
			t.Errorf("MakeUnique(%v) = empty", sample)
		}
	}
}
// TestMakeMd5 checks that MakeMd5 returns exactly the requested number of
// characters, capped at 32.
func TestMakeMd5(t *testing.T) {
	cases := []struct {
		value interface{}
		size  int
	}{
		{value: "x", size: 8},
		{value: 123, size: 16},
		{value: []int{1, 2}, size: 32},
		{value: "y", size: 64},
	}
	for _, tc := range cases {
		limit := tc.size
		if limit > 32 {
			limit = 32
		}
		out := MakeMd5(tc.value, tc.size)
		if len(out) == limit {
			continue
		}
		t.Errorf("MakeMd5(%v, %d) len = %d, want %d", tc.value, tc.size, len(out), limit)
	}
}
================================================
FILE: app/downloader/downloader.go
================================================
// Package downloader defines the page downloader interface.
package downloader
import (
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/spider"
)
// Downloader is the page downloader interface.
// Implement it by providing Download, which receives the spider and the
// request to fetch and returns a *spider.Context for that request.
type Downloader interface {
Download(*spider.Spider, *request.Request) *spider.Context
}
================================================
FILE: app/downloader/downloader_surfer.go
================================================
package downloader
import (
"errors"
"net/http"
"net/http/cookiejar"
"github.com/andeya/gust/result"
"github.com/andeya/gust/syncutil"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/downloader/surfer"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/config"
)
// Surfer is the Downloader implementation built on the surfer package.
type Surfer struct {
// surf serves requests whose downloader ID is request.SurfID.
surf surfer.Surfer
}
var (
// cookieJar is shared by every downloader constructed in this file.
// The error returned by cookiejar.New is discarded.
cookieJar, _ = cookiejar.New(nil)
// SurferDownloader is the package-level Surfer instance.
SurferDownloader = &Surfer{
surf: surfer.New(cookieJar),
}
)
// lazyPhantom holds the PhantomJS downloader, built from the configured
// PhantomJS path, the temp directory and the shared cookie jar.
// NOTE(review): presumably constructed on first TryGetValue call — confirm in syncutil.
var lazyPhantom = syncutil.NewLazyValueWithFunc(func() result.Result[surfer.Surfer] {
return result.Ok[surfer.Surfer](surfer.NewPhantom(config.Conf().PhantomJS, config.PhantomJSTemp, cookieJar))
})
// lazyChrome holds the Chrome downloader, built with the shared cookie jar.
// NOTE(review): presumably constructed on first TryGetValue call — confirm in syncutil.
var lazyChrome = syncutil.NewLazyValueWithFunc(func() result.Result[surfer.Surfer] {
return result.Ok[surfer.Surfer](surfer.NewChrome(cookieJar))
})
// Download fetches cReq with the downloader selected by its downloader ID
// (Surf, PhantomJS or Chrome) and returns a context carrying the response
// and/or the error.
//
// A response with status code >= 400 is kept on the context but reported as an
// error. A downloader ID that matches none of the known downloaders is
// reported as an error as well, instead of silently producing a context with
// neither a response nor an error.
func (s *Surfer) Download(sp *spider.Spider, cReq *request.Request) *spider.Context {
	ctx := spider.GetContext(sp, cReq)

	// Pick the backend first so the result handling is written only once.
	var backend surfer.Surfer
	switch cReq.GetDownloaderID() {
	case request.SurfID:
		backend = s.surf
	case request.PhantomID:
		backend = lazyPhantom.TryGetValue().Unwrap()
	case request.ChromeID:
		backend = lazyChrome.TryGetValue().Unwrap()
	default:
		ctx.SetResponse(nil).SetError(errors.New("unsupported downloader ID"))
		return ctx
	}

	var resp *http.Response
	var err error
	r := backend.Download(cReq)
	if r.IsErr() {
		err = r.UnwrapErr()
	} else {
		resp = r.Unwrap()
	}
	if resp != nil && resp.StatusCode >= 400 {
		err = errors.New("response status " + resp.Status)
	}
	ctx.SetResponse(resp).SetError(err)
	return ctx
}
================================================
FILE: app/downloader/downloader_test.go
================================================
package downloader
import (
"net/http"
"net/http/httptest"
"testing"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/spider"
)
// TestSurferDownloader_implementsInterface is a compile-time check that
// SurferDownloader satisfies the Downloader interface.
func TestSurferDownloader_implementsInterface(t *testing.T) {
var _ Downloader = SurferDownloader
}
// makeSpiderNotStopping builds a spider with the given name and an empty rule
// tree, registers it, and returns it.
func makeSpiderNotStopping(name string) *spider.Spider {
	tree := &spider.RuleTree{Trunk: map[string]*spider.Rule{}}
	s := &spider.Spider{Name: name, RuleTree: tree}
	s.Register()
	return s
}
// TestSurferDownloader_Download_SurfID checks that a 200 response fetched
// through the default downloader yields a context with a response and no error.
func TestSurferDownloader_Download_SurfID(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("ok"))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	sp := makeSpiderNotStopping("DownloaderTestSpider1")
	req := &request.Request{URL: srv.URL, Rule: "r"}
	req.Prepare()

	got := SurferDownloader.Download(sp, req)
	if got == nil {
		t.Fatal("Download returned nil context")
	}
	if err := got.GetError(); err != nil {
		t.Errorf("GetError() = %v, want nil", err)
	}
	if got.Response == nil {
		t.Fatal("Response is nil")
	}
	if code := got.Response.StatusCode; code != 200 {
		t.Errorf("StatusCode = %d, want 200", code)
	}
}
// TestSurferDownloader_Download_SurfID_error checks that a 500 response is
// surfaced as an error on the returned context.
func TestSurferDownloader_Download_SurfID_error(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusInternalServerError)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	sp := makeSpiderNotStopping("DownloaderTestSpider2")
	req := &request.Request{URL: srv.URL, Rule: "r"}
	req.Prepare()

	got := SurferDownloader.Download(sp, req)
	if got == nil {
		t.Fatal("Download returned nil context")
	}
	if got.GetError() == nil {
		t.Error("GetError() = nil, want error for 5xx")
	}
}
// TestSurferDownloader_Download_SurfID_4xx checks that a 404 response is
// surfaced as an error on the returned context.
func TestSurferDownloader_Download_SurfID_4xx(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	sp := makeSpiderNotStopping("DownloaderTestSpider4xx")
	req := &request.Request{URL: srv.URL, Rule: "r"}
	req.Prepare()

	got := SurferDownloader.Download(sp, req)
	if got == nil {
		t.Fatal("Download returned nil context")
	}
	if got.GetError() == nil {
		t.Error("GetError() = nil, want error for 4xx")
	}
}
// TestSurferDownloader_Download_SurfID_badURL checks that a request to an
// unreachable address yields a context carrying an error.
func TestSurferDownloader_Download_SurfID_badURL(t *testing.T) {
	sp := makeSpiderNotStopping("DownloaderTestSpider3")
	req := &request.Request{URL: "http://localhost:0/nonexistent", Rule: "r"}
	req.Prepare()

	got := SurferDownloader.Download(sp, req)
	if got == nil {
		t.Fatal("Download returned nil context")
	}
	if got.GetError() == nil {
		t.Error("GetError() = nil, want error for failed request")
	}
}
================================================
FILE: app/downloader/request/request.go
================================================
// Package request provides encapsulation and deduplication of crawl requests.
package request
import (
"crypto/md5"
"encoding/hex"
"encoding/json"
"net/http"
"net/url"
"strings"
"sync"
"time"
"github.com/andeya/gust/option"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/util"
)
// Request describes a single page waiting to be crawled.
type Request struct {
Spider string // spider name, auto-set, do not set manually
URL string // target URL, required
Rule string // rule node name for parsing response, required
Method string // GET POST POST-M HEAD
Header http.Header // request headers
EnableCookie bool // whether to use cookies, set in Spider.EnableCookie
PostData string // POST values
DialTimeout time.Duration // dial timeout (dial tcp: i/o timeout)
ConnTimeout time.Duration // connection timeout (WSARecv tcp: i/o timeout)
TryTimes int // max download retry attempts
RetryPause time.Duration // wait time before retry after download failure
RedirectTimes int // max redirects; 0=unlimited, <0=no redirects
Temp Temp // temporary data
TempIsJSON map[string]bool // marks Temp fields stored as JSON; auto-set, do not set manually
Priority int // scheduling priority, default 0 (min priority)
Reloadable bool // whether the link can be re-downloaded
// DownloaderID selects the downloader: 0=Surf (high concurrency, full features),
// 1=PhantomJS (strong anti-block, slow, low concurrency), 2=Chrome (headless browser).
// See the SurfID, PhantomID and ChromeID constants.
DownloaderID int
proxy string // proxy, auto-set when UI enables proxy
unique string // unique ID, computed and cached by Unique
lock sync.RWMutex // taken by GetTemp, GetTempOpt, SetTemp and SetTemps around Temp/TempIsJSON access
}
// Defaults applied by Request.Prepare when the corresponding field is unset.
const (
DefaultDialTimeout = 2 * time.Minute // default server request timeout
DefaultConnTimeout = 2 * time.Minute // default download timeout
DefaultTryTimes = 3 // default max download attempts
DefaultRetryPause = 2 * time.Second // default pause before retry
)
// Valid values of Request.DownloaderID. Prepare resets any value outside the
// range [SurfID, ChromeID] to SurfID.
const (
SurfID = 0 // Surf downloader (native Go), do not change
PhantomID = 1 // PhantomJS downloader (fallback, rarely used)
ChromeID = 2 // Chromium headless browser downloader
)
// Prepare normalizes the request and fills in defaults before it is sent.
//
// URL is parsed and re-serialized; a parse failure is returned as the error
// result. Method is upper-cased and defaults to GET. Negative dial/connection
// timeouts become 0 and zero ones take the package defaults. TryTimes of 0 and
// a non-positive RetryPause take their defaults, a negative Priority becomes 0,
// and a DownloaderID outside [SurfID, ChromeID] becomes SurfID. Nil Header,
// Temp and TempIsJSON are replaced by empty values.
func (r *Request) Prepare() result.VoidResult {
	parsed, err := url.Parse(r.URL)
	if err != nil {
		return result.TryErrVoid(err)
	}
	r.URL = parsed.String()

	// Upper-casing the empty string is a no-op, so default afterwards.
	r.Method = strings.ToUpper(r.Method)
	if r.Method == "" {
		r.Method = "GET"
	}

	switch {
	case r.DialTimeout < 0:
		r.DialTimeout = 0
	case r.DialTimeout == 0:
		r.DialTimeout = DefaultDialTimeout
	}
	switch {
	case r.ConnTimeout < 0:
		r.ConnTimeout = 0
	case r.ConnTimeout == 0:
		r.ConnTimeout = DefaultConnTimeout
	}

	if r.TryTimes == 0 {
		r.TryTimes = DefaultTryTimes
	}
	if r.RetryPause <= 0 {
		r.RetryPause = DefaultRetryPause
	}
	if r.Priority < 0 {
		r.Priority = 0
	}
	if !(r.DownloaderID >= SurfID && r.DownloaderID <= ChromeID) {
		r.DownloaderID = SurfID
	}

	if r.Header == nil {
		r.Header = make(http.Header)
	}
	if r.Temp == nil {
		r.Temp = make(Temp)
	}
	if r.TempIsJSON == nil {
		r.TempIsJSON = make(map[string]bool)
	}
	return result.OkVoid()
}
// UnSerialize decodes a Request from its JSON text. The result carries the
// decoded request together with any JSON decoding error.
func UnSerialize(s string) result.Result[*Request] {
	decoded := &Request{}
	err := json.Unmarshal([]byte(s), decoded)
	return result.Ret(decoded, err)
}
// Serialize encodes the Request as a JSON string, with `\u0026` escapes
// turned back into literal '&'.
//
// Every Temp value not yet stored as JSON is replaced in place by its JSON
// text and flagged in TempIsJSON. Values already flagged are left untouched:
// re-encoding them (for example when serializing a request that came out of
// UnSerialize) would wrap the JSON text in another layer of quoting and make
// it unreadable through GetTemp. TempIsJSON is created on demand so that
// Serialize is safe on a request that never went through Prepare.
func (r *Request) Serialize() result.Result[string] {
	if r.TempIsJSON == nil {
		r.TempIsJSON = make(map[string]bool, len(r.Temp))
	}
	for k, v := range r.Temp {
		if r.TempIsJSON[k] {
			continue
		}
		r.Temp.set(k, v)
		r.TempIsJSON[k] = true
	}
	b, err := json.Marshal(r)
	if err != nil {
		return result.TryErr[string](err)
	}
	return result.Ok(strings.ReplaceAll(util.Bytes2String(b), `\u0026`, `&`))
}
// Unique returns the request fingerprint: the hex MD5 digest of
// Spider+Rule+URL+Method. The value is computed on first use and cached, so
// later changes to those fields do not alter it.
func (r *Request) Unique() string {
	if r.unique != "" {
		return r.unique
	}
	digest := md5.Sum([]byte(r.Spider + r.Rule + r.URL + r.Method))
	r.unique = hex.EncodeToString(digest[:])
	return r.unique
}
// Copy duplicates the request by a JSON round trip: the request is marshaled
// and the bytes are decoded into a fresh Request. A marshal failure is
// returned as the error result; otherwise the result carries the new request
// together with any decoding error.
func (r *Request) Copy() result.Result[*Request] {
	encoded, err := json.Marshal(r)
	if err != nil {
		return result.TryErr[*Request](err)
	}
	dup := new(Request)
	err = json.Unmarshal(encoded, dup)
	return result.Ret(dup, err)
}
// GetURL returns the request's target URL.
func (r *Request) GetURL() string {
return r.URL
}
// GetMethod returns the stored HTTP method name (e.g. GET, POST).
func (r *Request) GetMethod() string {
return r.Method
}
// SetMethod stores the upper-cased HTTP method and returns the request for chaining.
func (r *Request) SetMethod(method string) *Request {
r.Method = strings.ToUpper(method)
return r
}
// SetURL stores url as-is (it is not parsed or normalized here) and returns
// the request for chaining.
func (r *Request) SetURL(url string) *Request {
r.URL = url
return r
}
// GetReferer returns the value of the "Referer" request header.
func (r *Request) GetReferer() string {
return r.Header.Get("Referer")
}
// SetReferer sets the "Referer" request header and returns the request for
// chaining. The header map is created on demand, so the call is safe on a
// request that has not been through Prepare yet.
func (r *Request) SetReferer(referer string) *Request {
	if r.Header == nil {
		r.Header = make(http.Header)
	}
	r.Header.Set("Referer", referer)
	return r
}
// GetPostData returns the POST payload string.
func (r *Request) GetPostData() string {
return r.PostData
}
// GetHeader returns the request header map itself (not a copy).
func (r *Request) GetHeader() http.Header {
return r.Header
}
// SetHeader replaces the values of header key with value and returns the
// request for chaining. The header map is created on demand, so the call is
// safe on a request that has not been through Prepare yet.
func (r *Request) SetHeader(key, value string) *Request {
	if r.Header == nil {
		r.Header = make(http.Header)
	}
	r.Header.Set(key, value)
	return r
}
// AddHeader appends value to header key and returns the request for chaining.
// The header map is created on demand, so the call is safe on a request that
// has not been through Prepare yet.
func (r *Request) AddHeader(key, value string) *Request {
	if r.Header == nil {
		r.Header = make(http.Header)
	}
	r.Header.Add(key, value)
	return r
}
// GetEnableCookie reports the EnableCookie flag.
func (r *Request) GetEnableCookie() bool {
return r.EnableCookie
}
// SetEnableCookie sets the EnableCookie flag and returns the request for chaining.
func (r *Request) SetEnableCookie(enableCookie bool) *Request {
r.EnableCookie = enableCookie
return r
}
// GetCookies returns the value of the "Cookie" request header.
func (r *Request) GetCookies() string {
return r.Header.Get("Cookie")
}
// SetCookies sets the "Cookie" request header and returns the request for
// chaining. The header map is created on demand, so the call is safe on a
// request that has not been through Prepare yet.
func (r *Request) SetCookies(cookie string) *Request {
	if r.Header == nil {
		r.Header = make(http.Header)
	}
	r.Header.Set("Cookie", cookie)
	return r
}
// GetDialTimeout returns the dial timeout.
func (r *Request) GetDialTimeout() time.Duration {
return r.DialTimeout
}
// GetConnTimeout returns the connection timeout.
func (r *Request) GetConnTimeout() time.Duration {
return r.ConnTimeout
}
// GetTryTimes returns the maximum number of download attempts.
func (r *Request) GetTryTimes() int {
return r.TryTimes
}
// GetRetryPause returns the pause applied before a retry.
func (r *Request) GetRetryPause() time.Duration {
return r.RetryPause
}
// GetProxy returns the proxy stored on the request.
func (r *Request) GetProxy() string {
return r.proxy
}
// SetProxy stores the proxy and returns the request for chaining.
func (r *Request) SetProxy(proxy string) *Request {
r.proxy = proxy
return r
}
// GetRedirectTimes returns the redirect limit (see the RedirectTimes field).
func (r *Request) GetRedirectTimes() int {
return r.RedirectTimes
}
// GetRuleName returns the name of the rule node assigned to this request.
func (r *Request) GetRuleName() string {
return r.Rule
}
// SetRuleName sets the rule node name and returns the request for chaining.
func (r *Request) SetRuleName(ruleName string) *Request {
r.Rule = ruleName
return r
}
// GetSpiderName returns the name of the spider this request belongs to.
func (r *Request) GetSpiderName() string {
return r.Spider
}
// SetSpiderName sets the spider name and returns the request for chaining.
func (r *Request) SetSpiderName(spiderName string) *Request {
r.Spider = spiderName
return r
}
// IsReloadable reports the Reloadable flag.
func (r *Request) IsReloadable() bool {
return r.Reloadable
}
// SetReloadable sets the Reloadable flag and returns the request for chaining.
func (r *Request) SetReloadable(can bool) *Request {
r.Reloadable = can
return r
}
// GetTemp looks up key in the temporary data under the read lock.
//
// defaultValue must not be nil (the call panics otherwise). It is returned
// when the key is absent or its stored value is nil. For values held as JSON
// text, it is also handed to Temp.get as the decode target.
func (r *Request) GetTemp(key string, defaultValue interface{}) interface{} {
	if defaultValue == nil {
		panic("*Request.GetTemp() defaultValue must not be nil, key=" + key)
	}
	r.lock.RLock()
	defer r.lock.RUnlock()

	stored := r.Temp[key]
	switch {
	case stored == nil:
		return defaultValue
	case r.TempIsJSON[key]:
		return r.Temp.get(key, defaultValue)
	default:
		return stored
	}
}
// GetTempOpt looks up key in the temporary data under the read lock and wraps
// the outcome in an Option: None when the key is missing, otherwise Some of
// the stored value. Values held as JSON text are decoded into a generic
// interface{} first.
func (r *Request) GetTempOpt(key string) option.Option[interface{}] {
	r.lock.RLock()
	defer r.lock.RUnlock()

	stored, found := r.Temp[key]
	if !found {
		return option.None[interface{}]()
	}
	if !r.TempIsJSON[key] {
		return option.Some(stored)
	}
	var decoded interface{}
	r.Temp.get(key, &decoded)
	return option.Some(decoded)
}
// GetTemps returns the Temp map itself (not a copy); the request lock is not
// taken here.
func (r *Request) GetTemps() Temp {
return r.Temp
}
// SetTemp stores value under key in the temporary data, clears the key's
// "stored as JSON" mark, and returns the request for chaining.
//
// The Temp map is created on demand, so the call is safe on a request that has
// not been through Prepare yet. The write lock is released via defer so it
// cannot stay held if the body panics.
func (r *Request) SetTemp(key string, value interface{}) *Request {
	r.lock.Lock()
	defer r.lock.Unlock()
	if r.Temp == nil {
		r.Temp = make(Temp)
	}
	r.Temp[key] = value
	delete(r.TempIsJSON, key)
	return r
}
// SetTemps replaces the whole temporary data map with temp (the map is adopted,
// not copied), resets all "stored as JSON" marks, and returns the request for
// chaining.
func (r *Request) SetTemps(temp map[string]interface{}) *Request {
	r.lock.Lock()
	defer r.lock.Unlock()
	r.Temp, r.TempIsJSON = temp, make(map[string]bool)
	return r
}
// GetPriority returns the scheduling priority.
func (r *Request) GetPriority() int {
return r.Priority
}
// SetPriority stores the scheduling priority without range checking (Prepare
// is what raises negative values to 0) and returns the request for chaining.
func (r *Request) SetPriority(priority int) *Request {
r.Priority = priority
return r
}
// GetDownloaderID returns the downloader ID (see SurfID, PhantomID, ChromeID).
func (r *Request) GetDownloaderID() int {
return r.DownloaderID
}
// SetDownloaderID stores id without validation (Prepare is what resets
// out-of-range IDs to SurfID) and returns the request for chaining.
func (r *Request) SetDownloaderID(id int) *Request {
r.DownloaderID = id
return r
}
// MarshalJSON implements json.Marshaler for *Request.
//
// As a side effect, every Temp value not yet stored as JSON is replaced in
// place by its JSON text and flagged in TempIsJSON; already-flagged values are
// skipped so they are not encoded twice. TempIsJSON is created on demand so
// that marshaling a request that never went through Prepare does not panic on
// a nil map write. The unexported fields (proxy, unique, lock) are not part of
// the output.
func (r *Request) MarshalJSON() ([]byte, error) {
	if r.TempIsJSON == nil {
		r.TempIsJSON = make(map[string]bool, len(r.Temp))
	}
	for k, v := range r.Temp {
		if r.TempIsJSON[k] {
			continue
		}
		r.Temp.set(k, v)
		r.TempIsJSON[k] = true
	}
	// Marshal a struct without the mutex to avoid copying sync.RWMutex
	j := struct {
		Spider        string
		URL           string
		Rule          string
		Method        string
		Header        http.Header
		EnableCookie  bool
		PostData      string
		DialTimeout   time.Duration
		ConnTimeout   time.Duration
		TryTimes      int
		RetryPause    time.Duration
		RedirectTimes int
		Temp          Temp
		TempIsJSON    map[string]bool
		Priority      int
		Reloadable    bool
		DownloaderID  int
	}{
		Spider:        r.Spider,
		URL:           r.URL,
		Rule:          r.Rule,
		Method:        r.Method,
		Header:        r.Header,
		EnableCookie:  r.EnableCookie,
		PostData:      r.PostData,
		DialTimeout:   r.DialTimeout,
		ConnTimeout:   r.ConnTimeout,
		TryTimes:      r.TryTimes,
		RetryPause:    r.RetryPause,
		RedirectTimes: r.RedirectTimes,
		Temp:          r.Temp,
		TempIsJSON:    r.TempIsJSON,
		Priority:      r.Priority,
		Reloadable:    r.Reloadable,
		DownloaderID:  r.DownloaderID,
	}
	return json.Marshal(j)
}
================================================
FILE: app/downloader/request/request_test.go
================================================
package request
import (
"encoding/json"
"net/http"
"testing"
"time"
)
// TestReqTemp walks Temp values through SetTemp, a JSON round trip and Copy,
// logging what GetTemp returns at each stage. It only logs; nothing is
// asserted.
func TestReqTemp(t *testing.T) {
var a = &Request{
Temp: Temp{"3": map[string]int{"33": 33}},
}
a.Prepare()
a.SetTemp("6", 66)
// Round-trip through JSON: keys "3" and "6" reach b in JSON-encoded form.
c, _ := json.Marshal(&a)
var b = Request{}
json.Unmarshal(c, &b)
// These values are set after decoding, so they are held as plain Go values.
b.SetTemp("1", map[string]int{"11": 11})
b.SetTemp("2", []int{22})
b.SetTemp("4", 44)
b.SetTemp("5", "55")
b.SetTemp("x", x{"henry"})
t.Logf("%#v", b.TempIsJSON)
t.Logf("%#v", b.Temp)
t.Logf("1:%#v\n", b.GetTemp("1", map[string]int{}))
t.Logf("2:%#v\n", b.GetTemp("2", []int{}))
t.Logf("3:%#v\n", b.GetTemp("3", map[string]int{}))
t.Logf("4:%v\n", b.GetTemp("4", 0))
t.Logf("5:%#v\n", b.GetTemp("5", ""))
t.Logf("6:%v\n", b.GetTemp("6", 0))
t.Logf("x:%v\n", b.GetTemp("x", x{}))
// Copy goes through JSON as well; afterwards overwrite key "6" on the copy only.
_b := b.Copy().Unwrap()
_b.SetTemp("6", 666)
t.Logf("%#v", _b.TempIsJSON)
t.Logf("%#v", _b.Temp)
t.Logf("5:%#v\n", _b.GetTemp("5", 1.0))
t.Logf("5:%#v\n", _b.GetTemp("5", ""))
t.Logf("6:%#v\n", _b.GetTemp("6", 0))
t.Logf("x:%v\n", b.GetTemp("x", &x{}))
// Missing key: the default value is what comes back.
t.Logf("10000:%#v\n", _b.GetTemp("10000", 999))
}
// x is a small struct used by TestReqTemp as a Temp value.
type x struct {
Name string
}
// TestPrepare covers Prepare's failure on an unparsable URL and its clamping
// of negative timeouts, negative priority and out-of-range downloader IDs.
func TestPrepare(t *testing.T) {
t.Run("invalid URL", func(t *testing.T) {
r := &Request{URL: "://invalid"}
res := r.Prepare()
if res.IsOk() {
t.Error("expected Prepare to fail for invalid URL")
}
})
t.Run("edge cases", func(t *testing.T) {
// Each case carries its own request and a check function that calls
// Prepare and inspects the one field under test.
tests := []struct {
name string
req *Request
chk func(*Request)
}{
{
name: "negative DialTimeout",
req: &Request{URL: "http://a.com", Rule: "r", DialTimeout: -1},
chk: func(r *Request) {
r.Prepare()
if r.DialTimeout != 0 {
t.Errorf("DialTimeout=%v", r.DialTimeout)
}
},
},
{
name: "negative ConnTimeout",
req: &Request{URL: "http://a.com", Rule: "r", ConnTimeout: -1},
chk: func(r *Request) {
r.Prepare()
if r.ConnTimeout != 0 {
t.Errorf("ConnTimeout=%v", r.ConnTimeout)
}
},
},
{
name: "negative Priority",
req: &Request{URL: "http://a.com", Rule: "r", Priority: -5},
chk: func(r *Request) {
r.Prepare()
if r.Priority != 0 {
t.Errorf("Priority=%v", r.Priority)
}
},
},
{
name: "DownloaderID out of range low",
req: &Request{URL: "http://a.com", Rule: "r", DownloaderID: -1},
chk: func(r *Request) {
r.Prepare()
if r.DownloaderID != SurfID {
t.Errorf("DownloaderID=%v", r.DownloaderID)
}
},
},
{
name: "DownloaderID out of range high",
req: &Request{URL: "http://a.com", Rule: "r", DownloaderID: 99},
chk: func(r *Request) {
r.Prepare()
if r.DownloaderID != SurfID {
t.Errorf("DownloaderID=%v", r.DownloaderID)
}
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.chk(tt.req)
})
}
})
}
// TestSerializeUnSerialize round-trips a request through Serialize and
// UnSerialize and compares URL, Method and Spider.
func TestSerializeUnSerialize(t *testing.T) {
	orig := &Request{
		Spider:       "s",
		URL:          "http://example.com",
		Rule:         "r",
		Method:       "POST",
		PostData:     "a=1",
		Header:       http.Header{"X-Custom": {"v"}},
		EnableCookie: true,
		Temp:         Temp{"k": "v"},
	}
	orig.Prepare()

	encoded := orig.Serialize()
	if encoded.IsErr() {
		t.Fatalf("Serialize: %v", encoded.Err())
	}
	text := encoded.Unwrap()
	if text == "" {
		t.Error("Serialize returned empty string")
	}

	decoded := UnSerialize(text)
	if decoded.IsErr() {
		t.Fatalf("UnSerialize: %v", decoded.Err())
	}
	req := decoded.Unwrap()
	same := req.URL == orig.URL && req.Method == orig.Method && req.Spider == orig.Spider
	if !same {
		t.Errorf("UnSerialize mismatch: got %+v", req)
	}
}
// TestUnSerializeInvalid checks that malformed JSON is reported as an error.
func TestUnSerializeInvalid(t *testing.T) {
	if UnSerialize("invalid json {{{").IsOk() {
		t.Error("expected UnSerialize to fail")
	}
}
// TestUnique checks that Unique is stable across calls and 32 characters long.
func TestUnique(t *testing.T) {
	req := &Request{Spider: "s", Rule: "r", URL: "http://a.com", Method: "GET"}
	req.Prepare()

	first := req.Unique()
	second := req.Unique()
	stable := first == second
	if !stable || len(first) != 32 {
		t.Errorf("Unique: %q vs %q", first, second)
	}
}
// TestCopy checks that Copy preserves URL and Spider, and that an int Temp
// value comes back as float64 1 after the JSON round trip.
func TestCopy(t *testing.T) {
	src := &Request{Spider: "s", URL: "http://a.com", Rule: "r"}
	src.Prepare()
	src.SetTemp("x", 1)

	copied := src.Copy()
	if copied.IsErr() {
		t.Fatal(copied.Err())
	}
	dup := copied.Unwrap()
	if dup.URL != src.URL || dup.Spider != src.Spider {
		t.Errorf("Copy mismatch")
	}

	num, isFloat := dup.GetTemp("x", 0).(float64)
	if !isFloat || num != 1 {
		t.Errorf("Copy Temp mismatch: got %v", dup.GetTemp("x", 0))
	}
}
// TestGettersSetters runs one subtest per accessor. Every subtest sets a
// value (through the setter, or directly on the field where no setter exists)
// and checks that the getter returns it. All subtests share the same prepared
// request r and run in slice order.
func TestGettersSetters(t *testing.T) {
r := &Request{URL: "http://a.com", Rule: "r"}
r.Prepare()
tests := []struct {
name string
fn func()
}{
{"GetURL", func() {
r.SetURL("http://u.com")
if r.GetURL() != "http://u.com" {
t.Error("GetURL")
}
}},
{"GetMethod", func() {
// SetMethod upper-cases its argument.
r.SetMethod("post")
if r.GetMethod() != "POST" {
t.Error("GetMethod")
}
}},
{"GetReferer", func() {
r.SetReferer("http://ref")
if r.GetReferer() != "http://ref" {
t.Error("GetReferer")
}
}},
{"GetPostData", func() {
r.PostData = "p=1"
if r.GetPostData() != "p=1" {
t.Error("GetPostData")
}
}},
{"GetHeader", func() {
r.SetHeader("A", "1")
if r.GetHeader().Get("A") != "1" {
t.Error("GetHeader")
}
}},
{"AddHeader", func() {
r.AddHeader("B", "2")
if r.GetHeader().Get("B") != "2" {
t.Error("AddHeader")
}
}},
{"GetEnableCookie", func() {
r.SetEnableCookie(true)
if !r.GetEnableCookie() {
t.Error("GetEnableCookie")
}
}},
{"GetCookies", func() {
r.SetCookies("c=1")
if r.GetCookies() != "c=1" {
t.Error("GetCookies")
}
}},
{"GetDialTimeout", func() {
r.DialTimeout = 5 * time.Second
if r.GetDialTimeout() != 5*time.Second {
t.Error("GetDialTimeout")
}
}},
{"GetConnTimeout", func() {
r.ConnTimeout = 10 * time.Second
if r.GetConnTimeout() != 10*time.Second {
t.Error("GetConnTimeout")
}
}},
{"GetTryTimes", func() {
r.TryTimes = 5
if r.GetTryTimes() != 5 {
t.Error("GetTryTimes")
}
}},
{"GetRetryPause", func() {
r.RetryPause = 3 * time.Second
if r.GetRetryPause() != 3*time.Second {
t.Error("GetRetryPause")
}
}},
{"GetProxy", func() {
r.SetProxy("http://p")
if r.GetProxy() != "http://p" {
t.Error("GetProxy")
}
}},
{"GetRedirectTimes", func() {
r.RedirectTimes = 2
if r.GetRedirectTimes() != 2 {
t.Error("GetRedirectTimes")
}
}},
{"GetRuleName", func() {
r.SetRuleName("r1")
if r.GetRuleName() != "r1" {
t.Error("GetRuleName")
}
}},
{"GetSpiderName", func() {
r.SetSpiderName("sp")
if r.GetSpiderName() != "sp" {
t.Error("GetSpiderName")
}
}},
{"IsReloadable", func() {
r.SetReloadable(true)
if !r.IsReloadable() {
t.Error("IsReloadable")
}
}},
{"GetPriority", func() {
r.SetPriority(3)
if r.GetPriority() != 3 {
t.Error("GetPriority")
}
}},
{"GetDownloaderID", func() {
r.SetDownloaderID(PhantomID)
if r.GetDownloaderID() != PhantomID {
t.Error("GetDownloaderID")
}
}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tt.fn()
})
}
}
// TestGetTempOpt checks GetTempOpt for a missing key, a plain stored value,
// and a value that went through a Serialize/UnSerialize round trip.
func TestGetTempOpt(t *testing.T) {
	src := &Request{URL: "http://a.com", Rule: "r", Temp: Temp{"a": 1}}
	src.Prepare()

	if src.GetTempOpt("missing").IsSome() {
		t.Error("expected None for missing key")
	}
	if opt := src.GetTempOpt("a"); !opt.IsSome() || opt.Unwrap() != 1 {
		t.Errorf("GetTempOpt(a)=%v", opt)
	}

	src.SetTemp("j", map[string]int{"x": 1})
	encoded := src.Serialize()
	if encoded.IsErr() {
		t.Fatal(encoded.Err())
	}
	decoded := UnSerialize(encoded.Unwrap())
	if decoded.IsErr() {
		t.Fatal(decoded.Err())
	}
	restored := decoded.Unwrap()
	if !restored.GetTempOpt("j").IsSome() {
		t.Error("GetTempOpt(j) expected Some")
	}
}
// TestGetTemps checks that GetTemps exposes the values the request was built with.
func TestGetTemps(t *testing.T) {
	req := &Request{URL: "http://a.com", Rule: "r", Temp: Temp{"k": "v"}}
	req.Prepare()

	all := req.GetTemps()
	if got := all["k"]; got != "v" {
		t.Errorf("GetTemps=%v", all)
	}
}
// TestSetTemps checks that SetTemps installs the given map as the request's Temp.
func TestSetTemps(t *testing.T) {
	req := &Request{URL: "http://a.com", Rule: "r"}
	req.Prepare()

	req.SetTemps(map[string]interface{}{"x": 1, "y": "2"})
	matches := req.Temp["x"] == 1 && req.Temp["y"] == "2"
	if !matches {
		t.Errorf("SetTemps=%v", req.Temp)
	}
}
// TestGetTempPanic checks that GetTemp panics when given a nil default value.
func TestGetTempPanic(t *testing.T) {
	defer func() {
		if p := recover(); p == nil {
			t.Error("expected panic for nil defaultValue")
		}
	}()

	req := &Request{URL: "http://a.com", Rule: "r"}
	req.Prepare()
	req.GetTemp("k", nil)
}
================================================
FILE: app/downloader/request/temp.go
================================================
package request
import (
"encoding/json"
"reflect"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/logs"
)
// Temp is the per-request temporary data store, keyed by name. A value is
// either a plain Go value or, after set has run on its key, the value's JSON
// text held as a string.
type Temp map[string]interface{}
// get decodes the JSON text stored under key and returns the decoded value.
//
// When defaultValue is a pointer, the JSON is decoded into what it points to
// and that same pointer is returned. Otherwise the JSON is decoded through
// encoding/json's generic interface{} decoding and the result replaces
// defaultValue. Decoding errors are logged and defaultValue, as it stands
// after the attempt, is returned.
//
// If the stored value is not a string, or anything in the body panics, the
// problem is logged and defaultValue is returned. Previously a recovered
// panic made the function return nil instead.
func (t Temp) get(key string, defaultValue interface{}) (value interface{}) {
	defer func() {
		if p := recover(); p != nil {
			logs.Log().Error(" * Request.Temp.Get(%v): %v", key, p)
			value = defaultValue
		}
	}()

	raw, ok := t[key].(string)
	if !ok {
		logs.Log().Error(" * Request.Temp.Get(%v): %v", key, "stored value is not a JSON string")
		return defaultValue
	}
	b := util.String2Bytes(raw)

	var err error
	// reflect.TypeOf(nil) is nil, so guard before asking for its Kind.
	if defaultValue != nil && reflect.TypeOf(defaultValue).Kind() == reflect.Ptr {
		err = json.Unmarshal(b, defaultValue)
	} else {
		err = json.Unmarshal(b, &defaultValue)
	}
	if err != nil {
		logs.Log().Error(" * Request.Temp.Get(%v): %v", key, err)
	}
	return defaultValue
}
// set stores the JSON encoding of value under key, as a string, and returns
// the map. An encoding failure is logged and whatever bytes Marshal returned
// are still stored.
func (t Temp) set(key string, value interface{}) Temp {
	encoded, err := json.Marshal(value)
	if err != nil {
		logs.Log().Error(" * Request.Temp.Set(%v): %v", key, err)
	}
	text := util.Bytes2String(encoded)
	t[key] = text
	return t
}
================================================
FILE: app/downloader/surfer/agent/agent.go
================================================
// Package agent generates user agents strings for well known browsers
// and for custom browsers.
//
// When submitting patches to add user agents formats, please *always* include
// "{{.Coms}}" between the opening ( and closing ) braces, even if you're
// sure the browser would never have additional comments.
package agent
import (
"bytes"
"math/rand"
"runtime"
"strings"
"text/template"
"time"
)
// TemplateData is the value passed to the user agent format templates; its
// fields are referenced there as {{.Name}}, {{.Ver}}, {{.OSN}}, {{.OSV}} and
// {{.Coms}}.
type TemplateData struct {
Name string // browser name
Ver string // browser version
OSN string // operating system name
OSV string // operating system version
Coms string // comments, already joined and prefixed with "; " when non-empty
}
// OSAttributes stores the operating system details used to fill a user agent string.
type OSAttributes struct {
// OSName is the operating system name.
OSName string
// OSVersion is the operating system version.
OSVersion string
// Comments are additional comments to add to a user agent string.
Comments []string
}
// Operating system identifiers, used as keys of DefaultOSAttributes and as
// UAData.DefaultOS values.
const (
// Windows operating system.
Windows int = iota
// Linux based operating system.
Linux
// Macintosh/OS X operating system.
Macintosh
)
// DefaultOSAttributes maps each operating system identifier to the OS name,
// version and comments used when generating user agent strings for it.
var DefaultOSAttributes = map[int]OSAttributes{
Windows: {"Windows NT", "10.0", []string{"Win64", "x64"}},
Linux: {"Linux", "x86_64", []string{}},
Macintosh: {"Intel Mac OS X", "10_15_7", []string{}},
}
type (
// Formats is a collection of UA format strings.
// key is the browser major version (Format looks up the part of the
// version before the first ".").
// value is the text/template source rendered with a TemplateData.
Formats map[string]string
// UAData stores information on a browser user agent.
UAData struct {
// TopVersion is the version used when a caller supplies none.
TopVersion string
// DefaultOS is the operating system identifier (Windows, Linux, Macintosh).
DefaultOS int
// Formats holds the format strings by major version.
Formats Formats
}
// UATable is a collection of UAData values.
// key is the lower-case name of the browser.
UATable map[string]UAData
)
// Database is the "database" of user agents, keyed by lower-case browser name.
// The "default" entry supplies the fallback version and format for names that
// are not listed; init skips it when generating UserAgents.
var Database = UATable{
"chrome": {
"127.0.6533.73",
Windows,
Formats{
"127": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"126": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"125": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"124": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"123": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"122": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"121": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
"120": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{{.Ver}} Safari/537.36",
},
},
"firefox": {
"127.0",
Windows,
Formats{
"127": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:127.0) Gecko/20100101 Firefox/{{.Ver}}",
"126": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:126.0) Gecko/20100101 Firefox/{{.Ver}}",
"125": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:125.0) Gecko/20100101 Firefox/{{.Ver}}",
"124": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:124.0) Gecko/20100101 Firefox/{{.Ver}}",
"123": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:123.0) Gecko/20100101 Firefox/{{.Ver}}",
"122": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:122.0) Gecko/20100101 Firefox/{{.Ver}}",
"121": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:121.0) Gecko/20100101 Firefox/{{.Ver}}",
"120": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:120.0) Gecko/20100101 Firefox/{{.Ver}}",
},
},
"edge": {
"127.0.2651.74",
Windows,
Formats{
"127": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/{{.Ver}}",
"126": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/{{.Ver}}",
"125": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/{{.Ver}}",
"124": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/{{.Ver}}",
},
},
"safari": {
"17.5",
Macintosh,
Formats{
"17": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{{.Ver}} Safari/605.1.15",
"16": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/{{.Ver}} Safari/605.1.15",
},
},
// Crawler identities: init adds these to UserAgents["all"] only, not to "common".
"googlebot": {
"2.1",
Linux,
Formats{
"2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})",
"1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})",
},
},
"bingbot": {
"2.0",
Windows,
Formats{
"2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})",
},
},
"yahoobot": {
"2.0",
Linux,
Formats{
"2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})",
},
},
// Fallback used for browser names not present in this table.
"default": {
"1.0",
Linux,
Formats{
"1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})",
},
},
}
// UserAgents holds the user agent strings generated by init. Key "all" lists
// every generated string; key "common" lists those whose browser is not
// googlebot, bingbot or yahoobot.
var UserAgents = map[string][]string{}
// init fills UserAgents from Database: one string per browser/format pair,
// using the browser's default OS attributes. Every string goes into "all";
// strings for browsers other than the three crawler identities also go into
// "common". Finally a random index within the "common" length is chosen and
// the element at that index is swapped to the front of both lists.
//
// The swap is skipped when "common" is empty, because rand.Intn panics for a
// non-positive argument.
func init() {
	for browser, userAgentData := range Database {
		if browser == "default" {
			continue
		}
		osAttribs := DefaultOSAttributes[userAgentData.DefaultOS]
		isBot := browser == "googlebot" || browser == "bingbot" || browser == "yahoobot"
		for version := range userAgentData.Formats {
			ua := createFromDetails(
				browser,
				version,
				osAttribs.OSName,
				osAttribs.OSVersion,
				osAttribs.Comments)
			UserAgents["all"] = append(UserAgents["all"], ua)
			if !isBot {
				UserAgents["common"] = append(UserAgents["common"], ua)
			}
		}
	}

	l := len(UserAgents["common"])
	if l == 0 {
		return
	}
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	// "all" is at least as long as "common", so idx is valid for both.
	idx := r.Intn(l)
	UserAgents["all"][0], UserAgents["all"][idx] = UserAgents["all"][idx], UserAgents["all"][0]
	UserAgents["common"][0], UserAgents["common"][idx] = UserAgents["common"][idx], UserAgents["common"][0]
}
// CreateReal generates a user agent string for the browser name "Surfer",
// version "1.0", using the host's OS name and version and the Go runtime
// version as the only comment.
func CreateReal() string {
return createFromDetails("Surfer", "1.0", osName(), osVersion(), []string{runtime.Version()})
}
// CreateDefault builds a user agent string for browser from its Database
// entry: the entry's top version and the attributes of its default OS. The
// lookup is case-insensitive; the name is passed on as given.
func CreateDefault(browser string) string {
	entry := Database[strings.ToLower(browser)]
	attrs := DefaultOSAttributes[entry.DefaultOS]
	return createFromDetails(browser, entry.TopVersion, attrs.OSName, attrs.OSVersion, attrs.Comments)
}
// CreateVersion builds a user agent string for the given browser and version,
// using the attributes of the browser's default OS from Database. The lookup
// is case-insensitive; the name is passed on as given.
func CreateVersion(browser, version string) string {
	entry := Database[strings.ToLower(browser)]
	attrs := DefaultOSAttributes[entry.DefaultOS]
	return createFromDetails(browser, version, attrs.OSName, attrs.OSVersion, attrs.Comments)
}
// TopVersion returns the TopVersion recorded in Database for the browser name
// (matched case-insensitively), or that of the "default" entry when the name
// is not listed.
func TopVersion(bname string) string {
	entry, known := Database[strings.ToLower(bname)]
	if !known {
		entry = Database["default"]
	}
	return entry.TopVersion
}
// Format returns the format string for the given browser name and version.
//
// The browser name is matched case-insensitively and the version by its major
// part (the text before the first "."). When the browser has no format for
// that major version, the format for the major part of its top version is
// returned. When the browser is not in Database, the default format is
// returned.
func Format(bname, bver string) string {
	entry, known := Database[strings.ToLower(bname)]
	if !known {
		return Database["default"].Formats["1"]
	}
	major := strings.Split(bver, ".")[0]
	if format, found := entry.Formats[major]; found {
		return format
	}
	topMajor := strings.Split(entry.TopVersion, ".")[0]
	return entry.Formats[topMajor]
}
// createFromDetails renders the user agent template selected by
// Format(bname, bver) with the supplied browser and OS details.
//
// An empty bver is replaced by TopVersion(bname). The comment entries in c
// are joined with "; " and, when the joined text is non-empty, prefixed
// with "; " before being handed to the template.
//
// Template parse and execution errors are not reported; whatever was
// written to the buffer before a failure is what the caller receives.
func createFromDetails(bname, bver, osname, osver string, c []string) string {
	if bver == "" {
		bver = TopVersion(bname)
	}

	joined := strings.Join(c, "; ")
	if joined != "" {
		joined = "; " + joined
	}

	tmpl := template.New("formatter")
	tmpl.Parse(Format(bname, bver))

	var out bytes.Buffer
	tmpl.Execute(&out, TemplateData{bname, bver, osname, osver, joined})
	return out.String()
}
================================================
FILE: app/downloader/surfer/agent/agent_bsd.go
================================================
//go:build darwin || dragonfly || freebsd || netbsd || openbsd
// Package agent provides system User-Agent information.
package agent
import (
"runtime"
"syscall"
)
// osName reports the operating system name from the "kern.ostype" sysctl,
// falling back to runtime.GOOS when the sysctl cannot be read.
func osName() string {
	if name, err := syscall.Sysctl("kern.ostype"); err == nil {
		return name
	}
	return runtime.GOOS
}
// osVersion reports the operating system release from the
// "kern.osrelease" sysctl, or "0.0" when the sysctl cannot be read.
func osVersion() string {
	if release, err := syscall.Sysctl("kern.osrelease"); err == nil {
		return release
	}
	return "0.0"
}
================================================
FILE: app/downloader/surfer/agent/agent_linux.go
================================================
//go:build linux && !arm
package agent
import (
"runtime"
"syscall"
)
// osName reports the system name obtained from uname(2), falling back to
// runtime.GOOS when the call fails.
func osName() string {
	var uts syscall.Utsname
	if err := syscall.Uname(&uts); err != nil {
		return runtime.GOOS
	}
	return charsToString(uts.Sysname)
}
// osVersion reports the kernel release obtained from uname(2), or "0.0"
// when the call fails.
func osVersion() string {
	var uts syscall.Utsname
	if err := syscall.Uname(&uts); err != nil {
		return "0.0"
	}
	return charsToString(uts.Release)
}
// charsToString converts a NUL-terminated [65]int8 character array into a
// Go string. Conversion stops at the first zero element; when no zero is
// present all 65 elements are used. Each element is reinterpreted as an
// unsigned byte.
func charsToString(ca [65]int8) string {
	out := make([]byte, 0, len(ca))
	for _, ch := range ca {
		if ch == 0 {
			break
		}
		out = append(out, byte(ch))
	}
	return string(out)
}
================================================
FILE: app/downloader/surfer/agent/agent_linux_arm.go
================================================
//go:build linux && arm
package agent
import (
"runtime"
"syscall"
)
// osName reports the system name obtained from uname(2), falling back to
// runtime.GOOS when the call fails.
func osName() string {
	var uts syscall.Utsname
	if err := syscall.Uname(&uts); err != nil {
		return runtime.GOOS
	}
	return charsToString(uts.Sysname)
}
// osVersion reports the kernel release obtained from uname(2), or "0.0"
// when the call fails.
func osVersion() string {
	var uts syscall.Utsname
	if err := syscall.Uname(&uts); err != nil {
		return "0.0"
	}
	return charsToString(uts.Release)
}
// charsToString converts a NUL-terminated [65]uint8 character array into a
// Go string. Conversion stops at the first zero element; when no zero is
// present all 65 elements are used.
func charsToString(ca [65]uint8) string {
	end := 0
	for end < len(ca) && ca[end] != 0 {
		end++
	}
	return string(ca[:end])
}
================================================
FILE: app/downloader/surfer/agent/agent_test.go
================================================
package agent
import (
"strings"
"testing"
)
// TestUserAgentsPopulated checks that package initialisation filled both
// the "all" and "common" user agent lists and that "common" is no larger
// than "all".
func TestUserAgentsPopulated(t *testing.T) {
	all, common := UserAgents["all"], UserAgents["common"]
	if len(all) == 0 {
		t.Fatal("UserAgents[all] should not be empty after init")
	}
	if len(common) == 0 {
		t.Fatal("UserAgents[common] should not be empty after init")
	}
	if len(common) > len(all) {
		t.Error("all should have at least as many agents as common")
	}
}
// TestCreateReal checks that CreateReal yields a non-empty string that
// names the "Surfer" product.
func TestCreateReal(t *testing.T) {
	got := CreateReal()
	if got == "" {
		t.Error("CreateReal() returned empty string")
	}
	if mentionsSurfer := strings.Contains(got, "Surfer"); !mentionsSurfer {
		t.Errorf("CreateReal() = %q, want to contain 'Surfer'", got)
	}
}
// TestCreateDefault checks that CreateDefault produces a non-empty user
// agent for each listed browser name, one subtest per name.
func TestCreateDefault(t *testing.T) {
	for _, name := range []string{"Chrome", "Firefox", "Edge", "Safari", "default", "googlebot", "bingbot"} {
		t.Run(name, func(t *testing.T) {
			if CreateDefault(name) == "" {
				t.Errorf("CreateDefault(%q) returned empty string", name)
			}
		})
	}
}
// TestCreateVersion checks that an explicitly requested Chrome version
// appears verbatim in the generated user agent.
func TestCreateVersion(t *testing.T) {
	const token = "Chrome/127.0.6533.73"
	got := CreateVersion("Chrome", "127.0.6533.73")
	if got == "" {
		t.Error("CreateVersion returned empty string")
	}
	if !strings.Contains(got, token) {
		t.Errorf("CreateVersion = %q, want to contain 'Chrome/127.0.6533.73'", got)
	}
}
// TestTopVersion checks the top version reported for two known browsers
// and the value returned for an unknown browser name.
func TestTopVersion(t *testing.T) {
	type testCase struct {
		browser string
		want    string
	}
	cases := []testCase{
		{browser: "chrome", want: "127.0.6533.73"},
		{browser: "firefox", want: "127.0"},
		{browser: "unknown", want: "1.0"},
	}
	for _, tc := range cases {
		t.Run(tc.browser, func(t *testing.T) {
			if got := TopVersion(tc.browser); got != tc.want {
				t.Errorf("TopVersion(%q) = %q, want %q", tc.browser, got, tc.want)
			}
		})
	}
}
// TestFormat checks that a known browser/version yields a template that
// contains the version placeholder, and that an unknown browser still
// yields a non-empty template.
func TestFormat(t *testing.T) {
	known := Format("chrome", "127.0")
	if known == "" {
		t.Error("Format(chrome, 127.0) returned empty string")
	}
	if !strings.Contains(known, "Chrome/{{.Ver}}") {
		t.Errorf("Format = %q, want to contain template var", known)
	}

	if fallback := Format("unknown_browser", "1.0"); fallback == "" {
		t.Error("Format for unknown browser should return default format")
	}
}
// TestFormatFallbackToTopVersion checks that a version with no registered
// template still resolves to a non-empty template for a known browser.
func TestFormatFallbackToTopVersion(t *testing.T) {
	if got := Format("chrome", "999.0"); got == "" {
		t.Error("Format with unknown version should fall back to top version format")
	}
}
// TestFormatUnknownVersion checks that Format never returns an empty
// template for the listed browser/version pairs.
func TestFormatUnknownVersion(t *testing.T) {
	type pair struct {
		browser string
		ver     string
	}
	for _, p := range []pair{
		{browser: "chrome", ver: "1"},
		{browser: "firefox", ver: "120"},
		{browser: "unknown", ver: "1.0"},
	} {
		name := p.browser + "/" + p.ver
		t.Run(name, func(t *testing.T) {
			if Format(p.browser, p.ver) == "" {
				t.Error("Format returned empty")
			}
		})
	}
}
// TestCreateVersionVariousBrowsers checks, for several browsers, that the
// requested version shows up in the generated user agent in the
// browser-specific token form.
func TestCreateVersionVariousBrowsers(t *testing.T) {
	type testCase struct {
		browser  string
		ver      string
		contains string
	}
	cases := []testCase{
		{browser: "chrome", ver: "127.0", contains: "Chrome/127.0"},
		{browser: "firefox", ver: "120.0", contains: "Firefox/120.0"},
		{browser: "safari", ver: "16.0", contains: "Version/16.0"},
		{browser: "googlebot", ver: "2.1", contains: "Googlebot/2.1"},
	}
	for _, tc := range cases {
		t.Run(tc.browser, func(t *testing.T) {
			got := CreateVersion(tc.browser, tc.ver)
			if got == "" {
				t.Error("CreateVersion returned empty")
			}
			if !strings.Contains(got, tc.contains) {
				t.Errorf("CreateVersion = %q, want to contain %q", got, tc.contains)
			}
		})
	}
}
// TestDatabaseContainsExpectedBrowsers checks that Database has an entry
// for every browser key the rest of the package relies on.
func TestDatabaseContainsExpectedBrowsers(t *testing.T) {
	for _, key := range []string{"chrome", "firefox", "edge", "safari", "googlebot", "bingbot", "yahoobot", "default"} {
		_, present := Database[key]
		if present {
			continue
		}
		t.Errorf("Database missing browser %q", key)
	}
}
// TestDefaultOSAttributes checks that every supported OS identifier has an
// attribute entry with a non-empty OS name.
func TestDefaultOSAttributes(t *testing.T) {
	for _, osID := range []int{Windows, Linux, Macintosh} {
		attr, ok := DefaultOSAttributes[osID]
		switch {
		case !ok:
			t.Errorf("DefaultOSAttributes missing OS %d", osID)
		case attr.OSName == "":
			t.Errorf("OS %d has empty OSName", osID)
		}
	}
}
================================================
FILE: app/downloader/surfer/agent/agent_windows.go
================================================
//go:build windows
// Package agent provides system User-Agent information.
package agent
import (
"fmt"
"runtime"
"syscall"
)
// osName returns the name of the OS. On Windows no system call is made;
// the value is the Go runtime's GOOS constant.
func osName() string {
	return runtime.GOOS
}
// osVersion reports the Windows version as "major.minor", or "0.0" when
// syscall.GetVersion fails. The major number is taken from the lowest byte
// of the packed version value and the minor number from the next byte.
func osVersion() string {
	packed, err := syscall.GetVersion()
	if err != nil {
		return "0.0"
	}
	return fmt.Sprintf("%d.%d", byte(packed), byte(packed>>8))
}
================================================
FILE: app/downloader/surfer/chrome.go
================================================
//go:build !cover
package surfer
import (
"context"
"fmt"
"io"
"log"
"net/http"
"net/http/cookiejar"
"strings"
"sync"
"time"
"github.com/andeya/gust/result"
"github.com/chromedp/chromedp"
)
// Chrome is a Chromium-based headless browser downloader that keeps a
// single long-lived browser process. Each request opens a new tab that
// first navigates to the target site's homepage (establishing session
// cookies and a valid Referer) before loading the actual URL. This
// two-step approach reliably bypasses JS-based security verification
// pages (e.g. Baidu CAPTCHA) that block direct URL access.
//
// The browser contexts are created lazily by ensureBrowser on the first
// Download call; the allocator options (including the User-Agent) are
// fixed at that point.
type Chrome struct {
	// mu is held by Download while it calls ensureBrowser.
	mu sync.Mutex
	// CookieJar is set by NewChrome; no method in this file reads it.
	CookieJar *cookiejar.Jar
	// allocCtx / allocCancel are the exec allocator context and its
	// cancel function, created in ensureBrowser.
	allocCtx    context.Context
	allocCancel context.CancelFunc
	browserCtx    context.Context    // root tab – keeps the browser alive
	browserCancel context.CancelFunc // closing this shuts down the browser
	// started records that ensureBrowser has already run.
	started bool
}
// NewChrome returns a Chrome downloader. When one or more cookie jars are
// supplied the first one is used; otherwise a fresh jar is created. The
// browser itself is not started here.
func NewChrome(jar ...*cookiejar.Jar) Surfer {
	var cookies *cookiejar.Jar
	if len(jar) > 0 {
		cookies = jar[0]
	} else {
		cookies, _ = cookiejar.New(nil)
	}
	return &Chrome{CookieJar: cookies}
}
// ensureBrowser lazily starts the shared Chrome process. Must be called
// while c.mu is held.
//
// The allocator options, including the User-Agent ua, are fixed by the
// first call; later calls return immediately and their ua is ignored.
//
// The root context is run once with no actions so that the browser is
// actually allocated on it. Without this, browserCtx would carry no
// browser and every tab context derived from it in Download would launch
// (and on cancel, tear down) a browser of its own instead of opening a tab
// in the shared one. If the start-up fails the error is logged and the
// contexts are kept, so Download behaves as it did before: each tab
// context attempts its own allocation and reports its own error.
func (c *Chrome) ensureBrowser(ua string) {
	if c.started {
		return
	}
	opts := chromeAllocatorOpts(ua)
	c.allocCtx, c.allocCancel = chromedp.NewExecAllocator(context.Background(), opts...)
	c.browserCtx, c.browserCancel = chromedp.NewContext(c.allocCtx)
	if err := chromedp.Run(c.browserCtx); err != nil {
		log.Printf("[W] Chrome: starting shared browser: %v", err)
	}
	c.started = true
}
// chromeAllocatorOpts builds the exec allocator option list: a copy of
// chromedp's default options followed by the headless, GPU, sandbox and
// automation-related flags below and a 1920x1080 window size. A non-empty
// ua is appended last as the browser's User-Agent.
func chromeAllocatorOpts(ua string) []chromedp.ExecAllocatorOption {
	// Copy the defaults so the package-level value is never appended to.
	opts := append([]chromedp.ExecAllocatorOption(nil), chromedp.DefaultExecAllocatorOptions[:]...)
	opts = append(opts,
		chromedp.Flag("headless", true),
		chromedp.Flag("disable-gpu", true),
		chromedp.Flag("no-sandbox", true),
		chromedp.Flag("disable-blink-features", "AutomationControlled"),
		chromedp.Flag("enable-automation", false),
		chromedp.WindowSize(1920, 1080),
	)
	if ua == "" {
		return opts
	}
	return append(opts, chromedp.UserAgent(ua))
}
// hideWebdriver returns an action that evaluates a script redefining
// navigator.webdriver so that its getter yields undefined, hiding the flag
// anti-bot scripts use to detect headless automation. The evaluation
// result is discarded.
//
// NOTE(review): the script runs in whatever document is loaded when the
// action executes; whether the override is still in effect after a later
// navigation should be confirmed.
func hideWebdriver() chromedp.Action {
	const script = `Object.defineProperty(navigator, 'webdriver', {get: () => undefined})`
	var discarded any
	return chromedp.Evaluate(script, &discarded)
}
// Download implements the Surfer interface by loading the request URL in a
// new browser tab and returning the rendered HTML as the response body.
//
// The tab is bounded by the request's connection timeout (60 seconds when
// that is not positive); the same deadline covers every attempt. The
// download is tried GetTryTimes times (at least once), pausing
// GetRetryPause between attempts and logging each failure.
//
// The result is always Ok unless parameter construction panics: a failed
// download is reported as a 502 response whose Status carries the error
// text and whose body is empty; success is a 200 response.
func (c *Chrome) Download(req Request) (r result.Result[*http.Response]) {
	defer r.Catch()
	param := NewParam(req).Unwrap()

	c.mu.Lock()
	c.ensureBrowser(param.header.Get("User-Agent"))
	c.mu.Unlock()

	limit := req.GetConnTimeout()
	if limit <= 0 {
		limit = 60 * time.Second
	}

	// Open a new tab inside the shared browser; cookies are shared
	// across tabs within the same browser context.
	tab, closeTab := chromedp.NewContext(c.browserCtx)
	defer closeTab()
	runCtx, stopTimer := context.WithTimeout(tab, limit)
	defer stopTimer()

	attempts := req.GetTryTimes()
	if attempts <= 0 {
		attempts = 1
	}

	var (
		html  string
		dlErr error
	)
	for attempt := 1; attempt <= attempts; attempt++ {
		if attempt > 1 {
			time.Sleep(req.GetRetryPause())
		}
		html, dlErr = tryDownload(runCtx, req.GetURL())
		if dlErr == nil {
			break
		}
		log.Printf("[W] Chrome attempt %d/%d for %s: %v", attempt, attempts, req.GetURL(), dlErr)
	}

	resp := &http.Response{
		Header: make(http.Header),
		Request: &http.Request{
			Method: strings.ToUpper(req.GetMethod()),
			Header: param.header,
			URL:    param.url,
			Host:   param.url.Host,
		},
	}

	status, statusText, payload := http.StatusOK, http.StatusText(http.StatusOK), html
	if dlErr != nil {
		status, statusText, payload = http.StatusBadGateway, dlErr.Error(), ""
	}
	resp.StatusCode = status
	resp.Status = statusText
	resp.Body = io.NopCloser(strings.NewReader(payload))
	return result.Ok(resp)
}
// tryDownload loads targetURL in the tab behind ctx and returns the outer
// HTML of its <html> element.
//
// The visit follows a "homepage-first" pattern within the same tab:
//  1. When ExtractHomepage yields a non-empty homepage different from the
//     target, the webdriver flag is hidden and the homepage is loaded,
//     followed by a one-second pause. This lets the site set session
//     cookies and gives the next hop a natural Referer.
//  2. The target URL is loaded, followed by a three-second pause.
//
// If the resulting page looks like a security verification page, up to
// ten more seconds are allowed for it to redirect on its own; if it is
// still a verification page afterwards an error is returned so the caller
// can retry.
func tryDownload(ctx context.Context, targetURL string) (string, error) {
	// Step 1: visit the homepage first to look like a real user.
	if home := ExtractHomepage(targetURL); home != "" && home != targetURL {
		err := chromedp.Run(ctx,
			hideWebdriver(),
			chromedp.Navigate(home),
			chromedp.WaitReady("body"),
			chromedp.Sleep(1*time.Second),
		)
		if err != nil {
			return "", err
		}
	}

	// Step 2: navigate to the actual target URL.
	err := chromedp.Run(ctx,
		chromedp.Navigate(targetURL),
		chromedp.WaitReady("body"),
		chromedp.Sleep(3*time.Second),
	)
	if err != nil {
		return "", err
	}

	// Some verification pages auto-redirect once their JS finishes, so a
	// first hit is given a grace period before it counts as a block.
	if isVerificationPage(ctx) {
		waitUntilNotVerification(ctx, 10*time.Second)
		if isVerificationPage(ctx) {
			return "", fmt.Errorf("blocked by security verification at %s", targetURL)
		}
	}

	var html string
	if err = chromedp.Run(ctx, chromedp.OuterHTML("html", &html)); err != nil {
		return "", err
	}
	return html, nil
}
// waitUntilNotVerification checks the page roughly once per second and
// returns as soon as isVerificationPage reports false, or once maxWait has
// elapsed. The deadline is only examined between checks, so the call may
// overrun maxWait by up to one polling interval.
func waitUntilNotVerification(ctx context.Context, maxWait time.Duration) {
	for limit := time.Now().Add(maxWait); time.Now().Before(limit); time.Sleep(1 * time.Second) {
		if !isVerificationPage(ctx) {
			return
		}
	}
}
// isVerificationPage reports whether the current page title contains one
// of the known security verification indicators.
//
// The comparison is case-insensitive so that titles such as
// "Security Verification" or "Security Check" are recognised as well as
// their lower-case forms. When the title cannot be read the page is
// treated as not being a verification page.
func isVerificationPage(ctx context.Context) bool {
	var title string
	if err := chromedp.Run(ctx, chromedp.Title(&title)); err != nil {
		return false
	}
	title = strings.ToLower(title)
	// All markers are lower-case to match the lowered title.
	for _, marker := range []string{"security verification", "verify", "security check"} {
		if strings.Contains(title, marker) {
			return true
		}
	}
	return false
}
================================================
FILE: app/downloader/surfer/chrome_stub.go
================================================
//go:build cover
package surfer
import (
"errors"
"net/http"
"net/http/cookiejar"
"github.com/andeya/gust/result"
)
// ChromeStub stands in for the Chrome downloader in builds made with the
// "cover" build tag. Its Download method always reports an error.
type ChromeStub struct {
	// CookieJar is set by NewChrome; Download does not use it.
	CookieJar *cookiejar.Jar
}
// NewChrome returns the coverage-mode stand-in for the Chrome downloader.
// When one or more cookie jars are supplied the first one is used;
// otherwise a fresh jar is created.
func NewChrome(jar ...*cookiejar.Jar) Surfer {
	var cookies *cookiejar.Jar
	if len(jar) > 0 {
		cookies = jar[0]
	} else {
		cookies, _ = cookiejar.New(nil)
	}
	return &ChromeStub{CookieJar: cookies}
}
// Download implements the Surfer interface. It never performs a request
// and always returns an error result stating that Chrome is unavailable in
// coverage mode.
func (c *ChromeStub) Download(req Request) result.Result[*http.Response] {
	unavailable := errors.New("chrome not available in coverage mode")
	return result.TryErr[*http.Response](unavailable)
}
================================================
FILE: app/downloader/surfer/chrome_test.go
================================================
package surfer
import (
"io"
"strings"
"testing"
)
// TestChromeDownloaderBaiduSearch downloads a Baidu search page through
// the Chrome downloader and checks that the body contains search result
// elements. It is skipped in short mode.
func TestChromeDownloaderBaiduSearch(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping Chrome test in short mode")
	}

	downloader := NewChrome()
	res := downloader.Download(&DefaultRequest{
		URL:          "https://www.baidu.com/s?wd=golang&pn=0",
		DownloaderID: ChromeID,
	})
	if res.IsErr() {
		t.Fatalf("download error: %v", res.UnwrapErr())
	}

	resp := res.Unwrap()
	if resp.StatusCode != 200 {
		t.Fatalf("unexpected status: %d %s", resp.StatusCode, resp.Status)
	}

	raw, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("read body error: %v", err)
	}
	html := string(raw)
	t.Logf("body length: %d", len(html))

	if !strings.Contains(html, `class="result`) && !strings.Contains(html, `class="c-container`) {
		t.Fatal("No search result elements found in page")
	}
	t.Log("SUCCESS: first request returned Baidu search results")
}
// TestChromeDownloaderReuseSession issues several searches against the
// same domain through one Chrome downloader and requires every one of
// them, not just the first, to return search results. It is skipped in
// short mode.
func TestChromeDownloaderReuseSession(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping Chrome test in short mode")
	}

	downloader := NewChrome()
	for idx, keyword := range []string{"golang", "pholcus"} {
		seq := idx + 1
		res := downloader.Download(&DefaultRequest{
			URL:          "https://www.baidu.com/s?wd=" + keyword + "&pn=0",
			DownloaderID: ChromeID,
		})
		if res.IsErr() {
			t.Fatalf("request %d (%s) download error: %v", seq, keyword, res.UnwrapErr())
		}

		resp := res.Unwrap()
		raw, _ := io.ReadAll(resp.Body)
		html := string(raw)
		t.Logf("request %d (%s): status=%d body_length=%d", seq, keyword, resp.StatusCode, len(html))

		if !strings.Contains(html, `class="result`) && !strings.Contains(html, `class="c-container`) {
			t.Fatalf("request %d (%s): no search results found", seq, keyword)
		}
		t.Logf("request %d (%s): OK", seq, keyword)
	}
}
// TestExtractHomepage checks that ExtractHomepage reduces a URL to its
// scheme and host, and yields an empty string for input that is not a
// usable URL.
func TestExtractHomepage(t *testing.T) {
	type testCase struct {
		input string
		want  string
	}
	cases := []testCase{
		{input: "https://www.baidu.com/s?wd=go", want: "https://www.baidu.com"},
		{input: "https://www.google.com/search?q=go", want: "https://www.google.com"},
		{input: "http://example.com", want: "http://example.com"},
		{input: "http://example.com/path/page", want: "http://example.com"},
		{input: "invalid-url", want: ""},
	}
	for _, tc := range cases {
		if got := ExtractHomepage(tc.input); got != tc.want {
			t.Errorf("extractHomepage(%q) = %q, want %q", tc.input, got, tc.want)
		}
	}
}
================================================
FILE: app/downloader/surfer/example/example.go
================================================
package main
import (
"io"
"log"
"time"
"github.com/andeya/pholcus/app/downloader/surfer"
)
// main exercises the surfer package end to end: a GET and a form POST
// through the default downloader, then the same two requests through the
// downloader with ID 1 (PhantomJS). For each request it logs a banner, the
// response status and headers, and the body together with any read error.
// A download error terminates the program. Temporary JS files are removed
// at the end, followed by a ten-second pause.
func main() {
	var values = "username=123456@qq.com&password=123456&login_btn=login_btn&submit=login_btn"

	// run performs one download and logs its outcome under the given
	// banner and site label.
	run := func(banner, site string, req *surfer.DefaultRequest) {
		log.Println(banner)
		r := surfer.Download(req)
		if r.IsErr() {
			log.Fatal(r.UnwrapErr())
		}
		resp := r.Unwrap()
		log.Printf("%s resp.Status: %s\nresp.Header: %#v\n", site, resp.Status, resp.Header)
		b, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		log.Printf("%s resp.Body: %s\nerr: %v", site, b, err)
	}

	run("********************************************* Surf GET download test start *********************************************",
		"baidu",
		&surfer.DefaultRequest{
			URL: "http://www.baidu.com/",
		})

	run("********************************************* Surf POST download test start *********************************************",
		"lewaos",
		&surfer.DefaultRequest{
			URL:      "http://accounts.lewaos.com/",
			Method:   "POST",
			PostData: values,
		})

	run("********************************************* PhantomJS GET download test start *********************************************",
		"baidu",
		&surfer.DefaultRequest{
			URL:          "http://www.baidu.com/",
			DownloaderID: 1,
		})

	run("********************************************* PhantomJS POST download test start *********************************************",
		"lewaos",
		&surfer.DefaultRequest{
			DownloaderID: 1,
			URL:          "http://accounts.lewaos.com/",
			Method:       "POST",
			PostData:     values,
		})

	surfer.DestroyJsFiles()
	time.Sleep(10 * time.Second)
}
================================================
FILE: app/downloader/surfer/param.go
================================================
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package surfer
import (
"bytes"
"fmt"
"io"
"math/rand"
"mime/multipart"
"net/http"
"net/url"
"strings"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/downloader/surfer/agent"
)
// Param holds the normalised form of a Request as built by NewParam.
type Param struct {
	// method is the HTTP method: "GET", "HEAD" or "POST". "POST-M"
	// requests are stored as "POST"; anything else becomes "GET".
	method string
	// url is the request URL as returned by URLEncode.
	url *url.URL
	// proxy is the parsed proxy URL, or nil when the request has none.
	proxy *url.URL
	// body is the request body reader; set only for POST and POST-M.
	body io.Reader
	// header is the request's own header value (not a copy), or a new
	// header when the request has none.
	header http.Header
	// enableCookie mirrors Request.GetEnableCookie.
	enableCookie bool
	// dialTimeout is the request's dial timeout; negative values are
	// stored as 0.
	dialTimeout time.Duration
	// connTimeout, tryTimes and retryPause are copied from the request
	// unchanged.
	connTimeout time.Duration
	tryTimes    int
	retryPause  time.Duration
	// redirectTimes drives checkRedirect: 0 allows any number of
	// redirects, a negative value allows none.
	redirectTimes int
	// client is not assigned in this file.
	client *http.Client
}
// NewParam converts a Request into a Param.
//
// The URL is passed through URLEncode and the proxy, when set, through
// url.Parse; a failure of either (or of closing the multipart writer) is
// unwrapped and surfaces through the result's Catch as an error result.
//
// Method handling (case-insensitive):
//   - GET, HEAD: used as is.
//   - POST: the post data is sent as the body and a form-urlencoded
//     Content-Type is added.
//   - POST-M: the post data is parsed as a query string and re-encoded as a
//     multipart form; the method becomes POST.
//   - anything else: GET.
//
// The header is the request's own header value, so additions made here are
// visible on the request. When it carries no User-Agent, one is taken from
// agent.UserAgents["common"]: the first entry when cookies are enabled,
// otherwise a randomly chosen entry.
func NewParam(req Request) (r result.Result[*Param]) {
	defer r.Catch()

	p := &Param{}
	p.url = result.Ret(URLEncode(req.GetURL())).Unwrap()
	if req.GetProxy() != "" {
		p.proxy = result.Ret(url.Parse(req.GetProxy())).Unwrap()
	}

	if p.header = req.GetHeader(); p.header == nil {
		p.header = make(http.Header)
	}

	method := strings.ToUpper(req.GetMethod())
	switch method {
	case "POST":
		p.method = method
		p.header.Add("Content-Type", "application/x-www-form-urlencoded")
		p.body = strings.NewReader(req.GetPostData())
	case "POST-M":
		p.method = "POST"
		form := &bytes.Buffer{}
		mw := multipart.NewWriter(form)
		fields, _ := url.ParseQuery(req.GetPostData())
		for key, vals := range fields {
			for _, val := range vals {
				mw.WriteField(key, val)
			}
		}
		result.RetVoid(mw.Close()).Unwrap()
		p.header.Add("Content-Type", mw.FormDataContentType())
		p.body = form
	case "GET", "HEAD":
		p.method = method
	default:
		p.method = "GET"
	}

	p.enableCookie = req.GetEnableCookie()

	if p.header.Get("User-Agent") == "" {
		common := agent.UserAgents["common"]
		pick := 0
		if !p.enableCookie {
			// Without cookies there is no session to keep consistent,
			// so each request may present a different agent.
			rng := rand.New(rand.NewSource(time.Now().UnixNano()))
			pick = rng.Intn(len(common))
		}
		p.header.Add("User-Agent", common[pick])
	}

	// Negative dial timeouts are stored as zero.
	if d := req.GetDialTimeout(); d > 0 {
		p.dialTimeout = d
	}
	p.connTimeout = req.GetConnTimeout()
	p.tryTimes = req.GetTryTimes()
	p.retryPause = req.GetRetryPause()
	p.redirectTimes = req.GetRedirectTimes()

	return result.Ok(p)
}
// writeback copies the request method, header and host from p into resp's
// Request and returns resp. A nil resp, a nil resp.Request and a nil
// resp.Header are each replaced by a freshly allocated value first. The
// request URL is not set here.
func (p *Param) writeback(resp *http.Response) *http.Response {
	if resp == nil {
		resp = new(http.Response)
	}
	if resp.Request == nil {
		resp.Request = new(http.Request)
	}
	if resp.Header == nil {
		resp.Header = make(http.Header)
	}

	out := resp.Request
	out.Method = p.method
	out.Header = p.header
	out.Host = p.url.Host
	return resp
}
// checkRedirect is intended for use as http.Client.CheckRedirect.
//
// The policy depends on p.redirectTimes:
//   - 0: redirects are never refused;
//   - negative: every redirect is refused;
//   - positive: a redirect is refused once len(via) reaches that number.
func (p *Param) checkRedirect(req *http.Request, via []*http.Request) error {
	switch {
	case p.redirectTimes == 0:
		return nil
	case p.redirectTimes < 0:
		return fmt.Errorf("not allow redirects.")
	case len(via) >= p.redirectTimes:
		return fmt.Errorf("stopped after %v redirects.", p.redirectTimes)
	}
	return nil
}
================================================
FILE: app/downloader/surfer/param_test.go
================================================
package surfer
import (
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestNewParam checks that NewParam accepts GET, POST and POST-M requests
// and reports an error for a malformed URL.
func TestNewParam(t *testing.T) {
	// newReq builds a request that differs from the others only in URL,
	// method and post data.
	newReq := func(rawURL, method, postData string) *DefaultRequest {
		return &DefaultRequest{
			URL:         rawURL,
			Method:      method,
			PostData:    postData,
			TryTimes:    1,
			RetryPause:  time.Millisecond,
			DialTimeout: time.Second,
		}
	}

	cases := []struct {
		name    string
		req     *DefaultRequest
		wantErr bool
	}{
		{name: "GET", req: newReq("http://example.com", "GET", "")},
		{name: "POST", req: newReq("http://example.com", "POST", "a=1")},
		{name: "POST-M", req: newReq("http://example.com", "POST-M", "k=v")},
		{name: "invalid URL", req: newReq("://invalid", "GET", ""), wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			res := NewParam(tc.req)
			switch {
			case tc.wantErr && res.IsOk():
				t.Error("NewParam expected error")
			case !tc.wantErr && res.IsErr():
				t.Errorf("NewParam err: %v", res.UnwrapErr())
			}
		})
	}
}
func TestNewParamWithProxy(t *testing.T) {
req := &DefaultRequest{
URL: "http://example.com",
Method: "GET",
Proxy: "http://proxy.example.com:8080",
TryTimes: 1,
RetryPause: time.Millisecond,
DialTimeout: time.Second,
}
r := NewParam(req)
if r.IsErr() {
t.Errorf("NewParam with proxy err: %v", r.UnwrapErr())
}
}
func TestNewParamWithUserAgent(t *testing.T) {
req := &DefaultRequest{
URL: "http://example.com",
Method: "GET",
Header: http.Header{"User-Agent": {"CustomAgent/1.0"}},
TryTimes: 1,
RetryPause: time.Millisecond,
DialTimeout: time.Second,
}
r := NewParam(req)
if r.IsErr() {
t.Errorf("NewParam err: %v", r.UnwrapErr())
}
}
// TestRedirectUnlimited checks that with RedirectTimes set to 0 a chain of
// three redirects is followed and the final response is 200.
func TestRedirectUnlimited(t *testing.T) {
	hits := 0
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		hits++
		if hits > 3 {
			w.Write([]byte("ok"))
			return
		}
		// The first three requests are bounced back to the same path.
		http.Redirect(w, r, r.URL.Path, http.StatusFound)
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:           srv.URL,
		Method:        "GET",
		RedirectTimes: 0,
		TryTimes:      3,
		RetryPause:    time.Millisecond,
		DialTimeout:   time.Second,
		ConnTimeout:   time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download err: %v", res.UnwrapErr())
	}

	resp := res.Unwrap()
	defer resp.Body.Close()
	if got := resp.StatusCode; got != http.StatusOK {
		t.Errorf("StatusCode = %d, want 200", got)
	}
}
// TestRedirectLimited checks that an endless redirect loop is reported as
// an error when RedirectTimes is 2.
func TestRedirectLimited(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "/loop", http.StatusFound)
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:           srv.URL,
		Method:        "GET",
		RedirectTimes: 2,
		TryTimes:      3,
		RetryPause:    time.Millisecond,
		DialTimeout:   time.Second,
		ConnTimeout:   time.Second,
	})
	if res.IsOk() {
		t.Error("Download expected redirect error")
	}
}
// TestRedirectNotAllowed checks that any redirect is reported as an error
// when RedirectTimes is negative.
func TestRedirectNotAllowed(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, "/other", http.StatusFound)
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:           srv.URL,
		Method:        "GET",
		RedirectTimes: -1,
		TryTimes:      3,
		RetryPause:    time.Millisecond,
		DialTimeout:   time.Second,
		ConnTimeout:   time.Second,
	})
	if res.IsOk() {
		t.Error("Download expected no-redirect error")
	}
}
================================================
FILE: app/downloader/surfer/phantom.go
================================================
//go:build !cover
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package surfer
import (
"encoding/json"
"fmt"
"io"
"log"
"mime"
"net/http"
"net/http/cookiejar"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
"github.com/andeya/gust/result"
)
type (
	// Phantom is a PhantomJS-based downloader, complementing Surf.
	// Slower than Surf but better at bypassing anti-scraping due to browser simulation.
	// Supports UserAgent, TryTimes, RetryPause, and custom JS.
	//
	// Each download runs the PhantomJS executable as a child process with
	// a temporary JS driver script and reads a JSON document from its
	// standard output.
	Phantom struct {
		PhantomjsFile string            // full path to PhantomJS executable
		TempJsDir     string            // directory for temporary JS files
		jsFileMap     map[string]string // existing JS files: short name -> full path
		// CookieJar supplies cookies to, and stores cookies from,
		// downloads whose request has cookies enabled.
		CookieJar *cookiejar.Jar
	}
	// Response parses PhantomJS response content: the JSON document the
	// driver script prints on standard output.
	Response struct {
		// Cookies are added to the HTTP response as Set-Cookie headers.
		Cookies []string
		// Body becomes the HTTP response body.
		Body string
		// Error, when non-empty, marks the attempt as failed.
		Error string
		// Header lists the response headers as name/value pairs.
		Header []struct {
			Name  string
			Value string
		}
	}
	// Cookie is used to pass cookies to PhantomJS. A slice of these is
	// JSON-encoded and handed to the driver script as an argument.
	Cookie struct {
		Name   string `json:"name"`
		Value  string `json:"value"`
		Domain string `json:"domain"`
		Path   string `json:"path"`
	}
)
// NewPhantom returns a PhantomJS downloader that runs the executable at
// phantomjsFile and keeps its driver script in tempJsDir.
//
// When one or more cookie jars are supplied the first one is used;
// otherwise a fresh jar is created. Relative paths are made absolute; a
// path that cannot be resolved is logged and kept as given. The temp
// directory is created if needed and the driver script is written into it
// under the name "js". If the directory cannot be created the error is
// logged and the downloader is returned without a script.
func NewPhantom(phantomjsFile, tempJsDir string, jar ...*cookiejar.Jar) Surfer {
	p := &Phantom{
		PhantomjsFile: phantomjsFile,
		TempJsDir:     tempJsDir,
		jsFileMap:     make(map[string]string),
	}

	if len(jar) > 0 {
		p.CookieJar = jar[0]
	} else {
		p.CookieJar, _ = cookiejar.New(nil) // nil options never returns error
	}

	// absolutize rewrites *path to its absolute form in place, leaving it
	// untouched when it is already absolute or cannot be resolved.
	absolutize := func(path *string) {
		if filepath.IsAbs(*path) {
			return
		}
		abs, err := filepath.Abs(*path)
		if err != nil {
			log.Printf("[E] Surfer: filepath.Abs(%q): %v", *path, err)
			return
		}
		*path = abs
	}
	absolutize(&p.PhantomjsFile)
	absolutize(&p.TempJsDir)

	if err := os.MkdirAll(p.TempJsDir, 0777); err != nil {
		log.Printf("[E] Surfer: %v\n", err)
		return p
	}
	p.createJsFile("js", js)
	return p
}
// Download implements the Surfer interface by running PhantomJS once per
// attempt and converting the JSON it prints into an *http.Response.
//
// The page encoding handed to PhantomJS is the charset of the request's
// Content-Type header ("utf-8" when absent); that header is then removed
// from the request before the parameters are built. When cookies are
// enabled, the jar's cookies for the URL are passed to PhantomJS and any
// cookies it reports are stored back into the jar.
//
// At least one attempt is always made, with param.retryPause between
// attempts. An attempt fails when the process cannot be run, its output
// cannot be read or decoded, or the decoded document carries an error
// message.
//
// The result is Ok unless parameter construction panics. Success is a 200
// response carrying the headers, cookies and body reported by PhantomJS;
// when every attempt fails the response is a 502 whose Status holds the
// last error and whose body is empty. The Body is never nil.
func (p *Phantom) Download(req Request) (r result.Result[*http.Response]) {
	defer r.Catch()
	var encoding = "utf-8"
	if _, params, err := mime.ParseMediaType(req.GetHeader().Get("Content-Type")); err == nil {
		if cs, ok := params["charset"]; ok {
			encoding = strings.ToLower(strings.TrimSpace(cs))
		}
	}
	req.GetHeader().Del("Content-Type")
	param := NewParam(req).Unwrap()

	cookie := ""
	if req.GetEnableCookie() {
		httpCookies := p.CookieJar.Cookies(param.url)
		if len(httpCookies) > 0 {
			surferCookies := make([]*Cookie, len(httpCookies))
			for n, c := range httpCookies {
				surferCookies[n] = &Cookie{Name: c.Name, Value: c.Value, Domain: param.url.Host, Path: "/"}
			}
			c, err := json.Marshal(surferCookies)
			if err != nil {
				log.Printf("cookie marshal error:%v", err)
			}
			cookie = string(c)
		}
	}

	resp := param.writeback(nil)
	resp.Request.URL = param.url

	// Argument order must match the system.args indices read by the
	// driver script.
	var args = []string{
		p.jsFileMap["js"],
		req.GetURL(),
		cookie,
		encoding,
		param.header.Get("User-Agent"),
		req.GetPostData(),
		strings.ToLower(param.method),
		fmt.Sprint(int(req.GetDialTimeout() / time.Millisecond)),
	}
	if req.GetProxy() != "" {
		args = append([]string{"--proxy=" + req.GetProxy()}, args...)
	}

	// A non-positive try count would otherwise skip the loop and report
	// success without ever running PhantomJS.
	attempts := param.tryTimes
	if attempts <= 0 {
		attempts = 1
	}

	var (
		out Response
		err error
	)
	for i := 0; i < attempts; i++ {
		if i != 0 {
			time.Sleep(param.retryPause)
		}
		if out, err = p.runPhantom(args); err == nil {
			break
		}
	}

	if err != nil {
		resp.StatusCode = http.StatusBadGateway
		resp.Status = err.Error()
		resp.Body = io.NopCloser(strings.NewReader(""))
		return result.Ok(resp)
	}

	for _, h := range out.Header {
		resp.Header.Add(h.Name, h.Value)
	}
	for _, c := range out.Cookies {
		resp.Header.Add("Set-Cookie", c)
	}
	if req.GetEnableCookie() {
		if rc := resp.Cookies(); len(rc) > 0 {
			p.CookieJar.SetCookies(param.url, rc)
		}
	}
	resp.StatusCode = http.StatusOK
	resp.Status = http.StatusText(http.StatusOK)
	resp.Body = io.NopCloser(strings.NewReader(out.Body))
	return result.Ok(resp)
}

// runPhantom executes PhantomJS once with args, decodes the JSON document
// it prints on standard output and returns it. An error message inside the
// document is logged and returned as an error.
func (p *Phantom) runPhantom(args []string) (Response, error) {
	var out Response
	cmd := exec.Command(p.PhantomjsFile, args...)
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return out, err
	}
	if err = cmd.Start(); err != nil {
		return out, err
	}
	raw, readErr := io.ReadAll(stdout)
	// Wait only after the pipe has been drained; it reaps the child and
	// releases the pipe. The exit status is deliberately ignored because
	// success is decided by the decoded output below.
	_ = cmd.Wait()
	if readErr != nil {
		return out, readErr
	}
	if err = json.Unmarshal(raw, &out); err != nil {
		return out, err
	}
	if out.Error != "" {
		log.Printf("phantomjs response error:%s", out.Error)
		return out, fmt.Errorf("phantomjs response error: %s", out.Error)
	}
	return out, nil
}
// DestroyJsFiles removes every file recorded in jsFileMap. Afterwards, if
// WalkDir reports exactly one entry for the directory portion of
// TempJsDir (as split off by filepath.Split), that directory is removed as
// well. Nothing is done when that directory portion is empty, and removal
// errors are ignored.
func (p *Phantom) DestroyJsFiles() {
	parent, _ := filepath.Split(p.TempJsDir)
	if parent != "" {
		for _, path := range p.jsFileMap {
			os.Remove(path)
		}
		if entries := WalkDir(parent); len(entries) == 1 {
			os.Remove(parent)
		}
	}
}
// createJsFile writes jsCode to a file named fileName inside TempJsDir,
// replacing any existing file, and records the full path in jsFileMap
// under fileName.
//
// A write failure is logged rather than returned. The path is recorded
// either way, so later lookups by fileName still resolve to the intended
// location.
func (p *Phantom) createJsFile(fileName, jsCode string) {
	fullFileName := filepath.Join(p.TempJsDir, fileName)
	if err := os.WriteFile(fullFileName, []byte(jsCode), 0666); err != nil {
		log.Printf("[E] Surfer: write %q: %v", fullFileName, err)
	}
	p.jsFileMap[fileName] = fullFileName
}
// js is the PhantomJS driver script. It reads its inputs from the command
// line, opens the page and prints one JSON object to stdout before exiting.
// The object carries "Header", "Cookies" and "Body" when the page loads, or
// "Error" when a page error, resource error, resource timeout or failed load
// is reported.
//
// Positional arguments as seen by the script:
//
//	system.args[0] == js (the script file)
//	system.args[1] == url
//	system.args[2] == cookie (JSON array of {name, value, domain, path}; may be "")
//	system.args[3] == pageEncode (assigned to phantom.outputEncoding)
//	system.args[4] == userAgent
//	system.args[5] == postdata
//	system.args[6] == method
//	system.args[7] == timeout (assigned to page.settings.resourceTimeout)
const js string = `
var system = require('system');
var page = require('webpage').create();
var url = system.args[1];
var cookie = system.args[2];
var pageEncode = system.args[3];
var userAgent = system.args[4];
var postdata = system.args[5];
var method = system.args[6];
var timeout = system.args[7];
var ret = new Object();
var exit = function () {
console.log(JSON.stringify(ret));
phantom.exit();
};
// output params
// console.log("url=" + url);
// console.log("cookie=" + cookie);
// console.log("pageEncode=" + pageEncode);
// console.log("userAgent=" + userAgent);
// console.log("postdata=" + postdata);
// console.log("method=" + method);
// console.log("timeout=" + timeout);
// ret += (url + "\n");
// ret += (cookie + "\n");
// ret += (pageEncode + "\n");
// ret += (userAgent + "\n");
// ret += (postdata + "\n");
// ret += (method + "\n");
// ret += (timeout + "\n");
// exit();
phantom.outputEncoding = pageEncode;
page.settings.userAgent = userAgent;
page.settings.resourceTimeout = timeout;
page.settings.XSSAuditingEnabled = true;
function addCookie() {
if (cookie != "") {
var cookies = JSON.parse(cookie);
for (var i = 0; i < cookies.length; i++) {
var c = cookies[i];
phantom.addCookie({
'name': c.name, /* required property */
'value': c.value, /* required property */
'domain': c.domain,
'path': c.path, /* required property */
});
}
}
}
addCookie();
page.onResourceRequested = function (requestData, request) {
};
page.onResourceReceived = function (response) {
if (response.stage === "end") {
// console.log("liguoqinjim received1------------------------------------------------");
// console.log("url=" + response.url);
//
// for (var j in response.headers) { // iterate object properties with for/in
// // var m = sprintf("AttrId[%d]Value[%d]", j, result.Attrs[j]);
// // message += m;
// // console.log(response.headers[j]);
// console.log(response.headers[j]["name"] + ":" + response.headers[j]["value"]);
// }
//
// console.log("liguoqinjim received2------------------------------------------------");
ret["Header"] = response.headers;
}
};
page.onError = function (msg, trace) {
ret["Error"] = msg;
exit();
};
page.onResourceTimeout = function (e) {
// console.log("phantomjs onResourceTimeout error");
// console.log(e.errorCode); // it'll probably be 408
// console.log(e.errorString); // it'll probably be 'Network timeout on resource'
// console.log(e.url); // the url whose request timed out
// phantom.exit(1);
ret["Error"] = "onResourceTimeout";
exit();
};
page.onResourceError = function (e) {
// console.log("onResourceError");
// console.log("1:" + e.errorCode + "," + e.errorString);
if (e.errorCode != 5) { // errorCode=5 conflicts with onResourceTimeout
ret["Error"] = "onResourceError";
exit();
}
};
page.onLoadFinished = function (status) {
if (status !== 'success') {
ret["Error"] = "status=" + status;
exit();
} else {
var cookies = new Array();
for (var i in page.cookies) {
var cookie = page.cookies[i];
var c = cookie["name"] + "=" + cookie["value"];
for (var obj in cookie) {
if (obj == 'name' || obj == 'value') {
continue;
}
if (obj == "httponly" || obj == "secure") {
if (cookie[obj] == true) {
c += ";" + obj;
}
} else {
c += "; " + obj + "=" + cookie[obj];
}
}
cookies[i] = c;
}
if (page.content.indexOf("body") != -1) {
ret["Cookies"] = cookies;
ret["Body"] = page.content;
// ret = JSON.stringify(resp);
exit();
}
}
};
page.open(url, method, postdata, function (status) {
});
`
================================================
FILE: app/downloader/surfer/phantom_stub.go
================================================
//go:build cover
package surfer
import (
"errors"
"net/http"
"net/http/cookiejar"
"github.com/andeya/gust/result"
)
// PhantomStub stands in for the PhantomJS downloader in builds made with the
// "cover" build tag. It only carries a cookie jar; its Download always fails.
type PhantomStub struct {
CookieJar *cookiejar.Jar
}
// NewPhantom returns a PhantomStub as a Surfer. phantomjsFile and tempJsDir
// are accepted for signature compatibility and are not used. The first jar
// argument, when given, becomes the stub's cookie jar; otherwise a fresh jar
// is created.
func NewPhantom(phantomjsFile, tempJsDir string, jar ...*cookiejar.Jar) Surfer {
	stub := new(PhantomStub)
	if len(jar) > 0 {
		stub.CookieJar = jar[0]
		return stub
	}
	stub.CookieJar, _ = cookiejar.New(nil)
	return stub
}
// Download always reports that PhantomJS is unavailable in coverage builds;
// req is not inspected.
func (p *PhantomStub) Download(req Request) result.Result[*http.Response] {
	unavailable := errors.New("phantom not available in coverage mode")
	return result.TryErr[*http.Response](unavailable)
}
// DestroyJsFiles is a no-op; the stub creates no temporary JS files.
func (p *PhantomStub) DestroyJsFiles() {}
================================================
FILE: app/downloader/surfer/request.go
================================================
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package surfer
import (
"net/http"
"strings"
"sync"
"time"
)
type (
// Request describes one download job handed to a Surfer.
Request interface {
// GetURL returns the target URL.
GetURL() string
// GetMethod returns the request method: GET, POST, POST-M or HEAD.
GetMethod() string
// GetPostData returns the POST values.
GetPostData() string
// GetHeader returns the HTTP request header.
GetHeader() http.Header
// GetEnableCookie reports whether HTTP cookies are enabled.
GetEnableCookie() bool
// GetDialTimeout returns the dial timeout (dial tcp: i/o timeout).
GetDialTimeout() time.Duration
// GetConnTimeout returns the connection timeout (WSARecv tcp: i/o timeout).
GetConnTimeout() time.Duration
// GetTryTimes returns the maximum number of download attempts.
GetTryTimes() int
// GetRetryPause returns the pause between retries.
GetRetryPause() time.Duration
// GetProxy returns the proxy host used for the download.
GetProxy() string
// GetRedirectTimes returns the maximum number of redirects.
GetRedirectTimes() int
// GetDownloaderID selects the downloader: SurfID, PhantomJsID or ChromeID.
GetDownloaderID() int
}
// DefaultRequest is the default Request implementation. Its getters run
// prepare once, which fills in defaults for unset fields.
DefaultRequest struct {
URL string // required
Method string // GET POST POST-M HEAD (default GET); upper-cased by prepare
Header http.Header // http header; allocated by prepare when nil
EnableCookie bool // set in Spider.EnableCookie
// PostData holds the POST values.
PostData string
// DialTimeout is the dial timeout (dial tcp: i/o timeout).
// Zero selects DefaultDialTimeout; a negative value is reset to 0.
DialTimeout time.Duration
// ConnTimeout is the connection timeout (WSARecv tcp: i/o timeout).
// Zero selects DefaultConnTimeout; a negative value is reset to 0.
ConnTimeout time.Duration
// TryTimes is the maximum number of download attempts.
// Zero selects DefaultTryTimes.
TryTimes int
// RetryPause is how long to pause before a retry.
// Zero or negative selects DefaultRetryPause.
RetryPause time.Duration
// RedirectTimes is the maximum number of redirects.
// When RedirectTimes equals 0, redirects are unlimited.
// When RedirectTimes is less than 0, no redirect is followed.
RedirectTimes int
// Proxy is the proxy host used for the download.
Proxy string
// DownloaderID: 0=Surf (high concurrency), 1=PhantomJS (strong anti-block, slow),
// 2=Chrome. Any other value is replaced with SurfID by prepare.
DownloaderID int
once sync.Once // ensures prepare is called only once
}
)
// Downloader identifiers and the defaults that DefaultRequest.prepare applies
// to unset fields.
const (
SurfID = 0 // Surf downloader identifier
PhantomJsID = 1 // PhantomJS downloader identifier
ChromeID = 2 // Chromium headless browser downloader identifier
// Deprecated: Use PhantomJsID instead.
PhomtomJsID = PhantomJsID
DefaultMethod = "GET" // method used when DefaultRequest.Method is empty
DefaultDialTimeout = 2 * time.Minute // used when DefaultRequest.DialTimeout is zero
DefaultConnTimeout = 2 * time.Minute // used when DefaultRequest.ConnTimeout is zero
DefaultTryTimes = 3 // used when DefaultRequest.TryTimes is zero
DefaultRetryPause = 2 * time.Second // used when DefaultRequest.RetryPause is zero or negative
)
// prepare normalizes the request in place: it upper-cases the method
// (defaulting to DefaultMethod), allocates a nil header, clamps negative
// timeouts to zero, replaces zero timeouts, try count and non-positive retry
// pause with their defaults, and falls back to SurfID for any downloader ID
// other than PhantomJsID or ChromeID.
func (dr *DefaultRequest) prepare() {
	method := dr.Method
	if method == "" {
		method = DefaultMethod
	}
	dr.Method = strings.ToUpper(method)

	if dr.Header == nil {
		dr.Header = http.Header{}
	}

	switch {
	case dr.DialTimeout < 0:
		dr.DialTimeout = 0
	case dr.DialTimeout == 0:
		dr.DialTimeout = DefaultDialTimeout
	}

	switch {
	case dr.ConnTimeout < 0:
		dr.ConnTimeout = 0
	case dr.ConnTimeout == 0:
		dr.ConnTimeout = DefaultConnTimeout
	}

	if dr.TryTimes == 0 {
		dr.TryTimes = DefaultTryTimes
	}
	if dr.RetryPause <= 0 {
		dr.RetryPause = DefaultRetryPause
	}

	switch dr.DownloaderID {
	case PhantomJsID, ChromeID:
		// Recognized non-default downloaders are kept as requested.
	default:
		dr.DownloaderID = SurfID
	}
}
// GetURL returns the request URL. The first getter called on a
// DefaultRequest runs prepare through dr.once.
func (dr *DefaultRequest) GetURL() string {
dr.once.Do(dr.prepare)
return dr.URL
}
// GetMethod returns the HTTP method (e.g. GET, POST), upper-cased by
// prepare and defaulting to DefaultMethod when unset.
func (dr *DefaultRequest) GetMethod() string {
dr.once.Do(dr.prepare)
return dr.Method
}
// GetPostData returns the POST request body as stored in PostData.
func (dr *DefaultRequest) GetPostData() string {
dr.once.Do(dr.prepare)
return dr.PostData
}
// GetHeader returns the HTTP request headers. The result is never nil
// because prepare allocates an empty header when none was set.
func (dr *DefaultRequest) GetHeader() http.Header {
dr.once.Do(dr.prepare)
return dr.Header
}
// GetEnableCookie reports whether cookies are enabled for this request.
func (dr *DefaultRequest) GetEnableCookie() bool {
dr.once.Do(dr.prepare)
return dr.EnableCookie
}
// GetDialTimeout returns the dial timeout after prepare has applied
// DefaultDialTimeout to a zero value and clamped a negative value to 0.
func (dr *DefaultRequest) GetDialTimeout() time.Duration {
dr.once.Do(dr.prepare)
return dr.DialTimeout
}
// GetConnTimeout returns the connection read/write timeout after prepare has
// applied DefaultConnTimeout to a zero value and clamped a negative value to 0.
func (dr *DefaultRequest) GetConnTimeout() time.Duration {
dr.once.Do(dr.prepare)
return dr.ConnTimeout
}
// GetTryTimes returns the max retry count; a zero TryTimes is reported as
// DefaultTryTimes.
func (dr *DefaultRequest) GetTryTimes() int {
dr.once.Do(dr.prepare)
return dr.TryTimes
}
// GetRetryPause returns the retry pause duration; a zero or negative
// RetryPause is reported as DefaultRetryPause.
func (dr *DefaultRequest) GetRetryPause() time.Duration {
dr.once.Do(dr.prepare)
return dr.RetryPause
}
// GetProxy returns the proxy address, or "" when none was set.
func (dr *DefaultRequest) GetProxy() string {
dr.once.Do(dr.prepare)
return dr.Proxy
}
// GetRedirectTimes returns the max redirect count exactly as stored;
// prepare does not modify it.
func (dr *DefaultRequest) GetRedirectTimes() int {
dr.once.Do(dr.prepare)
return dr.RedirectTimes
}
// GetDownloaderID returns the downloader ID (0=Surf, 1=PhantomJS, 2=Chrome).
// Any other stored value has been replaced with SurfID by prepare.
func (dr *DefaultRequest) GetDownloaderID() int {
dr.once.Do(dr.prepare)
return dr.DownloaderID
}
================================================
FILE: app/downloader/surfer/request_test.go
================================================
package surfer
import (
"net/http"
"testing"
"time"
)
// TestDefaultRequestPrepare checks the defaults and normalization applied by
// DefaultRequest.prepare, observed through the public getters.
func TestDefaultRequestPrepare(t *testing.T) {
	type verifier func(*testing.T, *DefaultRequest)

	// Each builder returns a check that triggers prepare via GetURL and then
	// compares a single getter against the expected value.
	methodIs := func(want string) verifier {
		return func(t *testing.T, r *DefaultRequest) {
			r.GetURL()
			if got := r.GetMethod(); got != want {
				t.Errorf("Method = %q", got)
			}
		}
	}
	durationIs := func(label string, get func(*DefaultRequest) time.Duration, want time.Duration) verifier {
		return func(t *testing.T, r *DefaultRequest) {
			r.GetURL()
			if got := get(r); got != want {
				t.Errorf("%s = %v", label, got)
			}
		}
	}
	intIs := func(label string, get func(*DefaultRequest) int, want int) verifier {
		return func(t *testing.T, r *DefaultRequest) {
			r.GetURL()
			if got := get(r); got != want {
				t.Errorf("%s = %v", label, got)
			}
		}
	}

	const target = "http://a.com"
	cases := []struct {
		name   string
		req    *DefaultRequest
		verify verifier
	}{
		{"default method", &DefaultRequest{URL: target}, methodIs(DefaultMethod)},
		{"default dial timeout", &DefaultRequest{URL: target},
			durationIs("DialTimeout", (*DefaultRequest).GetDialTimeout, DefaultDialTimeout)},
		{"default conn timeout", &DefaultRequest{URL: target},
			durationIs("ConnTimeout", (*DefaultRequest).GetConnTimeout, DefaultConnTimeout)},
		{"default try times", &DefaultRequest{URL: target},
			intIs("TryTimes", (*DefaultRequest).GetTryTimes, DefaultTryTimes)},
		{"default retry pause", &DefaultRequest{URL: target},
			durationIs("RetryPause", (*DefaultRequest).GetRetryPause, DefaultRetryPause)},
		{"negative dial timeout", &DefaultRequest{URL: target, DialTimeout: -1},
			durationIs("DialTimeout", (*DefaultRequest).GetDialTimeout, 0)},
		{"negative conn timeout", &DefaultRequest{URL: target, ConnTimeout: -1},
			durationIs("ConnTimeout", (*DefaultRequest).GetConnTimeout, 0)},
		{"method uppercase", &DefaultRequest{URL: target, Method: "get"}, methodIs("GET")},
		{"PhantomJsID preserved", &DefaultRequest{URL: target, DownloaderID: PhantomJsID},
			intIs("DownloaderID", (*DefaultRequest).GetDownloaderID, PhantomJsID)},
		{"ChromeID preserved", &DefaultRequest{URL: target, DownloaderID: ChromeID},
			intIs("DownloaderID", (*DefaultRequest).GetDownloaderID, ChromeID)},
		{"invalid DownloaderID defaults to SurfID", &DefaultRequest{URL: target, DownloaderID: 99},
			intIs("DownloaderID", (*DefaultRequest).GetDownloaderID, SurfID)},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			tc.verify(t, tc.req)
		})
	}
}
// TestDefaultRequestGetters checks that explicitly set fields come back
// unchanged from every DefaultRequest getter.
func TestDefaultRequestGetters(t *testing.T) {
	const (
		wantURL   = "http://example.com/path"
		wantProxy = "http://proxy:8080"
	)
	req := &DefaultRequest{
		URL:           wantURL,
		Method:        "POST",
		PostData:      "a=1",
		Header:        http.Header{"X-Custom": {"val"}},
		EnableCookie:  true,
		DialTimeout:   time.Minute,
		ConnTimeout:   time.Minute,
		TryTimes:      5,
		RetryPause:    time.Second,
		RedirectTimes: 3,
		Proxy:         wantProxy,
		DownloaderID:  SurfID,
	}
	req.GetURL()

	if got := req.GetURL(); got != wantURL {
		t.Errorf("GetURL = %q", got)
	}
	if got := req.GetMethod(); got != "POST" {
		t.Errorf("GetMethod = %q", got)
	}
	if got := req.GetPostData(); got != "a=1" {
		t.Errorf("GetPostData = %q", got)
	}
	if got := req.GetHeader().Get("X-Custom"); got != "val" {
		t.Errorf("GetHeader X-Custom = %q", got)
	}
	if enabled := req.GetEnableCookie(); !enabled {
		t.Error("GetEnableCookie = false")
	}
	if got := req.GetDialTimeout(); got != time.Minute {
		t.Errorf("GetDialTimeout = %v", got)
	}
	if got := req.GetConnTimeout(); got != time.Minute {
		t.Errorf("GetConnTimeout = %v", got)
	}
	if got := req.GetTryTimes(); got != 5 {
		t.Errorf("GetTryTimes = %v", got)
	}
	if got := req.GetRetryPause(); got != time.Second {
		t.Errorf("GetRetryPause = %v", got)
	}
	if got := req.GetProxy(); got != wantProxy {
		t.Errorf("GetProxy = %q", got)
	}
	if got := req.GetRedirectTimes(); got != 3 {
		t.Errorf("GetRedirectTimes = %v", got)
	}
	if got := req.GetDownloaderID(); got != SurfID {
		t.Errorf("GetDownloaderID = %v", got)
	}
}
================================================
FILE: app/downloader/surfer/surf.go
================================================
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package surfer
import (
"compress/flate"
"compress/gzip"
"compress/zlib"
"crypto/tls"
"math/rand"
"net"
"net/http"
"net/http/cookiejar"
"strings"
"time"
"github.com/andeya/gust/option"
"github.com/andeya/gust/result"
"github.com/andeya/gust/syncutil"
"github.com/andeya/pholcus/app/downloader/surfer/agent"
)
// Surf is the default Download implementation, built on net/http.
// CookieJar is attached to the HTTP client only for requests that enable cookies.
type Surf struct {
CookieJar *cookiejar.Jar
}
// New creates a Surf downloader instance. The first jar argument, when
// present, is used as its cookie jar; otherwise a new empty jar is created.
func New(jar ...*cookiejar.Jar) Surfer {
	if len(jar) > 0 {
		return &Surf{CookieJar: jar[0]}
	}
	// cookiejar.New never returns an error for nil options.
	freshJar, _ := cookiejar.New(nil)
	return &Surf{CookieJar: freshJar}
}
// Download implements the Surfer interface. It builds the request parameters
// and HTTP client, performs the request with retries, and wraps the response
// body in a decompressing reader when Content-Encoding is gzip, deflate or
// zlib before handing the response to param.writeback.
func (s *Surf) Download(req Request) (r result.Result[*http.Response]) {
	// NOTE(review): r.Catch presumably converts a panicking Unwrap below into
	// an Err result — confirm against the gust documentation.
	defer r.Catch()

	param := NewParam(req).Unwrap()
	param.header.Set("Connection", "close")
	param.client = s.buildClient(param)

	resp, reqErr := s.httpRequest(param)
	result.RetVoid(reqErr).Unwrap()

	if encoding := resp.Header.Get("Content-Encoding"); encoding == "gzip" {
		gz, gzErr := gzip.NewReader(resp.Body)
		result.RetVoid(gzErr).Unwrap()
		resp.Body = gz
	} else if encoding == "deflate" {
		resp.Body = flate.NewReader(resp.Body)
	} else if encoding == "zlib" {
		zr, zErr := zlib.NewReader(resp.Body)
		result.RetVoid(zErr).Unwrap()
		resp.Body = zr
	}

	return result.Ok(param.writeback(resp))
}
// dnsCache is the package-wide cache consulted by the dialer in buildClient.
var dnsCache = &DnsCache{}
// DnsCache maps a dial address to the "ip:port" string recorded for it.
// buildClient stores the remote address of a successful dial here and reuses
// it for later dials of the same address.
type DnsCache struct {
ipPortLib syncutil.SyncMap[string, string]
}
// Reg stores ipPort as the cached entry for addr, replacing any previous one.
func (d *DnsCache) Reg(addr, ipPort string) {
d.ipPortLib.Store(addr, ipPort)
}
// Del removes the cached entry for addr.
func (d *DnsCache) Del(addr string) {
d.ipPortLib.Delete(addr)
}
// Query looks up the cached ipPort for addr and returns it as an Option,
// which callers test with IsSome before unwrapping.
func (d *DnsCache) Query(addr string) option.Option[string] {
return d.ipPortLib.Load(addr)
}
// buildClient creates, configures, and returns the *http.Client for one download.
//
// The client uses param.checkRedirect, shares s.CookieJar when cookies are
// enabled, and goes through param.proxy when one is set. For https URLs it
// skips certificate verification and disables transparent compression.
//
// Dialing consults the package-level dnsCache: a cached ip:port is dialed
// instead of addr and evicted if that dial fails; on a cache miss the remote
// address of a successful dial is recorded for addr.
func (s *Surf) buildClient(param *Param) *http.Client {
	dial := func(network, addr string) (net.Conn, error) {
		cached := dnsCache.Query(addr)
		hit := cached.IsSome()
		target := addr
		if hit {
			target = cached.Unwrap()
		}

		conn, err := net.DialTimeout(network, target, param.dialTimeout)
		if err != nil {
			if hit {
				dnsCache.Del(addr)
			}
			return nil, err
		}
		if param.connTimeout > 0 {
			conn.SetDeadline(time.Now().Add(param.connTimeout))
		}
		if !hit {
			dnsCache.Reg(addr, conn.RemoteAddr().String())
		}
		return conn, nil
	}

	transport := &http.Transport{Dial: dial}
	if param.proxy != nil {
		transport.Proxy = http.ProxyURL(param.proxy)
	}
	if strings.ToLower(param.url.Scheme) == "https" {
		transport.TLSClientConfig = &tls.Config{RootCAs: nil, InsecureSkipVerify: true}
		transport.DisableCompression = true
	}

	client := &http.Client{
		CheckRedirect: param.checkRedirect,
		Transport:     transport,
	}
	if param.enableCookie {
		client.Jar = s.CookieJar
	}
	return client
}
// httpRequest builds the *http.Request described by param and sends it with
// param.client, retrying when the client returns an error.
//
// With a positive param.tryTimes at most that many attempts are made;
// otherwise the request is retried until it succeeds. After each failed
// attempt the function switches to a random "common" User-Agent (only when
// cookies are disabled) and sleeps for param.retryPause.
//
// Before every retry the request body is rewound through req.GetBody when
// the request provides it, so a retried POST does not go out with an
// already-consumed body.
//
// It returns the response of the first successful attempt, or the error of
// the last failed one.
func (s *Surf) httpRequest(param *Param) (resp *http.Response, err error) {
	req, err := http.NewRequest(param.method, param.url.String(), param.body)
	if err != nil {
		return nil, err
	}
	req.Header = param.header

	for attempt := 0; param.tryTimes <= 0 || attempt < param.tryTimes; attempt++ {
		if attempt > 0 && req.GetBody != nil {
			body, bodyErr := req.GetBody()
			if bodyErr != nil {
				return nil, bodyErr
			}
			req.Body = body
		}

		resp, err = param.client.Do(req)
		if err == nil {
			break
		}

		if !param.enableCookie {
			// Guard against an empty list: rand.Intn panics for n <= 0.
			if agents := agent.UserAgents["common"]; len(agents) > 0 {
				r := rand.New(rand.NewSource(time.Now().UnixNano()))
				req.Header.Set("User-Agent", agents[r.Intn(len(agents))])
			}
		}
		time.Sleep(param.retryPause)
	}
	return resp, err
}
================================================
FILE: app/downloader/surfer/surf_stub_test.go
================================================
//go:build cover
package surfer
import "testing"
func TestDownloadPhantomJsIDStub(t *testing.T) {
req := &mockRequest{downloaderID: PhantomJsID}
r := Download(req)
if r.IsOk() {
t.Error("Download with PhantomJsID expected error in coverage mode")
}
}
func TestDownloadChromeIDStub(t *testing.T) {
req := &mockRequest{downloaderID: ChromeID}
r := Download(req)
if r.IsOk() {
t.Error("Download with ChromeID expected error in coverage mode")
}
}
func TestDestroyJsFilesStub(t *testing.T) {
req := &mockRequest{downloaderID: PhantomJsID}
Download(req)
DestroyJsFiles()
}
================================================
FILE: app/downloader/surfer/surf_test.go
================================================
package surfer
import (
"bytes"
"compress/flate"
"compress/gzip"
"compress/zlib"
"io"
"net/http"
"net/http/cookiejar"
"net/http/httptest"
"strings"
"testing"
"time"
)
// TestNew checks that New returns a *Surf both with and without a cookie jar.
func TestNew(t *testing.T) {
	plain := New()
	if plain == nil {
		t.Fatal("New() returned nil")
	}
	if _, isSurf := plain.(*Surf); !isSurf {
		t.Errorf("New() = %T, want *Surf", plain)
	}

	jar, _ := cookiejar.New(nil)
	if withJar := New(jar); withJar == nil {
		t.Fatal("New(jar) returned nil")
	}
}
// TestSurfDownload downloads from a local server with GET and HEAD and checks
// the status code and, for GET, the body.
func TestSurfDownload(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("hello"))
	}))
	defer srv.Close()

	cases := []struct {
		name, method, url, want string
	}{
		{"GET", "GET", srv.URL, "hello"},
		{"HEAD", "HEAD", srv.URL, ""},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			res := New().Download(&DefaultRequest{
				URL:         tc.url,
				Method:      tc.method,
				TryTimes:    3,
				RetryPause:  time.Millisecond,
				DialTimeout: time.Second,
				ConnTimeout: time.Second,
			})
			if res.IsErr() {
				t.Fatalf("Download() err: %v", res.UnwrapErr())
			}
			resp := res.Unwrap()
			defer resp.Body.Close()

			if resp.StatusCode != http.StatusOK {
				t.Errorf("StatusCode = %d, want 200", resp.StatusCode)
			}
			body, _ := io.ReadAll(resp.Body)
			if tc.want != "" && !strings.Contains(string(body), tc.want) {
				t.Errorf("body = %q, want to contain %q", body, tc.want)
			}
		})
	}
}
// TestSurfDownloadGzip checks that a gzip-encoded response is decoded.
func TestSurfDownloadGzip(t *testing.T) {
	const payload = "gzip body"
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Encoding", "gzip")
		zw := gzip.NewWriter(w)
		zw.Write([]byte(payload))
		zw.Close()
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "GET",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download() err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != payload {
		t.Errorf("body = %q, want %q", got, payload)
	}
}
// TestSurfDownloadPOST posts form data to an echo server and expects the same
// bytes back.
func TestSurfDownloadPOST(t *testing.T) {
	echo := func(w http.ResponseWriter, r *http.Request) {
		if r.Method != "POST" {
			t.Errorf("method = %s, want POST", r.Method)
		}
		payload, _ := io.ReadAll(r.Body)
		w.Write(payload)
	}
	srv := httptest.NewServer(http.HandlerFunc(echo))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "POST",
		PostData:    "a=1&b=2",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download() err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != "a=1&b=2" {
		t.Errorf("body = %q, want a=1&b=2", got)
	}
}
// TestSurfDownloadPOSTM checks that the POST-M method reaches the server as a
// multipart/form-data POST.
func TestSurfDownloadPOSTM(t *testing.T) {
	inspect := func(w http.ResponseWriter, r *http.Request) {
		if r.Method != "POST" {
			t.Errorf("method = %s, want POST", r.Method)
		}
		if contentType := r.Header.Get("Content-Type"); !strings.HasPrefix(contentType, "multipart/form-data") {
			t.Errorf("Content-Type = %s, want multipart", contentType)
		}
		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(inspect))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "POST-M",
		PostData:    "k=v",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download() err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Errorf("StatusCode = %d", resp.StatusCode)
	}
}
// TestSurfDownloadRetry drops the connection on the first request and expects
// the downloader to retry and succeed.
func TestSurfDownloadRetry(t *testing.T) {
	var attempt int
	flaky := func(w http.ResponseWriter, _ *http.Request) {
		attempt++
		if attempt != 1 {
			w.Write([]byte("ok"))
			return
		}
		// First request: hijack the connection and close it without replying.
		hijacker, canHijack := w.(http.Hijacker)
		if !canHijack {
			http.Error(w, "no hijack", 500)
			return
		}
		conn, _, _ := hijacker.Hijack()
		conn.Close()
	}
	srv := httptest.NewServer(http.HandlerFunc(flaky))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:          srv.URL,
		Method:       "GET",
		TryTimes:     3,
		RetryPause:   time.Millisecond,
		EnableCookie: false,
		DialTimeout:  time.Second,
		ConnTimeout:  time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != "ok" {
		t.Errorf("body = %q", got)
	}
	if attempt < 2 {
		t.Errorf("expected retry, got %d attempts", attempt)
	}
}
// TestSurfDownloadWithCookie checks that a download with cookies enabled succeeds.
func TestSurfDownloadWithCookie(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte("ok"))
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:          srv.URL,
		Method:       "GET",
		TryTimes:     3,
		RetryPause:   time.Millisecond,
		EnableCookie: true,
		DialTimeout:  time.Second,
		ConnTimeout:  time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()
}
// TestSurfDownloadHTTPS downloads from a TLS test server, whose certificate
// the downloader accepts without verification.
func TestSurfDownloadHTTPS(t *testing.T) {
	srv := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Write([]byte("https ok"))
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "GET",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download() err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != "https ok" {
		t.Errorf("body = %q, want https ok", got)
	}
}
// TestSurfDownloadDeflate checks that a deflate-encoded response is decoded.
func TestSurfDownloadDeflate(t *testing.T) {
	var compressed bytes.Buffer
	deflater, _ := flate.NewWriter(&compressed, flate.DefaultCompression)
	deflater.Write([]byte("deflate body"))
	deflater.Close()

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Encoding", "deflate")
		w.Write(compressed.Bytes())
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "GET",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != "deflate body" {
		t.Errorf("deflate body = %q, want deflate body", got)
	}
}
// TestSurfDownloadZlib checks that a zlib-encoded response is decoded.
func TestSurfDownloadZlib(t *testing.T) {
	var compressed bytes.Buffer
	zlibWriter := zlib.NewWriter(&compressed)
	zlibWriter.Write([]byte("zlib body"))
	zlibWriter.Close()

	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Encoding", "zlib")
		w.Write(compressed.Bytes())
	}))
	defer srv.Close()

	res := New().Download(&DefaultRequest{
		URL:         srv.URL,
		Method:      "GET",
		TryTimes:    3,
		RetryPause:  time.Millisecond,
		DialTimeout: time.Second,
		ConnTimeout: time.Second,
	})
	if res.IsErr() {
		t.Fatalf("Download err: %v", res.UnwrapErr())
	}
	resp := res.Unwrap()
	defer resp.Body.Close()

	got, _ := io.ReadAll(resp.Body)
	if string(got) != "zlib body" {
		t.Errorf("zlib body = %q, want zlib body", got)
	}
}
func TestDownloadSurfID(t *testing.T) {
handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.Write([]byte("ok"))
})
srv := httptest.NewServer(handler)
defer srv.Close()
req := &DefaultRequest{
URL: srv.URL,
Method: "GET",
DownloaderID: SurfID,
TryTimes: 3,
RetryPause: time.Millisecond,
DialTimeout: time.Second,
ConnTimeout: time.Second,
}
r := Download(req)
if r.IsErr() {
t.Fatalf("Download err: %v", r.UnwrapErr())
}
resp := r.Unwrap()
defer resp.Body.Close()
body, _ := io.ReadAll(resp.Body)
if string(body) != "ok" {
t.Errorf("body = %q", body)
}
}
// TestDestroyJsFiles only checks that the package-level DestroyJsFiles can be
// called without panicking.
func TestDestroyJsFiles(t *testing.T) {
DestroyJsFiles()
}
func TestDownloadUnknownID(t *testing.T) {
req := &mockRequest{downloaderID: 99}
r := Download(req)
if r.IsOk() {
t.Error("Download expected error for unknown ID")
}
}
// mockRequest is a minimal Request for tests. Every getter returns a fixed
// value except GetDownloaderID, which returns the configured downloaderID
// without the normalization DefaultRequest applies.
type mockRequest struct {
downloaderID int
}
func (m *mockRequest) GetURL() string { return "http://example.com" }
func (m *mockRequest) GetMethod() string { return "GET" }
func (m *mockRequest) GetPostData() string { return "" }
func (m *mockRequest) GetHeader() http.Header { return nil }
func (m *mockRequest) GetEnableCookie() bool { return false }
func (m *mockRequest) GetDialTimeout() time.Duration { return time.Second }
func (m *mockRequest) GetConnTimeout() time.Duration { return time.Second }
func (m *mockRequest) GetTryTimes() int { return 1 }
func (m *mockRequest) GetRetryPause() time.Duration { return time.Millisecond }
func (m *mockRequest) GetProxy() string { return "" }
func (m *mockRequest) GetRedirectTimes() int { return 0 }
func (m *mockRequest) GetDownloaderID() int { return m.downloaderID }
// TestDnsCache exercises the Reg, Query and Del round trip on a fresh cache.
func TestDnsCache(t *testing.T) {
	const (
		host   = "host:80"
		ipPort = "127.0.0.1:80"
	)
	cache := new(DnsCache)

	cache.Reg(host, ipPort)
	found := cache.Query(host)
	if !found.IsSome() || found.Unwrap() != ipPort {
		t.Errorf("Query = %v, want Some(127.0.0.1:80)", found)
	}

	cache.Del(host)
	if gone := cache.Query(host); gone.IsSome() {
		t.Errorf("Query after Del = %v, want None", gone)
	}
}
================================================
FILE: app/downloader/surfer/surfer.go
================================================
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package surfer provides a high-concurrency web downloader written in Go.
// It supports GET/POST/HEAD methods and http/https, fixed UserAgent with cookie
// persistence or random UserAgents without cookies, and simulates browser behavior for login flows.
package surfer
import (
"errors"
"net/http"
"net/http/cookiejar"
"sync"
"github.com/andeya/gust/result"
)
// Package-level downloader singletons. Each one is created lazily by Download
// under its own sync.Once, and all of them share cookieJar.
var (
surf Surfer
phantom Surfer
chrome Surfer
once_surf sync.Once
once_phantom sync.Once
once_chrome sync.Once
// tempJsDir is the directory passed to NewPhantom for its temporary JS files.
tempJsDir = "./tmp"
// phantomjsFile = filepath.Clean(path.Join(os.Getenv("GOPATH"), `/src/github.com/andeya/surfer/phantomjs/phantomjs`))
// phantomjsFile is the PhantomJS executable path passed to NewPhantom.
phantomjsFile = `./phantomjs`
cookieJar, _ = cookiejar.New(nil) // nil options never returns error
)
// Download dispatches req to the downloader selected by req.GetDownloaderID,
// creating that downloader on first use with the shared cookie jar. An ID
// other than SurfID, PhantomJsID or ChromeID yields an error result.
func Download(req Request) result.Result[*http.Response] {
	id := req.GetDownloaderID()
	if id == SurfID {
		once_surf.Do(func() { surf = New(cookieJar) })
		return surf.Download(req)
	}
	if id == PhantomJsID {
		once_phantom.Do(func() { phantom = NewPhantom(phantomjsFile, tempJsDir, cookieJar) })
		return phantom.Download(req)
	}
	if id == ChromeID {
		once_chrome.Do(func() { chrome = NewChrome(cookieJar) })
		return chrome.Download(req)
	}
	return result.TryErr[*http.Response](errors.New("unknown downloader id"))
}
// DestroyJsFiles removes PhantomJS temporary JS files. It does nothing when
// the PhantomJS downloader has not been created yet or does not provide a
// DestroyJsFiles method.
func DestroyJsFiles() {
	destroyer, ok := phantom.(interface{ DestroyJsFiles() })
	if !ok {
		return
	}
	destroyer.DestroyJsFiles()
}
// Surfer is the core of an HTTP web browser for the crawler: it downloads
// the page described by a Request.
type Surfer interface {
// Download performs the request according to its method:
// GET @param url string, header http.Header, cookies []*http.Cookie
// HEAD @param url string, header http.Header, cookies []*http.Cookie
// POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
// POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
Download(Request) result.Result[*http.Response]
}
================================================
FILE: app/downloader/surfer/util.go
================================================
// Copyright 2015 andeya Author. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package surfer
import (
"io"
"log"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"golang.org/x/net/html/charset"
)
// AutoToUTF8 attempts to transcode response body to UTF-8 when using Surf.
// PhantomJS output is already UTF-8, so no transcoding is needed.
//
// The charset is chosen by charset.NewReader from the body and the
// Content-Type header. On success resp.Body is replaced by a Body that reads
// through the transcoding reader and still closes the original body; on
// failure resp is left untouched and the error is returned.
func AutoToUTF8(resp *http.Response) error {
	utf8Reader, err := charset.NewReader(resp.Body, resp.Header.Get("Content-Type"))
	if err != nil {
		return err
	}
	resp.Body = &Body{
		ReadCloser: resp.Body,
		Reader:     utf8Reader,
	}
	return nil
}
// BodyBytes reads the full response body and closes it, returning whatever
// was read together with any read error.
func BodyBytes(resp *http.Response) ([]byte, error) {
	defer resp.Body.Close()
	return io.ReadAll(resp.Body)
}
// URLEncode parses urlStr and re-encodes its query string, so the query
// parameters come out escaped and sorted by key.
//
// When urlStr cannot be parsed it returns a nil URL and the parse error.
// url.Parse yields a nil *url.URL on failure, so the error is checked before
// the result is touched.
func URLEncode(urlStr string) (*url.URL, error) {
	urlObj, err := url.Parse(urlStr)
	if err != nil {
		return nil, err
	}
	urlObj.RawQuery = urlObj.Query().Encode()
	return urlObj, nil
}
// GetWDPath returns the value of the GOPATH environment variable, which this
// package treats as its working directory. It panics when GOPATH is empty or
// unset.
func GetWDPath() string {
	if gopath := os.Getenv("GOPATH"); gopath != "" {
		return gopath
	}
	panic("GOPATH is not set in env.")
}
// IsDirExists reports whether path can be stat'ed and is a directory.
// When os.Stat fails, the result is os.IsExist applied to that error.
func IsDirExists(path string) bool {
	info, statErr := os.Stat(path)
	if statErr == nil {
		return info.IsDir()
	}
	return os.IsExist(statErr)
}
// IsFileExists reports whether path can be stat'ed and is not a directory.
// When os.Stat fails, the result is os.IsExist applied to that error.
func IsFileExists(path string) bool {
	info, statErr := os.Stat(path)
	if statErr == nil {
		return !info.IsDir()
	}
	return os.IsExist(statErr)
}
// WalkDir walks the tree rooted at targpath and collects the paths of the
// directories it visits (regular files are skipped). A relative targpath is
// made absolute first.
//
// With no suffixes every directory is returned. Otherwise a directory is
// appended once for each suffix its path ends with. If the walk fails, the
// error is logged and the directories gathered so far are returned.
func WalkDir(targpath string, suffixes ...string) (dirlist []string) {
	if !filepath.IsAbs(targpath) {
		targpath, _ = filepath.Abs(targpath)
	}

	visit := func(retpath string, f os.FileInfo, err error) error {
		switch {
		case err != nil:
			return err
		case !f.IsDir():
			return nil
		case len(suffixes) == 0:
			dirlist = append(dirlist, retpath)
			return nil
		}
		for i := range suffixes {
			if strings.HasSuffix(retpath, suffixes[i]) {
				dirlist = append(dirlist, retpath)
			}
		}
		return nil
	}

	if walkErr := filepath.Walk(targpath, visit); walkErr != nil {
		log.Printf("utils.WalkDir: %v\n", walkErr)
	}
	return dirlist
}
// ExtractHomepage returns everything in rawURL up to (but excluding) the
// first "/" that follows "://", e.g.
// "https://www.baidu.com/s?wd=go" → "https://www.baidu.com".
// It returns "" when rawURL contains no "://", and rawURL unchanged when
// nothing follows the host part.
func ExtractHomepage(rawURL string) string {
	scheme, rest, hasScheme := strings.Cut(rawURL, "://")
	if !hasScheme {
		return ""
	}
	host, _, hasPath := strings.Cut(rest, "/")
	if !hasPath {
		return rawURL
	}
	return scheme + "://" + host
}
// Body pairs the original response body (kept for Close) with a separate
// Reader that serves the content, e.g. a transcoding reader.
type Body struct {
	io.ReadCloser
	io.Reader
}

// Read reads from the embedded Reader. Both embedded interfaces declare Read,
// so this explicit method selects which one is used.
func (b *Body) Read(p []byte) (n int, err error) {
	n, err = b.Reader.Read(p)
	return n, err
}
================================================
FILE: app/downloader/surfer/util_test.go
================================================
package surfer
import (
"bytes"
"io"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
)
// TestURLEncode checks that URLEncode parses valid URLs and rewrites the
// query string into its canonical encoded form, including the case where the
// URL has no query at all (RawQuery must then be empty).
func TestURLEncode(t *testing.T) {
	tests := []struct {
		url   string
		wantQ string
	}{
		{"http://example.com", ""},
		{"http://example.com?a=1&b=2", "a=1&b=2"},
		{"http://example.com?x=hello world", "x=hello+world"},
	}
	for _, tt := range tests {
		t.Run(tt.url, func(t *testing.T) {
			u, err := URLEncode(tt.url)
			if err != nil {
				t.Fatalf("URLEncode err: %v", err)
			}
			// Always compare, so an unexpected non-empty query is caught too.
			if u.RawQuery != tt.wantQ {
				t.Errorf("RawQuery = %q, want %q", u.RawQuery, tt.wantQ)
			}
		})
	}
}
// TestBodyBytes verifies that BodyBytes returns the full body content.
func TestBodyBytes(t *testing.T) {
	want := []byte("test body")
	got, err := BodyBytes(&http.Response{
		Body: io.NopCloser(bytes.NewReader(want)),
	})
	if err != nil {
		t.Fatalf("BodyBytes err: %v", err)
	}
	if !bytes.Equal(got, want) {
		t.Errorf("BodyBytes = %q, want %q", got, want)
	}
}
// TestAutoToUTF8 runs AutoToUTF8 against a local server that declares a GBK
// charset. A transcoding error is only logged, not treated as a failure.
func TestAutoToUTF8(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html; charset=gbk")
		w.Write([]byte("hello"))
	}))
	defer srv.Close()

	resp, err := http.Get(srv.URL)
	if err != nil {
		t.Fatalf("http.Get err: %v", err)
	}
	defer resp.Body.Close()

	if convErr := AutoToUTF8(resp); convErr != nil {
		t.Logf("AutoToUTF8 err (charset may be unsupported): %v", convErr)
	}
}
// TestBodyRead verifies that Body.Read fills the buffer from its Reader.
func TestBodyRead(t *testing.T) {
	src := strings.NewReader("abc")
	body := &Body{ReadCloser: io.NopCloser(src), Reader: src}

	buf := make([]byte, 2)
	n, err := body.Read(buf)
	switch err {
	case nil, io.EOF:
		// acceptable outcomes
	default:
		t.Fatalf("Read err: %v", err)
	}
	if n != 2 || string(buf) != "ab" {
		t.Errorf("Read = %d, %q", n, buf)
	}
}
// TestIsDirExists checks IsDirExists against an existing directory, a
// missing path and a regular file.
func TestIsDirExists(t *testing.T) {
	type testCase struct {
		path string
		want bool
	}
	cases := []testCase{
		{path: os.TempDir(), want: true},
		{path: "/nonexistent/path/12345", want: false},
		{path: "util_test.go", want: false},
	}
	for i := range cases {
		tc := cases[i]
		if got := IsDirExists(tc.path); got != tc.want {
			t.Errorf("IsDirExists(%q) = %v, want %v", tc.path, got, tc.want)
		}
	}
}
// TestIsFileExists checks IsFileExists against a regular file, a directory
// and a missing path.
func TestIsFileExists(t *testing.T) {
	type testCase struct {
		path string
		want bool
	}
	cases := []testCase{
		{path: "util_test.go", want: true},
		{path: os.TempDir(), want: false},
		{path: "/nonexistent/file", want: false},
	}
	for i := range cases {
		tc := cases[i]
		if got := IsFileExists(tc.path); got != tc.want {
			t.Errorf("IsFileExists(%q) = %v, want %v", tc.path, got, tc.want)
		}
	}
}
// TestWalkDir builds a small tree (two directories and one file) and checks
// that WalkDir lists directories only, with and without a suffix filter.
func TestWalkDir(t *testing.T) {
	td := t.TempDir()
	// Fail fast on fixture errors so a broken setup is not misreported as a
	// WalkDir bug.
	for _, name := range []string{"a", "b"} {
		if err := os.MkdirAll(filepath.Join(td, name), 0755); err != nil {
			t.Fatalf("MkdirAll(%q): %v", name, err)
		}
	}
	if err := os.WriteFile(filepath.Join(td, "f.txt"), nil, 0644); err != nil {
		t.Fatalf("WriteFile: %v", err)
	}

	dirs := WalkDir(td)
	if len(dirs) < 2 {
		t.Errorf("WalkDir len = %d, want >= 2", len(dirs))
	}
	dirsSuffix := WalkDir(td, "a")
	if len(dirsSuffix) != 1 {
		t.Errorf("WalkDir with suffix len = %d, want 1", len(dirsSuffix))
	}
}
================================================
FILE: app/pipeline/collector/collector.go
================================================
// Package collector implements result collection and output.
package collector
import (
"runtime/debug"
"sync"
"sync/atomic"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector/data"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/cache"
)
// Collector collects spider results and writes them to the configured output backend.
type Collector struct {
// Spider is the embedded spider whose results are being collected.
*spider.Spider
// DataChan receives text data cells; NewCollector buffers it with capacity batchCap.
DataChan chan data.DataCell
// FileChan receives file cells; NewCollector buffers it with capacity batchCap.
FileChan chan data.FileCell
// dataBuf accumulates data cells until batchCap is reached, then is flushed by outputData.
dataBuf []data.DataCell
// outType is the key used to look up the backend in DataOutput.
outType string
// batchCap is the number of buffered cells that triggers an output batch (at least 1).
batchCap int
// dataBatch counts data output batches; it is incremented before each outputData call in Start.
dataBatch uint64
// fileBatch counts received file cells; it is updated atomically.
fileBatch uint64
// wait tracks in-flight outputFile goroutines.
wait sync.WaitGroup
// sum holds running totals: [0] data total before the last addition, [1] data total,
// [2] file total before the last addition, [3] file total.
sum [4]uint64
// dataSumLock guards sum[0] and sum[1] in dataSum/addDataSum.
dataSumLock sync.RWMutex
// fileSumLock guards sum[2] and sum[3] in fileSum/addFileSum.
fileSumLock sync.RWMutex
}
// NewCollector builds a Collector for sp that writes through the outType
// backend. batchCap sets both the channel buffer sizes and the batch size;
// values below 1 are raised to 1.
func NewCollector(sp *spider.Spider, outType string, batchCap int) *Collector {
	capacity := batchCap
	if capacity < 1 {
		capacity = 1
	}

	c := new(Collector)
	c.Spider = sp
	c.outType = outType
	c.batchCap = capacity
	c.DataChan = make(chan data.DataCell, capacity)
	c.FileChan = make(chan data.FileCell, capacity)
	c.dataBuf = make([]data.DataCell, 0, capacity)
	return c
}
// CollectData pushes dataCell onto DataChan. If the send panics (for example
// because the channel has been closed by Stop), the panic is logged and an
// error result is returned instead.
func (c *Collector) CollectData(dataCell data.DataCell) (r result.VoidResult) {
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		logs.Log().Error("panic recovered: %v\n%s", rec, debug.Stack())
		r = result.FmtErrVoid("output goroutine has terminated")
	}()

	c.DataChan <- dataCell
	r = result.OkVoid()
	return r
}
// CollectFile pushes fileCell onto FileChan. If the send panics (for example
// because the channel has been closed by Stop), the panic is logged and an
// error result is returned instead.
func (c *Collector) CollectFile(fileCell data.FileCell) (r result.VoidResult) {
	defer func() {
		rec := recover()
		if rec == nil {
			return
		}
		logs.Log().Error("panic recovered: %v\n%s", rec, debug.Stack())
		r = result.FmtErrVoid("output goroutine has terminated")
	}()

	c.FileChan <- fileCell
	r = result.OkVoid()
	return r
}
// Stop closes DataChan and FileChan, each from its own goroutine. A panic
// raised by close (such as closing an already-closed channel) is recovered
// and logged rather than propagated.
func (c *Collector) Stop() {
	// logPanic is deferred directly, so its recover call is effective.
	logPanic := func() {
		if p := recover(); p != nil {
			logs.Log().Error("panic recovered: %v\n%s", p, debug.Stack())
		}
	}

	go func() {
		defer logPanic()
		close(c.DataChan)
	}()
	go func() {
		defer logPanic()
		close(c.FileChan)
	}()
}
// Start launches the collection pipeline in the background and returns
// immediately.
//
// One worker drains DataChan into dataBuf and flushes a batch through
// outputData whenever batchCap cells are buffered, plus a final flush once
// the channel is closed. A second worker drains FileChan and writes each
// file cell in its own goroutine. After both workers finish and all file
// writers are done, Report is called.
func (c *Collector) Start() {
	go func() {
		dataStop := make(chan bool)
		fileStop := make(chan bool)

		go func() {
			// Deferred first so it runs last: the stop signal is sent even
			// when the loop panics and is recovered below. Otherwise the
			// supervisor would block forever and Report would never be sent.
			defer close(dataStop)
			defer func() {
				if p := recover(); p != nil {
					logs.Log().Error("panic recovered: %v\n%s", p, debug.Stack())
				}
			}()
			for cell := range c.DataChan {
				c.dataBuf = append(c.dataBuf, cell)
				if len(c.dataBuf) < c.batchCap {
					continue
				}
				c.dataBatch++
				c.outputData()
			}
			// Channel closed: flush whatever is left in the buffer.
			c.dataBatch++
			c.outputData()
		}()

		go func() {
			// Same guarantee as above for the file worker.
			defer close(fileStop)
			defer func() {
				if p := recover(); p != nil {
					logs.Log().Error("panic recovered: %v\n%s", p, debug.Stack())
				}
			}()
			for file := range c.FileChan {
				atomic.AddUint64(&c.fileBatch, 1)
				c.wait.Add(1)
				go c.outputFile(file)
			}
		}()

		<-dataStop
		<-fileStop
		c.wait.Wait()
		c.Report()
	}()
}
// resetDataBuf hands every buffered cell back to the data cell pool and
// truncates dataBuf to length zero, keeping its capacity for reuse.
func (c *Collector) resetDataBuf() {
	for i := range c.dataBuf {
		data.PutDataCell(c.dataBuf[i])
	}
	c.dataBuf = c.dataBuf[:0]
}
// dataSum reads the running total of text records (sum[1]) under the read lock.
func (c *Collector) dataSum() uint64 {
	c.dataSumLock.RLock()
	total := c.sum[1]
	c.dataSumLock.RUnlock()
	return total
}
// addDataSum adds add to the text record total. The previous total is kept
// in sum[0] and the new total is stored in sum[1].
func (c *Collector) addDataSum(add uint64) {
	c.dataSumLock.Lock()
	defer c.dataSumLock.Unlock()

	prev := c.sum[1]
	c.sum[0] = prev
	c.sum[1] = prev + add
}
// fileSum reads the running total of written files (sum[3]) under the read lock.
func (c *Collector) fileSum() uint64 {
	c.fileSumLock.RLock()
	total := c.sum[3]
	c.fileSumLock.RUnlock()
	return total
}
// addFileSum adds add to the file total. The previous total is kept in
// sum[2] and the new total is stored in sum[3].
func (c *Collector) addFileSum(add uint64) {
	c.fileSumLock.Lock()
	defer c.fileSumLock.Unlock()

	prev := c.sum[3]
	c.sum[2] = prev
	c.sum[3] = prev + add
}
// Report builds a cache.Report with the spider name, keyin, record and file
// totals, and the time elapsed since cache.StartTime, then sends it on
// cache.ReportChan.
func (c *Collector) Report() {
	report := &cache.Report{
		SpiderName: c.Spider.GetName(),
		Keyin:      c.GetKeyin(),
		DataNum:    c.dataSum(),
		FileNum:    c.fileSum(),
		Time:       time.Since(cache.StartTime),
	}
	cache.ReportChan <- report
}
================================================
FILE: app/pipeline/collector/collector_test.go
================================================
package collector
import (
"os"
"path/filepath"
"testing"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector/data"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/runtime/cache"
)
// TestNewCollector checks that NewCollector sizes both channels to batchCap
// and raises a zero batchCap to 1.
func TestNewCollector(t *testing.T) {
sp := &spider.Spider{
Name: "TestSpider",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
tests := []struct {
name string
batchCap int
wantCap int
}{
{"normal", 10, 10},
{"one", 1, 1},
{"zero", 0, 1},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
c := NewCollector(sp, "csv", tt.batchCap)
if c == nil {
t.Fatal("NewCollector returned nil")
}
if cap(c.DataChan) != tt.wantCap {
t.Errorf("DataChan cap = %d, want %d", cap(c.DataChan), tt.wantCap)
}
if cap(c.FileChan) != tt.wantCap {
t.Errorf("FileChan cap = %d, want %d", cap(c.FileChan), tt.wantCap)
}
})
}
}
// TestCollector_CollectData starts a collector and checks that CollectData
// accepts a single data cell without returning an error.
func TestCollector_CollectData(t *testing.T) {
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 10)
c.Start()
defer c.Stop()
cell := data.GetDataCell("r1", map[string]interface{}{"a": "b"}, "u", "pu", "dt")
r := c.CollectData(cell)
if r.IsErr() {
t.Errorf("CollectData: %v", r.UnwrapErr())
}
}
// TestCollector_CollectFile starts a collector and checks that CollectFile
// accepts a single file cell without returning an error.
func TestCollector_CollectFile(t *testing.T) {
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 10)
c.Start()
defer c.Stop()
cell := data.GetFileCell("r1", "test.txt", []byte("hello"))
r := c.CollectFile(cell)
if r.IsErr() {
t.Errorf("CollectFile: %v", r.UnwrapErr())
}
}
// TestCollector_Stop is a smoke test: Start followed by Stop must not panic.
// It makes no assertions about the resulting state.
func TestCollector_Stop(t *testing.T) {
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 10)
c.Start()
c.Stop()
}
// TestCollector_OutputData_EmptyBuf is a smoke test: outputData on a
// collector with an empty buffer must return without panicking.
func TestCollector_OutputData_EmptyBuf(t *testing.T) {
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 1)
c.outputData()
}
// TestCollector_OutputData_PanicRecovery temporarily replaces the "csv"
// backend with one that panics and checks that outputData does not let the
// panic escape. The original backend is restored on exit.
func TestCollector_OutputData_PanicRecovery(t *testing.T) {
oldFn := DataOutput["csv"]
DataOutput["csv"] = func(*Collector) result.VoidResult { panic("test panic") }
defer func() { DataOutput["csv"] = oldFn }()
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{"r1": {ItemFields: []string{"f1"}}}},
}
c := NewCollector(sp, "csv", 1)
c.dataBuf = []data.DataCell{
data.GetDataCell("r1", map[string]interface{}{"f1": "v1"}, "u", "pu", "dt"),
}
c.dataBatch = 1
c.addDataSum(1)
c.outputData()
}
// TestCollector_OutputData_ErrorResult temporarily replaces the "csv"
// backend with one that returns an error result and runs outputData through
// its failure branch. The original backend is restored on exit.
func TestCollector_OutputData_ErrorResult(t *testing.T) {
oldFn := DataOutput["csv"]
DataOutput["csv"] = func(*Collector) result.VoidResult { return result.FmtErrVoid("test error") }
defer func() { DataOutput["csv"] = oldFn }()
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{"r1": {ItemFields: []string{"f1"}}}},
}
c := NewCollector(sp, "csv", 1)
c.dataBuf = []data.DataCell{
data.GetDataCell("r1", map[string]interface{}{"f1": "v1"}, "u", "pu", "dt"),
}
c.dataBatch = 1
c.addDataSum(1)
c.outputData()
}
// TestCollector_OutputCSV points TextDir at a temp directory, fills the
// buffer with two cells, calls the "csv" backend directly and checks that a
// .csv file with at least 10 bytes of content was written.
// NOTE(review): the global config and cache values set here are not restored
// afterwards, and the goroutine draining cache.ReportChan never exits.
func TestCollector_OutputCSV(t *testing.T) {
tmp := t.TempDir()
_ = config.Conf()
conf := config.Conf()
conf.TextDir = tmp
cache.StartTime = time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC)
if cache.Task == nil {
cache.Task = &cache.AppConf{}
}
cache.Task.OutType = "csv"
cache.Task.Mode = 0
cache.Task.SuccessInherit = false
// Drain reports so that senders on cache.ReportChan do not block.
go func() {
for range cache.ReportChan {
}
}()
sp := &spider.Spider{
Name: "CSVSpider",
Keyin: "",
RuleTree: &spider.RuleTree{
Trunk: map[string]*spider.Rule{
"list": {ItemFields: []string{"title", "url"}},
},
},
}
sp.ReqmatrixInit()
c := NewCollector(sp, "csv", 2)
c.dataBuf = []data.DataCell{
data.GetDataCell("list", map[string]interface{}{"title": "t1", "url": "u1"}, "http://a.com", "http://p.com", "2024-01-15"),
data.GetDataCell("list", map[string]interface{}{"title": "t2", "url": "u2"}, "http://b.com", "http://p.com", "2024-01-15"),
}
c.dataBatch = 1
c.addDataSum(2)
DataOutput["csv"](c)
// Collect every .csv file produced under the temp directory.
var matches []string
filepath.WalkDir(tmp, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && filepath.Ext(path) == ".csv" {
matches = append(matches, path)
}
return nil
})
if len(matches) == 0 {
t.Fatal("no CSV file created")
}
content, err := readFile(matches[0])
if err != nil {
t.Fatalf("read CSV: %v", err)
}
if len(content) < 10 {
t.Errorf("CSV content too short: %q", content)
}
}
// TestCollector_OutputCSV_NonStringData feeds the "csv" backend a cell whose
// field values are an int and a float and checks that a .csv file is still
// produced.
func TestCollector_OutputCSV_NonStringData(t *testing.T) {
tmp := t.TempDir()
conf := config.Conf()
conf.TextDir = tmp
cache.StartTime = time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC)
if cache.Task == nil {
cache.Task = &cache.AppConf{}
}
sp := &spider.Spider{
Name: "CSVSpider2",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{"r1": {ItemFields: []string{"n", "v"}}}},
}
c := NewCollector(sp, "csv", 1)
c.dataBuf = []data.DataCell{
data.GetDataCell("r1", map[string]interface{}{"n": 123, "v": 3.14}, "u", "pu", "dt"),
}
c.dataBatch = 1
c.addDataSum(1)
DataOutput["csv"](c)
// Collect every .csv file produced under the temp directory.
var matches []string
filepath.WalkDir(tmp, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && filepath.Ext(path) == ".csv" {
matches = append(matches, path)
}
return nil
})
if len(matches) == 0 {
t.Fatal("no CSV file created")
}
}
// TestCollector_OutputCSV_NotDefaultField runs the "csv" backend for a spider
// with NotDefaultField set and checks that a .csv file is produced.
func TestCollector_OutputCSV_NotDefaultField(t *testing.T) {
tmp := t.TempDir()
conf := config.Conf()
conf.TextDir = tmp
cache.StartTime = time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC)
if cache.Task == nil {
cache.Task = &cache.AppConf{}
}
sp := &spider.Spider{
Name: "CSVSpider3",
NotDefaultField: true,
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{"r1": {ItemFields: []string{"x"}}}},
}
c := NewCollector(sp, "csv", 1)
c.dataBuf = []data.DataCell{
data.GetDataCell("r1", map[string]interface{}{"x": "y"}, "u", "pu", "dt"),
}
c.dataBatch = 1
c.addDataSum(1)
DataOutput["csv"](c)
// Collect every .csv file produced under the temp directory.
var matches []string
filepath.WalkDir(tmp, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && filepath.Ext(path) == ".csv" {
matches = append(matches, path)
}
return nil
})
if len(matches) == 0 {
t.Fatal("no CSV file created")
}
}
// TestCollector_OutputExcel points TextDir at a temp directory, calls the
// "excel" backend directly with two cells (string and numeric values) and
// checks that an .xlsx file is produced.
func TestCollector_OutputExcel(t *testing.T) {
tmp := t.TempDir()
_ = config.Conf()
conf := config.Conf()
conf.TextDir = tmp
cache.StartTime = time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC)
if cache.Task == nil {
cache.Task = &cache.AppConf{}
}
cache.Task.OutType = "excel"
sp := &spider.Spider{
Name: "ExcelSpider",
RuleTree: &spider.RuleTree{
Trunk: map[string]*spider.Rule{
"sheet1": {ItemFields: []string{"col1", "col2"}},
},
},
}
c := NewCollector(sp, "excel", 2)
c.dataBuf = []data.DataCell{
data.GetDataCell("sheet1", map[string]interface{}{"col1": "v1", "col2": "v2"}, "u", "pu", "dt"),
data.GetDataCell("sheet1", map[string]interface{}{"col1": 99, "col2": 1.5}, "u2", "pu2", "dt2"),
}
c.dataBatch = 1
c.addDataSum(2)
DataOutput["excel"](c)
// Collect every .xlsx file produced under the temp directory.
var excelMatches []string
filepath.WalkDir(tmp, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && filepath.Ext(path) == ".xlsx" {
excelMatches = append(excelMatches, path)
}
return nil
})
if len(excelMatches) == 0 {
t.Fatal("no Excel file created")
}
}
// TestCollector_OutputFile points FileDir at a temp directory, writes one
// file cell through outputFile and checks that file.txt exists with the
// expected content.
func TestCollector_OutputFile(t *testing.T) {
tmp := t.TempDir()
_ = config.Conf()
conf := config.Conf()
conf.FileDir = tmp
sp := &spider.Spider{
Name: "FileSpider",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 1)
// outputFile calls c.wait.Done(), so balance it with an Add first.
c.wait.Add(1)
fc := data.GetFileCell("r1", "subdir/file.txt", []byte("file content"))
c.outputFile(fc)
// Locate file.txt wherever it was written under the temp directory.
var filePath string
filepath.WalkDir(tmp, func(path string, d os.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() && filepath.Base(path) == "file.txt" {
filePath = path
}
return nil
})
if filePath == "" {
t.Fatal("no file.txt created")
}
content, err := readFile(filePath)
if err != nil {
t.Fatalf("read file: %v", err)
}
if content != "file content" {
t.Errorf("content = %q, want %q", content, "file content")
}
}
// readFile returns the content of the file at path as a string, or an empty
// string and the error when the file cannot be read.
func readFile(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err == nil {
		return string(raw), nil
	}
	return "", err
}
// TestCollector_OutputFile_MkdirFail sets FileDir to a regular file so that
// directory creation under it fails, and checks that outputFile returns
// without panicking. FileDir is restored on exit.
func TestCollector_OutputFile_MkdirFail(t *testing.T) {
tmp := t.TempDir()
blocker := filepath.Join(tmp, "blocker")
if err := os.WriteFile(blocker, []byte("x"), 0644); err != nil {
t.Fatal(err)
}
conf := config.Conf()
oldFileDir := conf.FileDir
conf.FileDir = blocker
defer func() { conf.FileDir = oldFileDir }()
sp := &spider.Spider{
Name: "S",
RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
}
c := NewCollector(sp, "csv", 1)
// outputFile calls c.wait.Done(), so balance it with an Add first.
c.wait.Add(1)
fc := data.GetFileCell("r1", "x/file.txt", []byte("content"))
c.outputFile(fc)
}
================================================
FILE: app/pipeline/collector/data/data.go
================================================
// Package data provides storage structure definitions for data and file cells.
package data
import (
"sync"
)
// Map keys shared by DataCell and FileCell entries.
const (
// FieldRuleName is the key holding the name of the rule that produced the cell.
FieldRuleName = "RuleName"
// FieldURL is the key holding the url passed to GetDataCell.
FieldURL = "Url"
// FieldParentURL is the key holding the parentURL passed to GetDataCell.
FieldParentURL = "ParentUrl"
// FieldDownloadTime is the key holding the downloadTime passed to GetDataCell.
FieldDownloadTime = "DownloadTime"
)
type (
// DataCell is a storage unit for text data.
// GetDataCell fills the keys RuleName, Data, Url, ParentUrl and DownloadTime.
DataCell map[string]interface{}
// FileCell is a storage unit for file data.
// GetFileCell fills the keys RuleName, Name and Bytes.
// Stored path format: file/"Dir"/"RuleName"/"time"/"Name"
FileCell map[string]interface{}
)
var (
// dataCellPool recycles DataCell maps; New yields an empty, non-nil map.
dataCellPool = &sync.Pool{
New: func() interface{} {
return DataCell{}
},
}
// fileCellPool recycles FileCell maps; New yields an empty, non-nil map.
fileCellPool = &sync.Pool{
New: func() interface{} {
return FileCell{}
},
}
)
// GetDataCell takes a DataCell from the pool and fills in the rule name,
// the payload (under "Data"), the page URL, its parent URL and the download
// time. Return the cell with PutDataCell when it is no longer needed.
func GetDataCell(ruleName string, data map[string]interface{}, url string, parentURL string, downloadTime string) DataCell {
	dc := dataCellPool.Get().(DataCell)
	dc["Data"] = data
	dc[FieldRuleName] = ruleName
	dc[FieldURL], dc[FieldParentURL] = url, parentURL
	dc[FieldDownloadTime] = downloadTime
	return dc
}
// GetFileCell takes a FileCell from the pool and fills in the rule name, the
// file name (under "Name") and the content (under "Bytes"). Return the cell
// with PutFileCell when it is no longer needed.
func GetFileCell(ruleName, name string, bytes []byte) FileCell {
	fc := fileCellPool.Get().(FileCell)
	fc["Bytes"] = bytes
	fc["Name"] = name
	fc[FieldRuleName] = ruleName
	return fc
}
// PutDataCell sets the five standard entries of cell to nil (the keys stay
// in the map) and hands the cell back to the pool.
func PutDataCell(cell DataCell) {
	for _, key := range []string{
		FieldRuleName,
		"Data",
		FieldURL,
		FieldParentURL,
		FieldDownloadTime,
	} {
		cell[key] = nil
	}
	dataCellPool.Put(cell)
}
// PutFileCell sets the three standard entries of cell to nil (the keys stay
// in the map) and hands the cell back to the pool.
func PutFileCell(cell FileCell) {
	for _, key := range []string{FieldRuleName, "Name", "Bytes"} {
		cell[key] = nil
	}
	fileCellPool.Put(cell)
}
================================================
FILE: app/pipeline/collector/data/data_test.go
================================================
package data
import (
"testing"
)
// TestGetDataCell verifies that every argument of GetDataCell ends up under
// the expected key.
func TestGetDataCell(t *testing.T) {
	payload := map[string]interface{}{"key": "value"}
	cell := GetDataCell("rule1", payload, "http://example.com", "http://parent.com", "2024-01-01")

	if got := cell[FieldRuleName]; got != "rule1" {
		t.Errorf("RuleName = %v, want %q", got, "rule1")
	}
	if got := cell[FieldURL]; got != "http://example.com" {
		t.Errorf("Url = %v, want %q", got, "http://example.com")
	}
	if got := cell[FieldParentURL]; got != "http://parent.com" {
		t.Errorf("ParentUrl = %v, want %q", got, "http://parent.com")
	}
	if got := cell[FieldDownloadTime]; got != "2024-01-01" {
		t.Errorf("DownloadTime = %v, want %q", got, "2024-01-01")
	}

	stored := cell["Data"].(map[string]interface{})
	if stored["key"] != "value" {
		t.Errorf("Data[key] = %v, want %q", stored["key"], "value")
	}
}
// TestGetFileCell verifies that every argument of GetFileCell ends up under
// the expected key.
func TestGetFileCell(t *testing.T) {
	cell := GetFileCell("rule2", "test.txt", []byte("hello world"))

	if got := cell[FieldRuleName]; got != "rule2" {
		t.Errorf("RuleName = %v, want %q", got, "rule2")
	}
	if got := cell["Name"]; got != "test.txt" {
		t.Errorf("Name = %v, want %q", got, "test.txt")
	}
	if content := cell["Bytes"].([]byte); string(content) != "hello world" {
		t.Errorf("Bytes = %v, want %q", cell["Bytes"], "hello world")
	}
}
// TestPutDataCell verifies that PutDataCell clears the cell's entries.
func TestPutDataCell(t *testing.T) {
	cell := GetDataCell("r", nil, "", "", "")
	PutDataCell(cell)

	if v := cell[FieldRuleName]; v != nil {
		t.Error("RuleName should be nil after Put")
	}
	if v := cell["Data"]; v != nil {
		t.Error("Data should be nil after Put")
	}
}
// TestPutFileCell verifies that PutFileCell clears the cell's entries.
func TestPutFileCell(t *testing.T) {
	cell := GetFileCell("r", "f", []byte{1})
	PutFileCell(cell)

	if v := cell[FieldRuleName]; v != nil {
		t.Error("RuleName should be nil after Put")
	}
	if v := cell["Name"]; v != nil {
		t.Error("Name should be nil after Put")
	}
	if v := cell["Bytes"]; v != nil {
		t.Error("Bytes should be nil after Put")
	}
}
// TestPoolReuseDataCell verifies that a cell obtained after a Put carries
// the new rule name rather than stale data.
func TestPoolReuseDataCell(t *testing.T) {
	first := GetDataCell("a", nil, "", "", "")
	PutDataCell(first)

	second := GetDataCell("b", nil, "", "", "")
	if got := second[FieldRuleName]; got != "b" {
		t.Errorf("reused cell RuleName = %v, want %q", got, "b")
	}
}
================================================
FILE: app/pipeline/collector/output_beanstalkd.go
================================================
//go:build !coverage
package collector
import (
"encoding/json"
"fmt"
"net/url"
"time"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/beanstalkd"
"github.com/andeya/pholcus/common/util"
)
// --- Beanstalkd Output ---
// init registers the "beanstalkd" backend in DataOutput.
// The backend sends each buffered cell as one message with the form fields
// "createtime", "type" and "content" (the cell's fields as JSON).
func init() {
DataOutput["beanstalkd"] = func(col *Collector) (r result.VoidResult) {
// Unwrap calls below panic on error; r.Catch presumably converts that panic
// into the returned error result — confirm against the gust result package.
defer r.Catch()
client := beanstalkd.New().Unwrap()
defer client.Close()
namespace := fmt.Sprintf("%v__%v-%v", util.FileNameReplace(col.namespace()), col.sum[0], col.sum[1])
createtime := fmt.Sprintf("%d", time.Now().Unix())
for _, datacell := range col.dataBuf {
var subNamespace = util.FileNameReplace(col.subNamespace(datacell))
tmp := make(map[string]interface{})
for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
vd := datacell["Data"].(map[string]interface{})
// Strings (and missing/nil values, as "") are stored as-is; anything else is JSON-encoded.
if v, ok := vd[title].(string); ok || vd[title] == nil {
tmp[title] = v
} else {
tmp[title] = util.JSONString(vd[title])
}
}
if col.Spider.OutDefaultField() {
tmp["Url"] = datacell["Url"].(string)
tmp["ParentUrl"] = datacell["ParentUrl"].(string)
tmp["DownloadTime"] = datacell["DownloadTime"].(string)
}
data := url.Values{}
res, err := json.Marshal(tmp)
result.RetVoid(err).Unwrap()
data.Add("createtime", createtime)
data.Add("type", namespace+"__"+subNamespace)
data.Add("content", string(res))
client.Send(data).Unwrap()
}
return result.OkVoid()
}
}
================================================
FILE: app/pipeline/collector/output_beanstalkd_stub.go
================================================
//go:build coverage
package collector
import (
"github.com/andeya/gust/result"
)
func init() {
DataOutput["beanstalkd"] = func(*Collector) result.VoidResult { return result.OkVoid() }
}
================================================
FILE: app/pipeline/collector/output_csv.go
================================================
package collector
import (
"encoding/csv"
"fmt"
"os"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/runtime/cache"
)
// --- CSV Output ---
// init registers the "csv" backend in DataOutput.
//
// The backend writes the buffered cells to one CSV file per sub-namespace
// under TextDir/<start time>/<namespace[/sub-namespace]>/, named
// "<previous total>-<current total>.csv". Each file starts with a UTF-8 BOM
// and a header row of the rule's item fields, followed by the default fields
// (Url, ParentUrl, DownloadTime) when the spider outputs them.
func init() {
	DataOutput["csv"] = func(col *Collector) (r result.VoidResult) {
		// Unwrap calls below panic on error; the deferred Catch is relied on
		// to turn that into the returned result.
		defer r.Catch()
		var (
			namespace = util.FileNameReplace(col.namespace())
			sheets    = make(map[string]*csv.Writer)
		)
		for _, datacell := range col.dataBuf {
			var subNamespace = util.FileNameReplace(col.subNamespace(datacell))
			if _, ok := sheets[subNamespace]; !ok {
				folder := config.Conf().TextDir + "/" + cache.StartTime.Format("2006-01-02 150405") + "/" + joinNamespaces(namespace, subNamespace)
				filename := fmt.Sprintf("%v/%v-%v.csv", folder, col.sum[0], col.sum[1])

				f, err := os.Stat(folder)
				if err != nil || !f.IsDir() {
					result.RetVoid(os.MkdirAll(folder, 0777)).Unwrap()
				}

				file, err := os.Create(filename)
				result.RetVoid(err).Unwrap()
				// Deliberately deferred inside the loop: every opened file is
				// flushed and closed when the whole batch has been written.
				defer func(ns string, f *os.File) {
					if w := sheets[ns]; w != nil {
						w.Flush()
					}
					f.Close()
				}(subNamespace, file)

				file.WriteString("\xEF\xBB\xBF") // UTF-8 BOM
				sheets[subNamespace] = csv.NewWriter(file)

				// Build the header on a private copy. Appending directly to
				// the rule's ItemFields could write into its shared backing
				// array when that slice has spare capacity.
				fields := col.MustGetRule(datacell["RuleName"].(string)).ItemFields
				th := make([]string, len(fields), len(fields)+3)
				copy(th, fields)
				if col.Spider.OutDefaultField() {
					th = append(th, "Url", "ParentUrl", "DownloadTime")
				}
				sheets[subNamespace].Write(th)
			}

			row := []string{}
			for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
				vd := datacell["Data"].(map[string]interface{})
				// Strings (and missing/nil values, as "") are written as-is;
				// anything else is JSON-encoded.
				if v, ok := vd[title].(string); ok || vd[title] == nil {
					row = append(row, v)
				} else {
					row = append(row, util.JSONString(vd[title]))
				}
			}
			if col.Spider.OutDefaultField() {
				row = append(row, datacell["Url"].(string))
				row = append(row, datacell["ParentUrl"].(string))
				row = append(row, datacell["DownloadTime"].(string))
			}
			sheets[subNamespace].Write(row)
		}
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_data.go
================================================
package collector
import (
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/logs"
)
// Refresher is an optional interface that output backends can implement
// to refresh their state (e.g., reconnect) before a new task run.
type Refresher interface {
// Refresh is invoked by RefreshBackend for the output type it was registered under.
Refresh()
}
// Package-level backend registries.
// NOTE(review): no lock is visible around these maps here; presumably they are
// only written during init/registration — confirm before writing concurrently.
var (
// DataOutput maps output type names to their implementation functions.
DataOutput = make(map[string]func(col *Collector) result.VoidResult)
// DataOutputLib lists the names of supported text data output backends.
DataOutputLib []string
// dataRefreshers maps output type names to optional Refresher implementations.
dataRefreshers = make(map[string]Refresher)
)
// outputData writes collected text data to the configured output backend.
// It does nothing but reset the buffer when the buffer is empty. Otherwise it
// adds the batch size to the running total, invokes DataOutput[c.outType] and
// logs success, failure or a recovered panic. On success it also calls
// TryFlushSuccess on the spider. The buffer is always reset on return.
func (c *Collector) outputData() {
// Registered first, so it runs last: cells go back to the pool only after the
// panic handler below has finished.
defer func() {
c.resetDataBuf()
}()
dataLen := uint64(len(c.dataBuf))
if dataLen == 0 {
return
}
// Recover from panics raised by the backend (including a nil entry for an
// unknown outType) and log them instead of crashing the collector.
defer func() {
if p := recover(); p != nil {
logs.Log().Informational(" * ")
logs.Log().App(" * Panic [Data output: %v | KEYIN: %v | Batch: %v] %v records! [ERROR] %v\n",
c.Spider.GetName(), c.Spider.GetKeyin(), c.dataBatch, dataLen, p)
}
}()
// The total is bumped before the backend runs; backends read sum[0] and sum[1].
c.addDataSum(dataLen)
r := DataOutput[c.outType](c)
logs.Log().Informational(" * ")
if r.IsErr() {
logs.Log().App(" * Fail [Data output: %v | KEYIN: %v | Batch: %v] %v records! [ERROR] %v\n",
c.Spider.GetName(), c.Spider.GetKeyin(), c.dataBatch, dataLen, r.UnwrapErr())
} else {
logs.Log().App(" * [Data output: %v | KEYIN: %v | Batch: %v] %v records!\n",
c.Spider.GetName(), c.Spider.GetKeyin(), c.dataBatch, dataLen)
c.Spider.TryFlushSuccess()
}
}
// Register stores outFunc in DataOutput under outType, replacing any
// backend previously registered with that name.
func Register(outType string, outFunc func(col *Collector) result.VoidResult) {
	DataOutput[outType] = outFunc
}
// RegisterRefresher stores r as the Refresher for outType, replacing any
// Refresher previously registered with that name.
func RegisterRefresher(outType string, r Refresher) {
	dataRefreshers[outType] = r
}
// RefreshBackend invokes Refresh on the Refresher registered for outType.
// It is a no-op when none is registered.
func RefreshBackend(outType string) {
	refresher, registered := dataRefreshers[outType]
	if !registered {
		return
	}
	refresher.Refresh()
}
================================================
FILE: app/pipeline/collector/output_data_test.go
================================================
package collector
import (
"testing"
"github.com/andeya/gust/result"
)
// TestRegister verifies that Register adds the backend to DataOutput.
func TestRegister(t *testing.T) {
	const name = "_test_register"
	Register(name, func(*Collector) result.VoidResult {
		return result.OkVoid()
	})
	_, found := DataOutput[name]
	if !found {
		t.Error("_test_register not registered")
	}
}
func TestRegisterRefresher(t *testing.T) {
called := false
RegisterRefresher("_test_refresher", &testRefresher{fn: func() { called = true }})
RefreshBackend("_test_refresher")
if !called {
t.Error("Refresh should have been called")
}
}
// TestRefreshBackend_Unregistered is a smoke test: refreshing an unknown
// output type must not panic.
func TestRefreshBackend_Unregistered(t *testing.T) {
	const unknown = "_nonexistent_type_"
	RefreshBackend(unknown)
}
// testRefresher is a Refresher test double that forwards Refresh to fn.
type testRefresher struct {
	fn func()
}

// Refresh calls fn when one is set and does nothing otherwise.
func (t *testRefresher) Refresh() {
	if t.fn == nil {
		return
	}
	t.fn()
}
================================================
FILE: app/pipeline/collector/output_excel.go
================================================
package collector
import (
"fmt"
"os"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/common/xlsx"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/cache"
)
// --- Excel Output ---
// init registers the "excel" backend in DataOutput.
// The backend builds one workbook per batch with one sheet per sub-namespace
// and saves it as TextDir/<start time>/<namespace>__<previous total>-<current total>.xlsx.
func init() {
DataOutput["excel"] = func(col *Collector) (r result.VoidResult) {
// Unwrap calls below panic on error; r.Catch presumably converts that panic
// into the returned error result — confirm against the gust result package.
defer r.Catch()
var (
file *xlsx.File
row *xlsx.Row
cell *xlsx.Cell
sheets = make(map[string]*xlsx.Sheet)
)
file = xlsx.NewFile()
for _, datacell := range col.dataBuf {
var subNamespace = util.FileNameReplace(col.subNamespace(datacell))
if _, ok := sheets[subNamespace]; !ok {
// NOTE: this r shadows the named result inside the if block only.
r := file.AddSheet(subNamespace)
if r.IsErr() {
// The sheet could not be created: log it and drop this cell.
logs.Log().Error("%v", r.UnwrapErr())
continue
}
sheet := r.Unwrap()
sheets[subNamespace] = sheet
// Header row: the rule's item fields, then the default fields if enabled.
row = sheets[subNamespace].AddRow()
for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
row.AddCell().Value = title
}
if col.Spider.OutDefaultField() {
row.AddCell().Value = "Url"
row.AddCell().Value = "ParentUrl"
row.AddCell().Value = "DownloadTime"
}
}
row = sheets[subNamespace].AddRow()
for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
cell = row.AddCell()
vd := datacell["Data"].(map[string]interface{})
// Strings (and missing/nil values, as "") are stored as-is; anything else is JSON-encoded.
if v, ok := vd[title].(string); ok || vd[title] == nil {
cell.Value = v
} else {
cell.Value = util.JSONString(vd[title])
}
}
if col.Spider.OutDefaultField() {
row.AddCell().Value = datacell["Url"].(string)
row.AddCell().Value = datacell["ParentUrl"].(string)
row.AddCell().Value = datacell["DownloadTime"].(string)
}
}
folder := config.Conf().TextDir + "/" + cache.StartTime.Format("2006-01-02 150405")
filename := fmt.Sprintf("%v/%v__%v-%v.xlsx", folder, util.FileNameReplace(col.namespace()), col.sum[0], col.sum[1])
// Create the output folder when it is missing or is not a directory.
f2, err := os.Stat(folder)
if err != nil || !f2.IsDir() {
result.RetVoid(os.MkdirAll(folder, 0777)).Unwrap()
}
return file.Save(filename)
}
}
================================================
FILE: app/pipeline/collector/output_file.go
================================================
package collector
import (
"bytes"
"io"
"os"
"path/filepath"
"sync/atomic"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector/data"
bytesSize "github.com/andeya/pholcus/common/bytes"
"github.com/andeya/pholcus/common/closer"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
)
// outputFile writes a file cell to disk.
// The target is FileDir/<namespace>/<directory part of Name>/<sanitized base name>.
// Failures are logged and not returned. On success the file total is
// incremented. In every case the cell is returned to the pool and c.wait is
// marked done, so callers must call c.wait.Add(1) beforehand.
func (c *Collector) outputFile(file data.FileCell) {
defer func() {
data.PutFileCell(file)
c.wait.Done()
}()
// Path format: file/"RuleName"/"time"/"Name"
// NOTE(review): only the base name goes through FileNameReplace; the directory
// part of Name is joined as-is after Clean — confirm Name cannot contain "..".
p, n := filepath.Split(filepath.Clean(file["Name"].(string)))
dir := filepath.Join(config.Conf().FileDir, util.FileNameReplace(c.namespace()), p)
fileName := filepath.Join(dir, util.FileNameReplace(n))
// Create the directory when it is missing or is not a directory.
d, err := os.Stat(dir)
if err != nil || !d.IsDir() {
if r := result.RetVoid(os.MkdirAll(dir, 0777)); r.IsErr() {
logs.Log().Error(
" * Fail [File download: %v | KEYIN: %v | Batch: %v] %v [ERROR] %v\n",
c.Spider.GetName(), c.Spider.GetKeyin(), atomic.LoadUint64(&c.fileBatch), fileName, r.UnwrapErr(),
)
return
}
}
// Create file with 0777 if not exists, truncate if exists
f, err := os.OpenFile(fileName, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0777)
if err != nil {
logs.Log().Error(
" * Fail [File download: %v | KEYIN: %v | Batch: %v] %v [ERROR] %v\n",
c.Spider.GetName(), c.Spider.GetKeyin(), atomic.LoadUint64(&c.fileBatch), fileName, err,
)
return
}
defer closer.LogClose(f, logs.Log().Error)
size, err := io.Copy(f, bytes.NewReader(file["Bytes"].([]byte)))
if err != nil {
logs.Log().Error(
" * Fail [File download: %v | KEYIN: %v | Batch: %v] %v (%s) [ERROR] %v\n",
c.Spider.GetName(), c.Spider.GetKeyin(), atomic.LoadUint64(&c.fileBatch), fileName, bytesSize.Format(uint64(size)), err,
)
return
}
c.addFileSum(1)
logs.Log().Informational(" * ")
logs.Log().App(
" * [File download: %v | KEYIN: %v | Batch: %v] %v (%s)\n",
c.Spider.GetName(), c.Spider.GetKeyin(), atomic.LoadUint64(&c.fileBatch), fileName, bytesSize.Format(uint64(size)),
)
logs.Log().Informational(" * ")
}
================================================
FILE: app/pipeline/collector/output_kafka.go
================================================
//go:build !coverage
package collector
import (
"regexp"
"sync"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/kafka"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/logs"
)
// --- Kafka Output ---

// init registers the "kafka" data output backend.
//
// Senders are cached per topic in a map guarded by a read-write mutex and are
// reused across output calls. Each buffered data cell becomes one message
// whose topic is the joined namespace and sub-namespace; topics that do not
// match ^[0-9a-zA-Z_-]+$ are logged and skipped.
// NOTE(review): the deferred r.Catch presumably turns a panic raised by the
// Unwrap calls into the returned result — confirm against gust/result.
func init() {
	var (
		senderCache = map[string]*kafka.KafkaSender{}
		senderMu    sync.RWMutex
	)

	lookupSender := func(topic string) (*kafka.KafkaSender, bool) {
		senderMu.RLock()
		defer senderMu.RUnlock()
		s, found := senderCache[topic]
		return s, found
	}

	storeSender := func(topic string, s *kafka.KafkaSender) {
		senderMu.Lock()
		defer senderMu.Unlock()
		senderCache[topic] = s
	}

	validTopic := regexp.MustCompile("^[0-9a-zA-Z_-]+$")

	DataOutput["kafka"] = func(col *Collector) (r result.VoidResult) {
		defer r.Catch()
		kafka.GetProducer().Unwrap()

		// local avoids taking the shared lock more than once per topic per call.
		local := make(map[string]*kafka.KafkaSender)
		namespace := util.FileNameReplace(col.namespace())

		for _, datacell := range col.dataBuf {
			subNamespace := util.FileNameReplace(col.subNamespace(datacell))
			topicName := joinNamespaces(namespace, subNamespace)
			if !validTopic.MatchString(topicName) {
				logs.Log().Error("topic must match '^[0-9a-zA-Z_-]+$', got: %s", topicName)
				continue
			}

			sender, cached := local[topicName]
			if !cached {
				if shared, found := lookupSender(topicName); found {
					sender = shared
				} else {
					sender = kafka.New()
					sender.SetTopic(topicName)
					storeSender(topicName, sender)
				}
				local[topicName] = sender
			}

			record := make(map[string]interface{})
			for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
				fields := datacell["Data"].(map[string]interface{})
				raw := fields[title]
				// Strings (and missing values, as "") are sent verbatim;
				// everything else is serialized with util.JSONString.
				if text, isString := raw.(string); isString || raw == nil {
					record[title] = text
				} else {
					record[title] = util.JSONString(raw)
				}
			}
			if col.Spider.OutDefaultField() {
				record["url"] = datacell["Url"].(string)
				record["parent_url"] = datacell["ParentUrl"].(string)
				record["download_time"] = datacell["DownloadTime"].(string)
			}
			sender.Push(record).Unwrap()
		}
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_kafka_stub.go
================================================
//go:build coverage
package collector
import (
"github.com/andeya/gust/result"
)
// init registers a no-op "kafka" output for builds made with the coverage tag.
func init() {
	DataOutput["kafka"] = func(_ *Collector) result.VoidResult {
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_mgo.go
================================================
//go:build !coverage
package collector
import (
mgov2 "gopkg.in/mgo.v2"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/mgo"
"github.com/andeya/pholcus/common/pool"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
)
// --- MongoDB Output ---

// init registers the "mgo" data output backend.
//
// The backend refreshes the connection once when mgo.Error reports a problem
// and gives up if the error persists. Data cells are flattened in place (the
// "Data" entries are lifted to the top level, "Data" and "RuleName" are
// removed, and the default fields are removed unless the spider outputs them),
// grouped by sub-namespace and inserted in batches of at most mgo.MaxLen
// documents. Insert errors are logged and not returned.
func init() {
	DataOutput["mgo"] = func(col *Collector) result.VoidResult {
		if mgo.Error() != nil {
			mgo.Refresh()
			if mgo.Error() != nil {
				return result.FmtErrVoid("MongoDB connection failed: %v", mgo.Error())
			}
		}

		return mgo.Call(func(src pool.Src) error {
			database := src.(*mgo.MgoSrc).DB(config.Conf().DBName)
			namespace := util.FileNameReplace(col.namespace())
			collections := make(map[string]*mgov2.Collection)
			pending := make(map[string][]interface{})

			for _, datacell := range col.dataBuf {
				subNamespace := util.FileNameReplace(col.subNamespace(datacell))
				cName := joinNamespaces(namespace, subNamespace)
				if _, known := collections[subNamespace]; !known {
					collections[subNamespace] = database.C(cName)
				}

				for field, value := range datacell["Data"].(map[string]interface{}) {
					datacell[field] = value
				}
				delete(datacell, "Data")
				delete(datacell, "RuleName")
				if !col.Spider.OutDefaultField() {
					for _, key := range []string{"Url", "ParentUrl", "DownloadTime"} {
						delete(datacell, key)
					}
				}
				pending[subNamespace] = append(pending[subNamespace], datacell)
			}

			for name, docs := range pending {
				target := collections[name]
				total := len(docs)
				fullBatches := total / mgo.MaxLen

				for b := 0; b < fullBatches; b++ {
					if insertErr := target.Insert(docs[b*mgo.MaxLen : (b+1)*mgo.MaxLen]...); insertErr != nil {
						logs.Log().Error("%v", insertErr)
					}
				}

				// Insert the remainder that does not fill a whole batch.
				if total%mgo.MaxLen != 0 {
					if insertErr := target.Insert(docs[fullBatches*mgo.MaxLen:]...); insertErr != nil {
						logs.Log().Error("%v", insertErr)
					}
				}
			}
			return nil
		})
	}
}
================================================
FILE: app/pipeline/collector/output_mgo_stub.go
================================================
//go:build coverage
package collector
import (
"github.com/andeya/gust/result"
)
// init registers a no-op "mgo" output for builds made with the coverage tag.
func init() {
	DataOutput["mgo"] = func(_ *Collector) result.VoidResult {
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_mysql.go
================================================
//go:build !coverage
package collector
import (
"sync"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/common/util"
)
// --- MySQL Output ---

// init registers the "mysql" data output backend.
//
// Table definitions are cached per table name in a map guarded by a
// read-write mutex; a cache hit hands out a clone of the cached table. Tables
// seen for the first time are created with one MEDIUMTEXT column per item
// field plus, when the spider outputs default fields, Url, ParentUrl and
// DownloadTime columns. Rows are queued with AutoInsert and flushed per table
// at the end of the call.
// NOTE(review): the deferred r.Catch presumably turns a panic raised by the
// Unwrap calls into the returned result — confirm against gust/result.
func init() {
	var (
		tableCache = map[string]*mysql.Table{}
		tableMu    sync.RWMutex
	)

	cloneCached := func(name string) (*mysql.Table, bool) {
		tableMu.RLock()
		defer tableMu.RUnlock()
		if cached, found := tableCache[name]; found {
			return cached.Clone(), true
		}
		return nil, false
	}

	storeTable := func(name string, tab *mysql.Table) {
		tableMu.Lock()
		defer tableMu.Unlock()
		tableCache[name] = tab
	}

	DataOutput["mysql"] = func(col *Collector) (r result.VoidResult) {
		defer r.Catch()
		_, err := mysql.DB()
		result.RetVoid(err).Unwrap()

		batch := make(map[string]*mysql.Table)
		namespace := util.FileNameReplace(col.namespace())

		for _, datacell := range col.dataBuf {
			subNamespace := util.FileNameReplace(col.subNamespace(datacell))
			tName := joinNamespaces(namespace, subNamespace)

			table, seen := batch[tName]
			if !seen {
				if cached, found := cloneCached(tName); found {
					table = cached
				} else {
					table = mysql.New().Unwrap()
					table.SetTableName(tName)
					for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
						table.AddColumn(title + ` MEDIUMTEXT`)
					}
					if col.Spider.OutDefaultField() {
						table.AddColumn(`Url VARCHAR(255)`, `ParentUrl VARCHAR(255)`, `DownloadTime VARCHAR(50)`)
					}
					table.Create().Unwrap()
					storeTable(tName, table)
				}
				batch[tName] = table
			}

			row := make([]string, 0)
			for _, title := range col.MustGetRule(datacell["RuleName"].(string)).ItemFields {
				fields := datacell["Data"].(map[string]interface{})
				raw := fields[title]
				// Strings (and missing values, as "") are stored verbatim;
				// everything else is serialized with util.JSONString.
				if text, isString := raw.(string); isString || raw == nil {
					row = append(row, text)
				} else {
					row = append(row, util.JSONString(raw))
				}
			}
			if col.Spider.OutDefaultField() {
				row = append(row, datacell["Url"].(string), datacell["ParentUrl"].(string), datacell["DownloadTime"].(string))
			}
			table.AutoInsert(row)
		}

		for _, table := range batch {
			table.FlushInsert().Unwrap()
		}
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_mysql_stub.go
================================================
//go:build coverage
package collector
import (
"github.com/andeya/gust/result"
)
// init registers a no-op "mysql" output for builds made with the coverage tag.
func init() {
	DataOutput["mysql"] = func(_ *Collector) result.VoidResult {
		return result.OkVoid()
	}
}
================================================
FILE: app/pipeline/collector/output_util.go
================================================
package collector
import (
"github.com/andeya/pholcus/logs"
)
// namespace returns the main namespace (relative to the DB name). It does not
// depend on data content. A spider-supplied Namespace function takes
// precedence; otherwise the result is the spider name, followed by "__" and
// the sub-name when the sub-name is not empty.
func (c *Collector) namespace() string {
	if custom := c.Spider.Namespace; custom != nil {
		return custom(c.Spider)
	}
	if sub := c.Spider.GetSubName(); sub != "" {
		return c.Spider.GetName() + "__" + sub
	}
	return c.Spider.GetName()
}
// subNamespace returns the sub-namespace (relative to the table name), which
// may depend on data content. Without a spider-supplied SubNamespace function
// the cell's "RuleName" is used. A panic raised by the custom function is
// logged and swallowed, in which case the empty string is returned.
func (c *Collector) subNamespace(dataCell map[string]interface{}) string {
	custom := c.Spider.SubNamespace
	if custom == nil {
		return dataCell["RuleName"].(string)
	}
	// The guard is installed only for the user-supplied callback.
	defer func() {
		if recovered := recover(); recovered != nil {
			logs.Log().Error("subNamespace: %v", recovered)
		}
	}()
	return custom(c.Spider, dataCell)
}
// joinNamespaces joins the main and sub-namespace with a double underscore.
// When either part is empty the other part is returned unchanged.
func joinNamespaces(namespace, subNamespace string) string {
	switch {
	case namespace == "":
		return subNamespace
	case subNamespace == "":
		return namespace
	default:
		return namespace + "__" + subNamespace
	}
}
================================================
FILE: app/pipeline/collector/output_util_test.go
================================================
package collector
import (
"strings"
"testing"
"github.com/andeya/pholcus/app/spider"
)
// TestJoinNamespaces checks every combination of empty and non-empty parts.
func TestJoinNamespaces(t *testing.T) {
	cases := []struct {
		namespace, subNamespace, want string
	}{
		{"", "", ""},
		{"", "sub", "sub"},
		{"ns", "", "ns"},
		{"ns", "sub", "ns__sub"},
		{"a", "b", "a__b"},
	}
	for _, tc := range cases {
		if got := joinNamespaces(tc.namespace, tc.subNamespace); got != tc.want {
			t.Errorf("joinNamespaces(%q, %q) = %q, want %q", tc.namespace, tc.subNamespace, got, tc.want)
		}
	}
}
// TestCollector_Namespace covers the default namespace (with and without a
// keyin) and a spider-supplied Namespace function. The want field is compared
// only in the custom case; the other cases derive the expectation from the
// spider itself.
func TestCollector_Namespace(t *testing.T) {
	cases := []struct {
		name      string
		keyin     string
		namespace func(*spider.Spider) string
		want      string
	}{
		{"name_only", "", nil, "Spider"},
		{"name_with_keyin", "kw", nil, ""},
		{"custom", "", func(sp *spider.Spider) string { return "custom_ns" }, "custom_ns"},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			sp := &spider.Spider{
				Name:     "Spider",
				Keyin:    tc.keyin,
				RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
			}
			if tc.namespace != nil {
				sp.Namespace = tc.namespace
			}
			got := NewCollector(sp, "csv", 1).namespace()

			switch {
			case tc.namespace != nil:
				if got != tc.want {
					t.Errorf("namespace() = %q, want %q", got, tc.want)
				}
			case tc.keyin == "":
				if got != "Spider" && !strings.HasPrefix(got, "Spider__") {
					t.Errorf("namespace() = %q, want Spider or Spider__", got)
				}
			default:
				sub := sp.GetSubName()
				if len(sub) == 0 || got != "Spider__"+sub {
					t.Errorf("namespace() = %q, want Spider__%s", got, sub)
				}
			}
		})
	}
}
// TestCollector_SubNamespace covers the RuleName default, a custom
// SubNamespace function, and a custom function that panics (expected to
// yield the empty string).
func TestCollector_SubNamespace(t *testing.T) {
	cases := []struct {
		name     string
		subNs    func(*spider.Spider, map[string]interface{}) string
		dataCell map[string]interface{}
		wantRule string
	}{
		{"default", nil, map[string]interface{}{"RuleName": "r1"}, "r1"},
		{"custom", func(sp *spider.Spider, dc map[string]interface{}) string { return "custom" }, map[string]interface{}{"RuleName": "r1"}, "custom"},
		{"panic_recovered", func(sp *spider.Spider, dc map[string]interface{}) string { panic("test") }, map[string]interface{}{"RuleName": "r1"}, ""},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			sp := &spider.Spider{
				Name:     "S",
				RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
			}
			if tc.subNs != nil {
				sp.SubNamespace = tc.subNs
			}
			if got := NewCollector(sp, "csv", 1).subNamespace(tc.dataCell); got != tc.wantRule {
				t.Errorf("subNamespace() = %q, want %q", got, tc.wantRule)
			}
		})
	}
}
================================================
FILE: app/pipeline/output.go
================================================
package pipeline
import (
"sort"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector"
"github.com/andeya/pholcus/common/kafka"
"github.com/andeya/pholcus/common/mgo"
"github.com/andeya/pholcus/common/mysql"
"github.com/andeya/pholcus/runtime/cache"
)
// init fills collector.DataOutputLib with the sorted names of all registered
// outputs and registers refreshers for the mgo, mysql and kafka backends.
func init() {
	for name := range collector.DataOutput {
		collector.DataOutputLib = append(collector.DataOutputLib, name)
	}
	sort.Strings(collector.DataOutputLib)

	backends := []struct {
		name    string
		refresh refresherFunc
	}{
		{"mgo", func() { mgo.Refresh() }},
		{"mysql", func() { mysql.Refresh() }},
		{"kafka", func() { kafka.Refresh() }},
	}
	for _, backend := range backends {
		collector.RegisterRefresher(backend.name, backend.refresh)
	}
}
// refresherFunc lets a plain func() be used wherever a value with a Refresh
// method is required.
type refresherFunc func()

// Refresh invokes the wrapped function.
func (fn refresherFunc) Refresh() {
	fn()
}
// RegisterOutput registers an output backend at the pipeline level.
//
// fn is passed to collector.Register under name, and name is added to
// collector.DataOutputLib, which is kept sorted. Registering a name that is
// already listed (for example to override a built-in backend) does not add a
// second entry, so GetOutputLib never reports duplicates.
func RegisterOutput(name string, fn func(*collector.Collector) result.VoidResult) {
	collector.Register(name, fn)
	for _, existing := range collector.DataOutputLib {
		if existing == name {
			return
		}
	}
	collector.DataOutputLib = append(collector.DataOutputLib, name)
	sort.Strings(collector.DataOutputLib)
}
// GetOutputLib returns collector.DataOutputLib, the list of registered output
// backend names. The slice is returned as is, not copied.
func GetOutputLib() []string {
	names := collector.DataOutputLib
	return names
}
// RefreshOutput refreshes the state of the configured output backend via the registry.
func RefreshOutput() {
collector.RefreshBackend(cache.Task.OutType)
}
================================================
FILE: app/pipeline/pipeline.go
================================================
// Package pipeline provides the data collection and output pipeline.
package pipeline
import (
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector"
"github.com/andeya/pholcus/app/pipeline/collector/data"
"github.com/andeya/pholcus/app/spider"
)
// Pipeline collects spider results and writes them to the configured output.
// New returns the implementation created by collector.NewCollector.
type Pipeline interface {
// Start starts the pipeline.
Start()
// Stop stops the pipeline.
Stop()
// CollectData hands one data cell to the pipeline and reports the outcome.
CollectData(data.DataCell) result.VoidResult
// CollectFile hands one file cell to the pipeline and reports the outcome.
CollectFile(data.FileCell) result.VoidResult
}
// New builds the Pipeline for sp by delegating to collector.NewCollector with
// the given output type and batch capacity.
func New(sp *spider.Spider, outType string, batchCap int) Pipeline {
	var p Pipeline = collector.NewCollector(sp, outType, batchCap)
	return p
}
================================================
FILE: app/pipeline/pipeline_test.go
================================================
package pipeline
import (
"testing"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/app/pipeline/collector"
"github.com/andeya/pholcus/app/pipeline/collector/data"
"github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/runtime/cache"
)
// TestNew verifies that New returns a *collector.Collector whose DataChan
// capacity equals the requested batch capacity, with values below 1 raised
// to 1.
func TestNew(t *testing.T) {
	sp := &spider.Spider{
		Name:     "TestSpider",
		RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}},
	}
	cases := []struct {
		name     string
		outType  string
		batchCap int
	}{
		{"csv", "csv", 10},
		{"excel", "excel", 5},
		{"batch_one", "csv", 1},
		{"batch_zero", "csv", 0},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			p := New(sp, tc.outType, tc.batchCap)
			if p == nil {
				t.Fatal("New returned nil")
			}
			col, isCollector := p.(*collector.Collector)
			if !isCollector {
				t.Fatalf("New returned %T, want *collector.Collector", p)
			}
			wantCap := tc.batchCap
			if wantCap < 1 {
				wantCap = 1
			}
			if gotCap := cap(col.DataChan); gotCap != wantCap {
				t.Errorf("DataChan cap = %d, want %d", gotCap, wantCap)
			}
		})
	}
}
// TestGetOutputLib verifies that the output list is non-empty and sorted.
func TestGetOutputLib(t *testing.T) {
	names := GetOutputLib()
	if len(names) == 0 {
		t.Fatal("GetOutputLib returned empty")
	}
	for idx := 1; idx < len(names); idx++ {
		prev, cur := names[idx-1], names[idx]
		if cur < prev {
			t.Errorf("GetOutputLib not sorted: %q >= %q", prev, cur)
		}
	}
}
// TestRegisterOutput verifies that a newly registered backend appears exactly
// once in the output list and that the list stays sorted.
func TestRegisterOutput(t *testing.T) {
	before := len(GetOutputLib())
	RegisterOutput("_test_output_", func(*collector.Collector) result.VoidResult { return result.OkVoid() })
	names := GetOutputLib()
	if len(names) != before+1 {
		t.Errorf("after RegisterOutput len = %d, want %d", len(names), before+1)
	}

	present := false
	for _, candidate := range names {
		if candidate == "_test_output_" {
			present = true
			break
		}
	}
	if !present {
		t.Error("_test_output_ not in GetOutputLib")
	}

	for idx := 1; idx < len(names); idx++ {
		if names[idx] < names[idx-1] {
			t.Errorf("GetOutputLib not sorted after RegisterOutput")
		}
	}
}
// TestRefreshOutput checks that RefreshOutput runs for the "csv" output type;
// the previous OutType is restored afterwards.
func TestRefreshOutput(t *testing.T) {
	if cache.Task == nil {
		cache.Task = &cache.AppConf{}
	}
	saved := cache.Task.OutType
	defer func() { cache.Task.OutType = saved }()
	cache.Task.OutType = "csv"
	RefreshOutput()
}
// TestRefresherFunc verifies that Refresh invokes the wrapped function.
func TestRefresherFunc(t *testing.T) {
	invoked := false
	var r refresherFunc = func() { invoked = true }
	r.Refresh()
	if !invoked {
		t.Error("Refresh should have been called")
	}
}
// TestPipeline_StartStopCollect starts a csv pipeline, submits one data cell
// and expects CollectData to succeed; the pipeline is stopped on exit.
func TestPipeline_StartStopCollect(t *testing.T) {
	rules := map[string]*spider.Rule{"r1": {ItemFields: []string{"f1"}}}
	sp := &spider.Spider{
		Name:     "PipeSpider",
		RuleTree: &spider.RuleTree{Trunk: rules},
	}
	p := New(sp, "csv", 2)
	p.Start()
	defer p.Stop()

	cell := data.GetDataCell("r1", map[string]interface{}{"f1": "v1"}, "u", "pu", "dt")
	if res := p.CollectData(cell); res.IsErr() {
		t.Errorf("CollectData: %v", res.UnwrapErr())
	}
}
================================================
FILE: app/scheduler/matrix.go
================================================
package scheduler
import (
"runtime/debug"
"sort"
"sync"
"sync/atomic"
"time"
"github.com/andeya/pholcus/app/aid/history"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/cache"
"github.com/andeya/pholcus/runtime/status"
)
// Matrix is the request queue for a single Spider instance.
//
// Requests are kept in one queue per priority: Push appends to the queue of
// the request's priority and Pull takes the first request of the highest
// non-empty priority.
type Matrix struct {
maxPage int64 // page budget kept as a negative number; Push adds 1 per accepted request and rejects requests once it is >= 0
resCount int32 // number of resource slots currently held (see Use and Free); accessed atomically
spiderName string // associated Spider name
reqs map[int][]*request.Request // [priority] queues, default priority 0
priorities []int // known priorities, sorted low to high
history history.HistoryStore // persistent success/failure history
tempHistory map[string]bool // temp record [reqUnique(url+method)]true for requests queued but not yet finished
failures map[string]*request.Request // historical and current failed requests; a nil value marks a request already requeued by CanStop
tempHistoryLock sync.RWMutex // guards tempHistory
failureLock sync.Mutex // guards failures
sync.Mutex // guards reqs and priorities (held by Push, Pull and Len)
}
// newMatrix builds an empty Matrix for the given spider. Outside SERVER mode
// it also loads the success and failure history for the current output type
// and seeds the failure set from the stored failures.
func newMatrix(spiderName, spiderSubName string, maxPage int64) *Matrix {
	m := &Matrix{
		spiderName:  spiderName,
		maxPage:     maxPage,
		reqs:        make(map[int][]*request.Request),
		priorities:  []int{},
		history:     history.New(spiderName, spiderSubName),
		tempHistory: make(map[string]bool),
		failures:    make(map[string]*request.Request),
	}
	if cache.Task.Mode == status.SERVER {
		return m
	}
	m.history.ReadSuccess(cache.Task.OutType, cache.Task.SuccessInherit)
	m.history.ReadFailure(cache.Task.OutType, cache.Task.FailureInherit)
	m.setFailures(m.history.PullFailure())
	return m
}
// Push adds a request to the queue. Concurrency-safe.
//
// The request is dropped when the scheduler is stopped, when the page budget
// is used up (maxPage >= 0), or when it is not reloadable and already known
// from the success history or the temporary history. While the scheduler is
// paused, or while this Matrix holds more resource slots than its average
// share, Push sleeps and re-checks; it keeps the Matrix lock during that
// time. Each accepted request raises maxPage by one.
func (m *Matrix) Push(req *request.Request) {
	m.Lock()
	defer m.Unlock()

	if sched.checkStatus(status.STOP) || m.maxPage >= 0 {
		return
	}

	// Block for as long as the scheduler is paused.
	paused := false
	for sched.checkStatus(status.PAUSE) {
		paused = true
		time.Sleep(time.Second)
	}
	if paused && sched.checkStatus(status.STOP) {
		return
	}

	// Back off while this Matrix uses more than its average share of slots.
	throttled := false
	for atomic.LoadInt32(&m.resCount) > sched.avgRes() {
		throttled = true
		time.Sleep(100 * time.Millisecond)
	}
	if throttled && sched.checkStatus(status.STOP) {
		return
	}

	// Non-reloadable requests are deduplicated against the history.
	if !req.IsReloadable() {
		if m.hasHistory(req.Unique()) {
			return
		}
		m.insertTempHistory(req.Unique())
	}

	priority := req.GetPriority()
	queue, known := m.reqs[priority]
	if !known {
		m.priorities = append(m.priorities, priority)
		sort.Ints(m.priorities)
		queue = []*request.Request{}
	}
	m.reqs[priority] = append(queue, req)

	atomic.AddInt64(&m.maxPage, 1)
}
// Pull removes and returns a request from the queue, or nil if empty.
// Concurrency-safe.
//
// Nothing is returned unless the scheduler is in the RUN state. Priorities
// are scanned from highest to lowest and the first request of the first
// non-empty queue is taken. A request that already carries a proxy is
// returned untouched; otherwise its proxy is set from the global proxy pool
// when proxies are enabled, or to the empty string when they are not.
func (m *Matrix) Pull() (req *request.Request) {
	m.Lock()
	defer m.Unlock()

	if !sched.checkStatus(status.RUN) {
		return nil
	}

	// Bound the loop by the slice that is actually indexed, so a mismatch
	// between reqs and priorities can never index out of range.
	for i := len(m.priorities) - 1; i >= 0; i-- {
		priority := m.priorities[i]
		queue := m.reqs[priority]
		if len(queue) == 0 {
			continue
		}

		req = queue[0]
		// Clear the slot so the backing array does not keep the popped
		// request reachable after it has been handed out.
		queue[0] = nil
		m.reqs[priority] = queue[1:]

		if req.GetProxy() != "" {
			return req
		}
		if sched.useProxy {
			req.SetProxy(sched.proxy.GetOne(req.GetURL()).UnwrapOr(""))
		} else {
			req.SetProxy("")
		}
		return req
	}
	return nil
}
// Use takes one slot from the scheduler's concurrency channel and increments
// this Matrix's resource counter. A panic raised while doing so is logged
// with a stack trace and swallowed.
func (m *Matrix) Use() {
	defer func() {
		if recovered := recover(); recovered != nil {
			logs.Log().Error("panic recovered: %v\n%s", recovered, debug.Stack())
		}
	}()

	sched.count <- true
	atomic.AddInt32(&m.resCount, 1)
}
// Free gives one slot back to the scheduler's concurrency channel and
// decrements this Matrix's resource counter.
func (m *Matrix) Free() {
	<-sched.count

	atomic.AddInt32(&m.resCount, -1)
}
// DoHistory records the outcome of a finished request.
//
// A non-reloadable request is removed from the temporary history and, when it
// succeeded, added to the success history. A failed request seen for the
// first time is stored in the failure set and true is returned; a repeated
// failure is written to the failure history and false is returned. Successful
// requests always yield false.
func (m *Matrix) DoHistory(req *request.Request, ok bool) bool {
	reloadable := req.IsReloadable()
	if !reloadable {
		m.tempHistoryLock.Lock()
		delete(m.tempHistory, req.Unique())
		m.tempHistoryLock.Unlock()
	}

	if ok {
		if !reloadable {
			m.history.UpsertSuccess(req.Unique())
		}
		return false
	}

	m.failureLock.Lock()
	defer m.failureLock.Unlock()

	if _, seen := m.failures[req.Unique()]; seen {
		m.history.UpsertFailure(req)
		return false
	}
	m.failures[req.Unique()] = req
	logs.Log().Informational(" * + Failed request: [%v]\n", req.GetURL())
	return true
}
// CanStop reports whether this Matrix can stop (no pending work).
//
// It returns true when the scheduler is stopped or the page budget is used up
// (maxPage >= 0), and false while resource slots are held or requests are
// queued. When the queue is drained, every failed request that has not been
// retried yet is pushed back once (its entry is set to nil so it is not
// retried again) and false is returned; true is returned only when nothing
// was requeued.
func (m *Matrix) CanStop() bool {
	if sched.checkStatus(status.STOP) {
		return true
	}
	// Push updates maxPage with atomic.AddInt64 under the Matrix lock, which
	// is not held here, so the read must be atomic to avoid a data race.
	if atomic.LoadInt64(&m.maxPage) >= 0 {
		return true
	}
	if atomic.LoadInt32(&m.resCount) != 0 {
		return false
	}
	if m.Len() > 0 {
		return false
	}

	m.failureLock.Lock()
	defer m.failureLock.Unlock()

	requeued := false
	for reqUnique, req := range m.failures {
		if req == nil {
			continue
		}
		m.failures[reqUnique] = nil
		requeued = true
		logs.Log().Informational(" * - Failed request: [%v]\n", req.GetURL())
		m.Push(req)
	}
	return !requeued
}
// TryFlushSuccess flushes the success history for the current output type.
// It does nothing in SERVER mode or when SuccessInherit is disabled.
func (m *Matrix) TryFlushSuccess() {
	if cache.Task.Mode == status.SERVER || !cache.Task.SuccessInherit {
		return
	}
	m.history.FlushSuccess(cache.Task.OutType)
}
// TryFlushFailure flushes the failure history for the current output type.
// It does nothing in SERVER mode or when FailureInherit is disabled.
func (m *Matrix) TryFlushFailure() {
	if cache.Task.Mode == status.SERVER || !cache.Task.FailureInherit {
		return
	}
	m.history.FlushFailure(cache.Task.OutType)
}
// Wait polls every 500ms until no resource slot is held by this Matrix. It
// returns immediately when the scheduler is already stopped.
func (m *Matrix) Wait() {
	if sched.checkStatus(status.STOP) {
		return
	}
	for {
		if atomic.LoadInt32(&m.resCount) == 0 {
			return
		}
		time.Sleep(500 * time.Millisecond)
	}
}
// Len returns the total number of queued requests across all priorities.
func (m *Matrix) Len() int {
	m.Lock()
	defer m.Unlock()

	total := 0
	for _, queue := range m.reqs {
		total += len(queue)
	}
	return total
}
// hasHistory reports whether reqUnique is recorded in the success history or
// in the temporary history of requests currently in progress.
func (m *Matrix) hasHistory(reqUnique string) bool {
	if m.history.HasSuccess(reqUnique) {
		return true
	}
	m.tempHistoryLock.RLock()
	defer m.tempHistoryLock.RUnlock()
	return m.tempHistory[reqUnique]
}
// insertTempHistory marks reqUnique as present in the temporary history.
func (m *Matrix) insertTempHistory(reqUnique string) {
	m.tempHistoryLock.Lock()
	defer m.tempHistoryLock.Unlock()
	m.tempHistory[reqUnique] = true
}
// setFailures copies the given failed requests into the failure set and logs
// each of them.
func (m *Matrix) setFailures(reqs map[string]*request.Request) {
	m.failureLock.Lock()
	defer m.failureLock.Unlock()

	for unique, failed := range reqs {
		m.failures[unique] = failed
		logs.Log().Informational(" * + Failed request: [%v]\n", failed.GetURL())
	}
}
================================================
FILE: app/scheduler/scheduler.go
================================================
// Package scheduler provides crawl task scheduling and resource allocation.
package scheduler
import (
"runtime/debug"
"sync"
"github.com/andeya/pholcus/app/aid/proxy"
"github.com/andeya/pholcus/logs"
"github.com/andeya/pholcus/runtime/status"
)
// scheduler coordinates crawl tasks and resource allocation.
type scheduler struct {
status int // running status: one of status.RUN, status.PAUSE, status.STOP
count chan bool // total concurrency count; its capacity is the thread number passed to Init
useProxy bool // whether proxy IP is used
proxy *proxy.Proxy // global proxy IP
matrices []*Matrix // request matrices per Spider instance
sync.RWMutex // global read-write lock; taken by checkStatus, PauseRecover, Stop and AddMatrix
}
// sched is the global scheduler instance. It starts in the RUN state with a
// concurrency channel of capacity 1; Init replaces the channel with one sized
// by the configured thread number.
var sched = &scheduler{
status: status.RUN,
count: make(chan bool, 1),
proxy: proxy.New(),
}
// Init resets the global scheduler: it clears the matrix list, sizes the
// concurrency channel to threadNum, decides whether proxies are used and sets
// the status to RUN. Proxies are enabled only when proxyMinute is positive
// and the proxy list is not empty; proxyMinute is then used as the rotation
// interval.
func Init(threadNum int, proxyMinute int64) {
	sched.matrices = []*Matrix{}
	sched.count = make(chan bool, threadNum)

	switch {
	case proxyMinute <= 0:
		sched.useProxy = false
		logs.Log().Informational(" * Not using proxy IP\n")
	case sched.proxy.Count() > 0:
		sched.useProxy = true
		sched.proxy.UpdateTicker(proxyMinute)
		logs.Log().Informational(" * Using proxy IP, rotation interval: %v minutes\n", proxyMinute)
	default:
		sched.useProxy = false
		logs.Log().Informational(" * Proxy IP list is empty, cannot use proxy\n")
	}

	sched.status = status.RUN
}
// ReloadProxyLib refreshes the global proxy list by calling Update on it.
func ReloadProxyLib() {
	globalProxy := sched.proxy
	globalProxy.Update()
}
// AddMatrix registers a resource queue for the given spider and returns its
// Matrix.
//
// The matrix is built before the scheduler lock is taken. Appending to the
// shared matrix list is a write, so the exclusive lock is required: a read
// lock can be held by several goroutines at once and would let concurrent
// calls race on the slice.
func AddMatrix(spiderName, spiderSubName string, maxPage int64) *Matrix {
	matrix := newMatrix(spiderName, spiderSubName, maxPage)
	sched.Lock()
	defer sched.Unlock()
	sched.matrices = append(sched.matrices, matrix)
	return matrix
}
// PauseRecover flips the scheduler between RUN and PAUSE. Any other status is
// left unchanged.
func PauseRecover() {
	sched.Lock()
	defer sched.Unlock()

	if sched.status == status.PAUSE {
		sched.status = status.RUN
	} else if sched.status == status.RUN {
		sched.status = status.PAUSE
	}
}
// Stop marks the scheduler as stopped, closes the concurrency channel and
// clears the matrix list. A panic raised while closing the channel is logged
// with a stack trace and swallowed; the matrix list is then left as it was.
func Stop() {
	sched.Lock()
	defer sched.Unlock()

	sched.status = status.STOP

	// Registered after the unlock so it runs first and the lock is still
	// released if closing the channel panics.
	defer func() {
		if recovered := recover(); recovered != nil {
			logs.Log().Error("panic recovered: %v\n%s", recovered, debug.Stack())
		}
	}()

	close(sched.count)
	sched.matrices = []*Matrix{}
}
// avgRes returns the average number of resource slots per spider instance:
// the capacity of the concurrency channel divided by the number of
// registered matrices, never less than 1.
//
// With no registered matrix (for example after Stop cleared the list while a
// Push was still waiting) the whole capacity is returned instead of dividing
// by zero. The shared fields are read under the read lock.
func (sched *scheduler) avgRes() int32 {
	sched.RLock()
	capacity, instances := cap(sched.count), len(sched.matrices)
	sched.RUnlock()

	if instances == 0 {
		instances = 1
	}
	avg := int32(capacity / instances)
	if avg == 0 {
		avg = 1
	}
	return avg
}
// checkStatus reports whether the scheduler's current status equals s. The
// status is read under the read lock.
func (sched *scheduler) checkStatus(s int) bool {
	sched.RLock()
	defer sched.RUnlock()
	return sched.status == s
}
================================================
FILE: app/scheduler/scheduler_test.go
================================================
package scheduler
import (
"testing"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/runtime/cache"
"github.com/andeya/pholcus/runtime/status"
)
// makeReq returns a prepared GET request for the given URL and rule.
func makeReq(url, rule string) *request.Request {
	req := &request.Request{
		URL:    url,
		Rule:   rule,
		Method: "GET",
	}
	req.Prepare()
	return req
}
// TestInit runs Init with and without a proxy rotation interval.
func TestInit(t *testing.T) {
	cases := []struct {
		name        string
		threadNum   int
		proxyMinute int64
	}{
		{"basic", 4, 0},
		{"with_proxy_minute", 8, 5},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			Init(tc.threadNum, tc.proxyMinute)
		})
	}
}
// TestAddMatrix verifies that AddMatrix returns a non-nil Matrix for various
// spider names and page limits.
func TestAddMatrix(t *testing.T) {
	Init(4, 0)
	cases := []struct {
		name       string
		spiderName string
		spiderSub  string
		maxPage    int64
		wantNotNil bool
	}{
		{"basic", "sp1", "", -10, true},
		{"with_sub", "sp2", "sub1", -1, true},
		{"zero_limit", "sp3", "", 0, true},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := AddMatrix(tc.spiderName, tc.spiderSub, tc.maxPage)
			if notNil := got != nil; notNil != tc.wantNotNil {
				t.Errorf("AddMatrix() got nil=%v, want not nil=%v", got == nil, tc.wantNotNil)
			}
		})
	}
}
// TestPauseRecover toggles pause twice, leaving the scheduler running.
func TestPauseRecover(t *testing.T) {
	Init(4, 0)
	for toggle := 0; toggle < 2; toggle++ {
		PauseRecover()
	}
}
// TestReloadProxyLib checks that reloading the proxy list does not panic.
func TestReloadProxyLib(_ *testing.T) {
	ReloadProxyLib()
}
// TestMatrix_PushPull_Len pushes two requests and checks that Len tracks the
// queue size as they are pulled.
func TestMatrix_PushPull_Len(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -5)
	if m == nil {
		t.Fatal("AddMatrix returned nil")
	}

	for _, req := range []*request.Request{
		makeReq("http://a.com/1", "r1"),
		makeReq("http://a.com/2", "r2"),
	} {
		m.Push(req)
	}
	if got := m.Len(); got != 2 {
		t.Errorf("Len() = %d, want 2", got)
	}

	first := m.Pull()
	if first == nil {
		t.Fatal("Pull() returned nil")
	}
	if url := first.GetURL(); url != "http://a.com/1" && url != "http://a.com/2" {
		t.Errorf("Pull() got URL %s", first.GetURL())
	}
	if m.Len() != 1 {
		t.Errorf("Len() after Pull = %d, want 1", m.Len())
	}

	second := m.Pull()
	if second == nil {
		t.Fatal("Pull() returned nil")
	}
	if m.Len() != 0 {
		t.Errorf("Len() after 2nd Pull = %d, want 0", m.Len())
	}
}
// TestMatrix_Push_ignored_when_maxPage_non_negative verifies that a Matrix
// created with maxPage 0 accepts no requests.
func TestMatrix_Push_ignored_when_maxPage_non_negative(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", 0)
	m.Push(makeReq("http://a.com", "r"))
	if queued := m.Len(); queued != 0 {
		t.Errorf("Push with maxPage>=0 should be ignored, Len()=%d", queued)
	}
}
// TestMatrix_Pull_empty_returns_nil verifies that Pull yields nil on an empty
// queue.
func TestMatrix_Pull_empty_returns_nil(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -1)
	got := m.Pull()
	if got != nil {
		t.Errorf("Pull() on empty queue = %v, want nil", got)
	}
}
// TestMatrix_Pull_paused_returns_nil verifies that Pull yields nil while the
// scheduler is paused, even with a queued request. The scheduler is resumed
// before the assertion.
func TestMatrix_Pull_paused_returns_nil(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -2)
	m.Push(makeReq("http://a.com", "r"))

	PauseRecover() // pause
	pulled := m.Pull()
	PauseRecover() // resume

	if pulled != nil {
		t.Errorf("Pull() when paused = %v, want nil", pulled)
	}
}
// TestMatrix_Use_Free acquires and releases one resource slot.
func TestMatrix_Use_Free(_ *testing.T) {
	Init(4, 0)
	matrix := AddMatrix("sp", "", -1)
	matrix.Use()
	matrix.Free()
}
// TestMatrix_DoHistory feeds one pulled request through DoHistory as a
// success, a first failure and a repeated failure. The sub-tests share state
// and must run in this order.
func TestMatrix_DoHistory(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -2)
	m.Push(makeReq("http://a.com/x", "r"))
	pulled := m.Pull()
	if pulled == nil {
		t.Fatal("Pull failed")
	}

	steps := []struct {
		name string
		ok   bool
		want bool
	}{
		{"success", true, false},
		{"failure_new", false, true},
		{"failure_again", false, false},
	}
	for _, step := range steps {
		t.Run(step.name, func(t *testing.T) {
			if got := m.DoHistory(pulled, step.ok); got != step.want {
				t.Errorf("DoHistory(ok=%v) = %v, want %v", step.ok, got, step.want)
			}
		})
	}
}
// TestMatrix_DoHistory_reloadable runs the success, first-failure and
// repeated-failure sequence for a reloadable request.
func TestMatrix_DoHistory_reloadable(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -1)
	req := makeReq("http://a.com/r", "r")
	req.SetReloadable(true)

	if got := m.DoHistory(req, true); got != false {
		t.Errorf("DoHistory(reloadable, true) = %v, want false", got)
	}
	if got := m.DoHistory(req, false); got != true {
		t.Errorf("DoHistory(reloadable, false) first = %v, want true (new failure)", got)
	}
	if got := m.DoHistory(req, false); got != false {
		t.Errorf("DoHistory(reloadable, false) again = %v, want false", got)
	}
}
// TestMatrix_CanStop checks CanStop for an idle matrix, one with a queued
// request and one holding a resource slot.
func TestMatrix_CanStop(t *testing.T) {
	Init(4, 0)
	cases := []struct {
		name    string
		maxPage int64
		push    int
		use     bool
		want    bool
	}{
		{"empty_no_work", -1, 0, false, true},
		{"has_pending", -2, 1, false, false},
		{"has_inflight", -1, 0, true, false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			m := AddMatrix("sp_"+tc.name, "", tc.maxPage)
			for n := 0; n < tc.push; n++ {
				m.Push(makeReq("http://a.com/"+tc.name+string(rune('a'+n)), "r"))
			}
			if tc.use {
				m.Use()
				defer m.Free()
			}
			if got := m.CanStop(); got != tc.want {
				t.Errorf("CanStop() = %v, want %v", got, tc.want)
			}
		})
	}
}
// TestMatrix_CanStop_after_Stop verifies that CanStop is true once the
// scheduler has been stopped, even with a queued request. The scheduler is
// re-initialized before the assertion.
func TestMatrix_CanStop_after_Stop(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -5)
	m.Push(makeReq("http://a.com", "r"))

	Stop()
	stopped := m.CanStop()
	Init(4, 0)

	if !stopped {
		t.Errorf("CanStop() after Stop = %v, want true", stopped)
	}
}
// TestMatrix_TryFlushSuccess_Failure calls both flush helpers in SERVER mode
// and in OFFLINE mode with inheritance enabled. The original task
// configuration is restored afterwards.
func TestMatrix_TryFlushSuccess_Failure(t *testing.T) {
	saved := cache.Task
	defer func() { cache.Task = saved }()

	cache.Task = &cache.AppConf{Mode: status.SERVER}
	Init(4, 0)
	serverMatrix := AddMatrix("sp", "", -1)
	serverMatrix.TryFlushSuccess()
	serverMatrix.TryFlushFailure()

	cache.Task = &cache.AppConf{Mode: status.OFFLINE, SuccessInherit: true, FailureInherit: true, OutType: "csv"}
	offlineMatrix := AddMatrix("sp2", "", -1)
	offlineMatrix.TryFlushSuccess()
	offlineMatrix.TryFlushFailure()
}
// TestMatrix_Wait verifies that Wait returns when no slot is held.
func TestMatrix_Wait(_ *testing.T) {
	Init(4, 0)
	matrix := AddMatrix("sp", "", -1)
	matrix.Wait()
}
// TestMatrix_Push_duplicate_skipped verifies that pushing the same request
// twice queues it only once.
func TestMatrix_Push_duplicate_skipped(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -3)
	dup := makeReq("http://a.com/dup", "r")
	for attempt := 0; attempt < 2; attempt++ {
		m.Push(dup)
	}
	if queued := m.Len(); queued != 1 {
		t.Errorf("duplicate Push should be skipped, Len()=%d", queued)
	}
}
// TestMatrix_Pull_priority verifies that the request with the higher priority
// is pulled first, regardless of push order.
func TestMatrix_Pull_priority(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -3)

	lowPrio := makeReq("http://a.com/low", "r")
	lowPrio.SetPriority(0)
	highPrio := makeReq("http://a.com/high", "r")
	highPrio.SetPriority(10)

	m.Push(lowPrio)
	m.Push(highPrio)

	got := m.Pull()
	if got == nil {
		t.Fatal("Pull returned nil")
	}
	if url := got.GetURL(); url != "http://a.com/high" {
		t.Errorf("higher priority should be pulled first, got %s", url)
	}
}
// TestMatrix_Pull_request_with_proxy_passthrough verifies that Pull keeps a
// proxy that was already set on the request.
func TestMatrix_Pull_request_with_proxy_passthrough(t *testing.T) {
	Init(4, 0)
	m := AddMatrix("sp", "", -1)

	withProxy := makeReq("http://a.com", "r")
	withProxy.SetProxy("http://proxy:8080")
	m.Push(withProxy)

	pulled := m.Pull()
	if pulled == nil || pulled.GetProxy() != "http://proxy:8080" {
		t.Errorf("Pull with existing proxy should preserve it, got %v", pulled)
	}
}
================================================
FILE: app/spider/common/common.go
================================================
// Package common provides HTML cleaning, form parsing, and other utility functions for spider rules.
package common
import (
"math"
"net/http"
"net/url"
"regexp"
"strconv"
"strings"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/common/goquery"
"github.com/andeya/pholcus/common/mahonia"
"github.com/andeya/pholcus/common/ping"
)
// CleanHtml strips HTML tags at increasing levels of aggressiveness based on depth.
func CleanHtml(str string, depth int) string {
if depth > 0 {
re, _ := regexp.Compile("<[\\S\\s]+?>")
str = re.ReplaceAllStringFunc(str, strings.ToLower)
}
if depth > 1 {
re, _ := regexp.Compile("")
html = re.ReplaceAllString(html, "")
re = regexp.MustCompile("y", 3, "xy"},
{"depth4_replace_tags", "a
", 4, "\na\n"},
{"depth5_collapse_ws", "a \n b", 5, "a\nb"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := CleanHtml(tt.str, tt.depth); got != tt.want {
t.Errorf("CleanHtml() = %q, want %q", got, tt.want)
}
})
}
}
// TestDeprive runs Deprive over a table of inputs and compares each result
// with the expected string.
func TestDeprive(t *testing.T) {
	type depriveCase struct {
		s    string
		want string
	}
	cases := []depriveCase{
		{"a\nb\rc\td e", "abcde"},
		{"", ""},
	}
	for _, tc := range cases {
		got := Deprive(tc.s)
		if got == tc.want {
			continue
		}
		t.Errorf("Deprive(%q) = %q, want %q", tc.s, got, tc.want)
	}
}
// TestDeprive2 runs Deprive2 over inputs mixing real control characters,
// their backslash-escaped spellings, and spaces.
func TestDeprive2(t *testing.T) {
	type depriveCase struct {
		s    string
		want string
	}
	cases := []depriveCase{
		{"a\nb\\nc", "abc"},
		{"a\rb\\rd", "abd"},
		{"a\tb\\te", "abe"},
		{" a b ", "ab"},
	}
	for _, tc := range cases {
		got := Deprive2(tc.s)
		if got == tc.want {
			continue
		}
		t.Errorf("Deprive2(%q) = %q, want %q", tc.s, got, tc.want)
	}
}
// TestFloor checks Floor for a few value/precision pairs, including a
// negative input.
func TestFloor(t *testing.T) {
	type floorCase struct {
		f    float64
		n    int
		want float64
	}
	cases := []floorCase{
		{3.14159, 2, 3.14},
		{3.14159, 0, 3},
		{-1.234, 1, -1.2},
	}
	for _, tc := range cases {
		got := Floor(tc.f, tc.n)
		if got == tc.want {
			continue
		}
		t.Errorf("Floor(%v, %d) = %v, want %v", tc.f, tc.n, got, tc.want)
	}
}
// TestSplitCookies checks the number of cookies SplitCookies produces and,
// when an expectation map is given, that every returned cookie matches it.
func TestSplitCookies(t *testing.T) {
	type cookieCase struct {
		cookieStr string
		wantLen   int
		want      map[string]string
	}
	cases := []cookieCase{
		{"a=1; b=2", 2, map[string]string{"a": "1", "b": "2"}},
		{"mt=ci%3D-1_0; thw=cn", 2, map[string]string{"mt": "ci%3D-1_0", "thw": "cn"}},
		{"single=val", 1, map[string]string{"single": "val"}},
		{"", 0, nil},
		{" a = b ", 1, map[string]string{"a": "b"}},
	}
	for _, tc := range cases {
		parsed := SplitCookies(tc.cookieStr)
		if n := len(parsed); n != tc.wantLen {
			t.Errorf("SplitCookies(%q) len = %d, want %d", tc.cookieStr, n, tc.wantLen)
		}
		if tc.want == nil {
			continue
		}
		for _, ck := range parsed {
			expected, known := tc.want[ck.Name]
			if known && ck.Value == expected {
				continue
			}
			t.Errorf("SplitCookies(%q) cookie %s = %q, want %q", tc.cookieStr, ck.Name, ck.Value, expected)
		}
	}
}
// TestDecodeString checks DecodeString for a UTF-8 passthrough and for a
// two-byte GB18030 sequence.
func TestDecodeString(t *testing.T) {
	type decodeCase struct {
		src     string
		charset string
		want    string
	}
	cases := []decodeCase{
		{"hello", "UTF-8", "hello"},
		{string([]byte{0xD6, 0xD0}), "GB18030", "中"},
	}
	for _, tc := range cases {
		got := DecodeString(tc.src, tc.charset)
		if got == tc.want {
			continue
		}
		t.Errorf("DecodeString(%q, %q) = %q, want %q", tc.src, tc.charset, got, tc.want)
	}
}
// TestEncodeString checks EncodeString for a UTF-8 passthrough and for
// encoding one character into its two-byte GB18030 form.
func TestEncodeString(t *testing.T) {
	type encodeCase struct {
		src     string
		charset string
		want    string
	}
	cases := []encodeCase{
		{"hello", "UTF-8", "hello"},
		{"中", "GB18030", string([]byte{0xD6, 0xD0})},
	}
	for _, tc := range cases {
		got := EncodeString(tc.src, tc.charset)
		if got == tc.want {
			continue
		}
		t.Errorf("EncodeString(%q, %q) = %q, want %q", tc.src, tc.charset, got, tc.want)
	}
}
// TestConvertToString checks ConvertToString for a same-charset conversion
// and for GB18030 to UTF-8.
func TestConvertToString(t *testing.T) {
	type convertCase struct {
		src     string
		srcCode string
		tagCode string
		want    string
	}
	cases := []convertCase{
		{"hello", "UTF-8", "UTF-8", "hello"},
		{string([]byte{0xD6, 0xD0}), "GB18030", "UTF-8", "中"},
	}
	for _, tc := range cases {
		got := ConvertToString(tc.src, tc.srcCode, tc.tagCode)
		if got == tc.want {
			continue
		}
		t.Errorf("ConvertToString(%q, %q, %q) = %q, want %q", tc.src, tc.srcCode, tc.tagCode, got, tc.want)
	}
}
// TestGBKToUTF8 checks GBKToUTF8 on an ASCII string and on a two-byte
// encoded character.
func TestGBKToUTF8(t *testing.T) {
	type gbkCase struct {
		src  string
		want string
	}
	cases := []gbkCase{
		{"hello", "hello"},
		{string([]byte{0xD6, 0xD0}), "中"},
	}
	for _, tc := range cases {
		got := GBKToUTF8(tc.src)
		if got == tc.want {
			continue
		}
		t.Errorf("GBKToUTF8(%q) = %q, want %q", tc.src, got, tc.want)
	}
}
// TestUnicodeToUTF8 runs UnicodeToUTF8 over a table of inputs and compares
// each result with the expected string.
func TestUnicodeToUTF8(t *testing.T) {
	type unicodeCase struct {
		str  string
		want string
	}
	cases := []unicodeCase{
		{"咖啡", "咖啡"},
		{"ab", "ab"},
		{"A", "A"},
	}
	for _, tc := range cases {
		got := UnicodeToUTF8(tc.str)
		if got == tc.want {
			continue
		}
		t.Errorf("UnicodeToUTF8(%q) = %q, want %q", tc.str, got, tc.want)
	}
}
// TestUnicode16ToUTF8 checks that \uXXXX escape sequences are decoded, both
// alone and when surrounded by plain text.
func TestUnicode16ToUTF8(t *testing.T) {
	type escapeCase struct {
		str  string
		want string
	}
	cases := []escapeCase{
		{`\u4e2d\u6587`, "中文"},
		{`a\u0041b`, "aAb"},
		{`\u0041`, "A"},
		{`x\u0041`, "xA"},
	}
	for _, tc := range cases {
		got := Unicode16ToUTF8(tc.str)
		if got == tc.want {
			continue
		}
		t.Errorf("Unicode16ToUTF8(%q) = %q, want %q", tc.str, got, tc.want)
	}
}
// TestMakeUrl checks the URL and the ok flag returned by MakeUrl for relative
// paths with and without a scheme-and-host, and for already absolute URLs.
func TestMakeUrl(t *testing.T) {
	type makeURLCase struct {
		path          string
		schemeAndHost []string
		wantUrl       string
		wantOk        bool
	}
	cases := []makeURLCase{
		{"/path", []string{"https://example.com"}, "https://example.com/path", true},
		{"path", []string{"https://example.com"}, "https://example.com/path", true},
		{"https://example.com/path", nil, "https://example.com/path", true},
		{"/path", nil, "/path", false},
		{"Https://x.com", nil, "Https://x.com", true},
	}
	for _, tc := range cases {
		gotURL, gotOk := MakeUrl(tc.path, tc.schemeAndHost...)
		if gotURL == tc.wantUrl && gotOk == tc.wantOk {
			continue
		}
		t.Errorf("MakeUrl(%q, %v) = %q, %v; want %q, %v", tc.path, tc.schemeAndHost, gotURL, gotOk, tc.wantUrl, tc.wantOk)
	}
}
// TestProcessHtml runs ProcessHtml over a table of inputs and compares each
// result with the expected string.
// NOTE(review): the last fixture row spans physical lines inside interpreted
// string literals; the original markup may have been lost — confirm against
// the upstream file before relying on these cases.
func TestProcessHtml(t *testing.T) {
tests := []struct {
html string
want string
}{
{"ab", "ab"},
{"**bold**", "*"},
{"text
", "text
"},
}
for _, tt := range tests {
if got := ProcessHtml(tt.html); got != tt.want {
t.Errorf("ProcessHtml(%q) = %q, want %q", tt.html, got, tt.want)
}
}
}
// TestDepriveBreak checks that DepriveBreak removes real line-break/tab
// characters as well as a backslash-escaped "\n" sequence.
func TestDepriveBreak(t *testing.T) {
	type breakCase struct {
		s    string
		want string
	}
	cases := []breakCase{
		{"a\nb\rc\td", "abcd"},
		{"a\\nb", "ab"},
	}
	for _, tc := range cases {
		got := DepriveBreak(tc.s)
		if got == tc.want {
			continue
		}
		t.Errorf("DepriveBreak(%q) = %q, want %q", tc.s, got, tc.want)
	}
}
// TestDepriveMutiBreak checks that runs of line breaks, and whitespace around
// a line break, collapse to a single "\n".
func TestDepriveMutiBreak(t *testing.T) {
	type breakCase struct {
		s    string
		want string
	}
	cases := []breakCase{
		{"a\n\n\nb", "a\nb"},
		{"x \n y", "x\ny"},
	}
	for _, tc := range cases {
		got := DepriveMutiBreak(tc.s)
		if got == tc.want {
			continue
		}
		t.Errorf("DepriveMutiBreak(%q) = %q, want %q", tc.s, got, tc.want)
	}
}
// TestHrefSub checks that HrefSub appends a sub-query with "?" or "&" as
// appropriate and leaves the URL unchanged when the sub-query is empty.
func TestHrefSub(t *testing.T) {
	type hrefSubCase struct {
		src  string
		sub  string
		want string
	}
	cases := []hrefSubCase{
		{"http://a.com", "k=v", "http://a.com?k=v"},
		{"http://a.com?x=1", "k=v", "http://a.com?x=1&k=v"},
		{"http://a.com", "", "http://a.com"},
	}
	for _, tc := range cases {
		got := HrefSub(tc.src, tc.sub)
		if got == tc.want {
			continue
		}
		t.Errorf("HrefSub(%q, %q) = %q, want %q", tc.src, tc.sub, got, tc.want)
	}
}
// TestGetHref runs GetHref over a table of base URL / page URL / href
// combinations and compares each result with the expected absolute URL
// (or the empty string).
func TestGetHref(t *testing.T) {
	type hrefCase struct {
		baseURL  string
		url      string
		href     string
		mustBase bool
		want     string
	}
	cases := []hrefCase{
		{"https://example.com/", "https://example.com/page", "javascript:void(0)", false, ""},
		{"https://example.com/", "https://example.com/page", "/", false, "https://example.com/"},
		{"https://example.com", "https://example.com/a/b", "./c", false, "https://example.com/a/c"},
		{"https://example.com/", "https://example.com/a/b", "/path", false, "https://example.com/path"},
		{"https://example.com/", "https://example.com/a", "https://other.com/x", false, "https://other.com/x"},
		{"https://example.com/", "https://example.com/a", "rel", false, "https://example.com/rel"},
		{"https://example.com/", "https://example.com/a/b/c", "../d", false, "https://example.com/a/d"},
		{"https://example.com/", "https://example.com/a", "", false, ""},
		{"https://example.com/", "https://example.com/a", "./../x", false, ""},
		{"https://example.com/", "https://example.com/a", "192.168.1.1", false, "https://example.com/192.168.1.1"},
		{"https://example.com/", "https://example.com/a", "a.b.c.d.e", false, "a.b.c.d.e"},
		{"https://example.com/", "https://example.com/a", "https://example.com/x", true, "https://example.com/x"},
		{"https://example.com/", "https://other.com/a", "./x", false, "https://other.com/x"},
	}
	for _, tc := range cases {
		resolved := GetHref(tc.baseURL, tc.url, tc.href, tc.mustBase)
		if resolved == tc.want {
			continue
		}
		t.Errorf("GetHref(%q, %q, %q, %v) = %q, want %q", tc.baseURL, tc.url, tc.href, tc.mustBase, resolved, tc.want)
	}
}
// TestExtractArticle calls ExtractArticle on two fixtures. It never fails:
// both branches only log, so it acts as a smoke test that the call returns.
// NOTE(review): the raw-string fixtures contain no markup here; they may have
// lost their HTML tags — confirm against the upstream file.
func TestExtractArticle(t *testing.T) {
html := `This is the longest article content here.
`
got := ExtractArticle(html)
// An empty result is tolerated and only logged.
if got == "" && len(html) > 0 {
t.Logf("ExtractArticle returned empty (heuristic may vary)")
}
html2 := `single
`
got2 := ExtractArticle(html2)
// A non-empty result is likewise only logged.
if got2 != "" {
t.Logf("ExtractArticle(%q) = %q", html2, got2)
}
}
// TestPinger calls Pinger against the loopback address. An error result is
// only logged, so the test passes either way.
func TestPinger(t *testing.T) {
	res := Pinger("127.0.0.1", 1)
	if !res.IsErr() {
		return
	}
	t.Logf("Pinger(127.0.0.1) err (expected if no listener): %v", res.UnwrapErr())
}
// TestPing calls Ping against the loopback address. An error result is only
// logged, so the test passes either way.
func TestPing(t *testing.T) {
	res := Ping("127.0.0.1", 1)
	if !res.IsErr() {
		return
	}
	t.Logf("Ping(127.0.0.1) err (expected): %v", res.UnwrapErr())
}
================================================
FILE: app/spider/common/form.go
================================================
package common
import (
"net/url"
"strings"
"github.com/andeya/pholcus/common/goquery"
"github.com/andeya/pholcus/app/downloader/request"
spider "github.com/andeya/pholcus/app/spider"
)
// Form is the default form element. It holds the values extracted from an
// HTML form selection and enqueues the submission request through ctx.
type Form struct {
ctx *spider.Context // context used to enqueue the submission request
rule string // rule name put on the enqueued request
selection *goquery.Selection // the underlying form selection
method string // upper-cased submission method
action string // absolute action URL
fields url.Values // non-submit field values keyed by name
buttons url.Values // submit button values keyed by name
}
// NewForm creates and returns a *Form for the given form selection.
//
// u is the URL of the page containing the form. When schemeAndHost is omitted
// it is derived from u. The form's action is resolved into an absolute URL;
// NewForm returns nil when that is not possible, including when u cannot be
// parsed and no schemeAndHost was supplied.
func NewForm(ctx *spider.Context, rule string, u string, form *goquery.Selection, schemeAndHost ...string) *Form {
	if len(schemeAndHost) == 0 {
		pageURL, err := url.Parse(u)
		if err != nil {
			// url.Parse returns a nil *url.URL on error; without a base there
			// is nothing to resolve the action against.
			return nil
		}
		schemeAndHost = append(schemeAndHost, pageURL.Scheme+"://"+pageURL.Host)
	}

	method, action := formAttributes(u, form, schemeAndHost...)
	if action == "" {
		return nil
	}
	if method == "" {
		method = "GET"
	}

	fields, buttons := serializeForm(form)
	return &Form{
		ctx:       ctx,
		rule:      rule,
		selection: form,
		method:    method,
		action:    action,
		fields:    fields,
		buttons:   buttons,
	}
}
// Method returns the form method, eg "GET" or "POST" or "POST-M".
// The value is the one stored when the form was created.
func (f *Form) Method() string {
return f.method
}
// Action returns the form action URL.
// The URL will always be absolute (NewForm returns nil when the action
// cannot be made absolute).
func (f *Form) Action() string {
return f.action
}
// Input overwrites the value of the named form field and returns the form for
// chaining. Names that are not already fields of the form are ignored.
func (f *Form) Input(name, value string) *Form {
	_, known := f.fields[name]
	if !known {
		return f
	}
	f.fields.Set(name, value)
	return f
}
// Inputs sets several form fields at once from a name-to-value map and
// returns the form for chaining. Names that are not already fields of the
// form are ignored.
func (f *Form) Inputs(kv map[string]string) *Form {
	for name, value := range kv {
		f.Input(name, value)
	}
	return f
}
// Submit submits the form.
//
// When the form has submit buttons, one of them is clicked; otherwise the
// form is sent without any button value. Buttons are stored in a map, so
// their document order is not available: the button with the
// lexicographically smallest name is chosen so that repeated runs enqueue the
// same request.
func (f *Form) Submit() bool {
	if len(f.buttons) == 0 {
		return f.send("", "")
	}
	chosen, picked := "", false
	for name := range f.buttons {
		// Map iteration order is randomized; keep the smallest name.
		if !picked || name < chosen {
			chosen, picked = name, true
		}
	}
	return f.Click(chosen)
}
// Click submits the form as if the button with the given name was pressed,
// sending that button's first value along with the fields. It returns false
// when the form has no button with that name.
func (f *Form) Click(button string) bool {
	values, exists := f.buttons[button]
	if !exists {
		return false
	}
	return f.send(button, values[0])
}
// Dom returns the inner *goquery.Selection, i.e. the form selection that was
// passed to NewForm.
func (f *Form) Dom() *goquery.Selection {
return f.selection
}
// send enqueues the request that submits the form and reports success.
//
// The request carries all form fields plus, when buttonName is non-empty, the
// clicked button's value. For GET the encoded values are appended to the
// action URL as a query string; otherwise they are sent as PostData, using
// the "POST-M" method when the form's enctype is "multipart/form-data".
func (f *Form) send(buttonName, buttonValue string) bool {
	values := make(url.Values, len(f.fields)+1)
	for name, vals := range f.fields {
		values[name] = vals
	}
	if buttonName != "" {
		values.Set(buttonName, buttonValue)
	}
	encoded := values.Encode()

	req := &request.Request{
		Rule:   f.rule,
		URL:    f.Action(),
		Method: f.Method(),
	}
	if req.Method == "GET" {
		if encoded != "" {
			// The action may already carry a query string; pick the separator
			// that keeps the resulting URL well-formed.
			sep := "&"
			switch {
			case !strings.Contains(req.URL, "?"):
				sep = "?"
			case strings.HasSuffix(req.URL, "?"), strings.HasSuffix(req.URL, "&"):
				sep = ""
			}
			req.URL += sep + encoded
		}
		f.ctx.AddQueue(req)
		return true
	}

	req.PostData = encoded
	if f.selection.Attr("enctype").UnwrapOr("") == "multipart/form-data" {
		req.Method = "POST-M"
	}
	f.ctx.AddQueue(req)
	return true
}
// serializeForm converts the form's named controls into url.Values.
// It returns two url.Values: the first holds the field values, the second
// holds the submit button values.
//
// Controls without a name attribute are skipped. A control whose type is
// "submit" is a button; every other control is a field. A missing (or empty)
// type attribute follows the HTML defaults: <button> counts as a submit
// button, while <input> and <textarea> count as fields. Values are read from
// the value attribute, defaulting to "".
func serializeForm(sel *goquery.Selection) (url.Values, url.Values) {
	fields := make(url.Values)
	buttons := make(url.Values)
	sel.Find("input,button,textarea").Each(func(_ int, s *goquery.Selection) {
		name := s.Attr("name")
		if !name.IsSome() {
			return
		}
		typ := s.Attr("type").UnwrapOr("")
		if typ == "" && s.Is("button") {
			typ = "submit"
		}
		value := s.Attr("value").UnwrapOr("")
		if typ == "submit" {
			buttons.Add(name.Unwrap(), value)
			return
		}
		fields.Add(name.Unwrap(), value)
	})
	return fields, buttons
}
// formAttributes returns the upper-cased method and the absolute action URL
// of the form. The method defaults to "GET" and the action defaults to u when
// the attributes are absent. Both results are empty when MakeUrl cannot turn
// the action into an absolute URL.
func formAttributes(u string, form *goquery.Selection, schemeAndHost ...string) (string, string) {
	rawAction := form.Attr("action").UnwrapOr(u)
	absAction, ok := MakeUrl(rawAction, schemeAndHost...)
	if !ok {
		return "", ""
	}
	rawMethod := form.Attr("method").UnwrapOr("GET")
	return strings.ToUpper(rawMethod), absAction
}
================================================
FILE: app/spider/common/form_test.go
================================================
package common
import (
"strings"
"testing"
spider "github.com/andeya/pholcus/app/spider"
"github.com/andeya/pholcus/common/goquery"
)
// TestNewForm builds a Form from the first form element of the fixture and
// exercises its accessors, Input/Inputs, Submit and Click.
func TestNewForm(t *testing.T) {
	html := ``
	parsed := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parsed.IsErr() {
		t.Fatalf("parse html: %v", parsed.UnwrapErr())
	}
	formSel := parsed.Unwrap().Find("form").First()

	sp := &spider.Spider{RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}}}
	ctx := spider.GetContext(sp, nil)
	defer spider.PutContext(ctx)

	form := NewForm(ctx, "r1", "https://example.com/page", formSel)
	if form == nil {
		t.Fatal("NewForm returned nil")
	}
	if method := form.Method(); method != "GET" {
		t.Errorf("Method() = %q, want GET", method)
	}
	if action := form.Action(); action != "https://example.com/search" {
		t.Errorf("Action() = %q, want https://example.com/search", action)
	}
	if form.Dom() != formSel {
		t.Error("Dom() mismatch")
	}

	form.Input("q", "newval")
	form.Inputs(map[string]string{"q": "v2"})

	if submitted := form.Submit(); !submitted {
		t.Error("Submit() = false")
	}
	if clicked := form.Click("btn"); !clicked {
		t.Error("Click(btn) = false")
	}
	if clicked := form.Click("nonexistent"); clicked {
		t.Error("Click(nonexistent) should be false")
	}
}
// TestNewForm_Post builds a Form from the fixture, expects its method to be
// POST, and submits it.
func TestNewForm_Post(t *testing.T) {
	html := ``
	parsed := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parsed.IsErr() {
		t.Fatalf("parse html: %v", parsed.UnwrapErr())
	}
	formSel := parsed.Unwrap().Find("form").First()

	sp := &spider.Spider{RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}}}
	ctx := spider.GetContext(sp, nil)
	defer spider.PutContext(ctx)

	form := NewForm(ctx, "r1", "https://example.com/", formSel)
	if form == nil {
		t.Fatal("NewForm returned nil")
	}
	if method := form.Method(); method != "POST" {
		t.Errorf("Method() = %q, want POST", method)
	}
	form.Submit()
}
// TestNewForm_Multipart builds a Form from the fixture and submits it; it
// only requires that NewForm returns a non-nil form.
func TestNewForm_Multipart(t *testing.T) {
	html := ``
	parsed := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parsed.IsErr() {
		t.Fatalf("parse html: %v", parsed.UnwrapErr())
	}
	formSel := parsed.Unwrap().Find("form").First()

	sp := &spider.Spider{RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}}}
	ctx := spider.GetContext(sp, nil)
	defer spider.PutContext(ctx)

	form := NewForm(ctx, "r1", "https://example.com/", formSel)
	if form == nil {
		t.Fatal("NewForm returned nil")
	}
	form.Submit()
}
// TestNewForm_NoSchemeAndHost builds a Form without passing schemeAndHost and
// expects the action to be resolved against the page URL's scheme and host.
func TestNewForm_NoSchemeAndHost(t *testing.T) {
	html := ``
	parsed := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parsed.IsErr() {
		t.Fatalf("parse html: %v", parsed.UnwrapErr())
	}
	formSel := parsed.Unwrap().Find("form").First()

	sp := &spider.Spider{RuleTree: &spider.RuleTree{Trunk: map[string]*spider.Rule{}}}
	ctx := spider.GetContext(sp, nil)
	defer spider.PutContext(ctx)

	form := NewForm(ctx, "r1", "https://example.com/page", formSel)
	if form == nil {
		t.Fatal("NewForm returned nil")
	}
	if action := form.Action(); action != "https://example.com/path" {
		t.Errorf("Action() = %q", action)
	}
}
================================================
FILE: app/spider/context.go
================================================
package spider
import (
"bytes"
"io"
"mime"
"net/http"
"path"
"strings"
"sync"
"time"
"unsafe"
"golang.org/x/net/html/charset"
"github.com/andeya/pholcus/app/downloader/request"
"github.com/andeya/pholcus/app/pipeline/collector/data"
"github.com/andeya/pholcus/common/goquery"
"github.com/andeya/pholcus/common/util"
"github.com/andeya/pholcus/logs"
)
// Context carries the state for a single crawl request through its lifecycle.
// Instances are recycled through contextPool: GetContext hands one out and
// PutContext resets it and returns it to the pool.
type Context struct {
spider *Spider // spider this context is bound to (set by GetContext)
Request *request.Request // request being processed (set by GetContext)
Response *http.Response // URL is copied from *request.Request
text []byte // response body as raw bytes
dom *goquery.Document // parsed HTML DOM (lazy-initialized)
items []data.DataCell // collected text output results
files []data.FileCell // collected file output results
err error // error set by SetError, GetDom or GetText
sync.Mutex // held while items/files are appended to or drained
}
var (
// contextPool recycles Context values. New contexts start with empty,
// non-nil items and files slices.
contextPool = &sync.Pool{
New: func() interface{} {
return &Context{
items: []data.DataCell{},
files: []data.FileCell{},
}
},
}
)
// --- Initialization ---
// GetContext takes a Context from the pool and attaches the given spider and
// request to it.
func GetContext(sp *Spider, req *request.Request) *Context {
	pooled := contextPool.Get().(*Context)
	pooled.spider, pooled.Request = sp, req
	return pooled
}
// PutContext clears every field of ctx and hands it back to the pool.
// A still-attached response body is closed first. The items and files slices
// are truncated rather than reallocated so their capacity is reused.
func PutContext(ctx *Context) {
	if resp := ctx.Response; resp != nil {
		if resp.Body != nil {
			resp.Body.Close()
		}
		ctx.Response = nil
	}
	ctx.items, ctx.files = ctx.items[:0], ctx.files[:0]
	ctx.spider, ctx.Request = nil, nil
	ctx.text, ctx.dom, ctx.err = nil, nil, nil
	contextPool.Put(ctx)
}
// SetResponse binds the HTTP response to this context and returns the context
// for chaining.
func (ctx *Context) SetResponse(resp *http.Response) *Context {
ctx.Response = resp
return ctx
}
// SetError marks a download error on this context. The stored error is
// returned by GetError unless the spider is stopping.
func (ctx *Context) SetError(err error) {
ctx.err = err
}
// --- Public Set/Exec Methods ---
// AddQueue prepares req and pushes it onto the spider's request queue.
// Nothing is queued when the spider is stopping or when Prepare fails (the
// failure is logged).
//
// Required fields: Request.URL, Request.Rule.
// Request.Spider is set automatically; Request.EnableCookie is inherited from Spider.
//
// Fields with defaults (may be omitted):
//   - Method: GET
//   - DialTimeout: request.DefaultDialTimeout (negative = unlimited)
//   - ConnTimeout: request.DefaultConnTimeout (negative = unlimited)
//   - TryTimes: request.DefaultTryTimes (negative = unlimited retries)
//   - RedirectTimes: unlimited by default (negative = disable redirects)
//   - RetryPause: request.DefaultRetryPause
//   - DownloaderID: 0 = Surf (fast, full-featured), 1 = PhantomJS (slow, JS-capable)
//
// When the request has no Referer and this context holds a response, the
// Referer is filled with the context's URL.
func (ctx *Context) AddQueue(req *request.Request) *Context {
	if stopErr := ctx.spider.tryStop(); stopErr != nil {
		return ctx
	}

	if prepared := req.
		SetSpiderName(ctx.spider.GetName()).
		SetEnableCookie(ctx.spider.GetEnableCookie()).
		Prepare(); prepared.IsErr() {
		logs.Log().Error(prepared.UnwrapErr().Error())
		return ctx
	}

	needsReferer := req.GetReferer() == "" && ctx.Response != nil
	if needsReferer {
		req.SetReferer(ctx.GetURL())
	}

	ctx.spider.RequestPush(req)
	return ctx
}
// jsToInt64 converts a dynamically typed value to int64.
// int, int64 and float64 are accepted (float64 is truncated by the int64
// conversion); any other type, including nil, yields (0, false).
func jsToInt64(v interface{}) (int64, bool) {
	var (
		out int64
		ok  = true
	)
	switch val := v.(type) {
	case int:
		out = int64(val)
	case int64:
		out = val
	case float64:
		out = int64(val)
	default:
		ok = false
	}
	return out, ok
}
// JsAddQueue builds a request from a dynamic (JavaScript) rule definition and
// pushes it onto the spider's request queue.
//
// jreq is keyed by request field name. "URL" must be a string, otherwise the
// call is a no-op. Other keys are optional and are ignored when their value
// has an unexpected type. Numeric fields accept the types handled by
// jsToInt64. "Header" is a map whose values may be a []string, a single
// string, or a []interface{} of strings.
//
// Nothing is queued when the spider is stopping or when Prepare fails (the
// failure is logged). When the request has no Referer and this context holds
// a response, the Referer is filled with the context's URL.
func (ctx *Context) JsAddQueue(jreq map[string]interface{}) *Context {
	if ctx.spider.tryStop() != nil {
		return ctx
	}

	req := &request.Request{}
	u, ok := jreq["URL"].(string)
	if !ok {
		return ctx
	}
	req.URL = u
	req.Rule, _ = jreq["Rule"].(string)
	req.Method, _ = jreq["Method"].(string)

	req.Header = http.Header{}
	if header, ok := jreq["Header"].(map[string]interface{}); ok {
		for k, raw := range header {
			for _, v := range jsHeaderValues(raw) {
				req.Header.Add(k, v)
			}
		}
	}
	req.PostData, _ = jreq["PostData"].(string)
	req.Reloadable, _ = jreq["Reloadable"].(bool)

	if t, ok := jsToInt64(jreq["DialTimeout"]); ok {
		req.DialTimeout = time.Duration(t)
	}
	if t, ok := jsToInt64(jreq["ConnTimeout"]); ok {
		req.ConnTimeout = time.Duration(t)
	}
	if t, ok := jsToInt64(jreq["RetryPause"]); ok {
		req.RetryPause = time.Duration(t)
	}
	if t, ok := jsToInt64(jreq["TryTimes"]); ok {
		req.TryTimes = int(t)
	}
	if t, ok := jsToInt64(jreq["RedirectTimes"]); ok {
		req.RedirectTimes = int(t)
	}
	if t, ok := jsToInt64(jreq["Priority"]); ok {
		req.Priority = int(t)
	}
	if t, ok := jsToInt64(jreq["DownloaderID"]); ok {
		req.DownloaderID = int(t)
	}
	if t, ok := jreq["Temp"].(map[string]interface{}); ok {
		req.Temp = t
	}

	prepareResult := req.
		SetSpiderName(ctx.spider.GetName()).
		SetEnableCookie(ctx.spider.GetEnableCookie()).
		Prepare()
	if prepareResult.IsErr() {
		logs.Log().Error(prepareResult.UnwrapErr().Error())
		return ctx
	}

	if req.GetReferer() == "" && ctx.Response != nil {
		req.SetReferer(ctx.GetURL())
	}

	ctx.spider.RequestPush(req)
	return ctx
}

// jsHeaderValues normalizes one dynamically typed header value into a list of
// strings. Non-string elements of a []interface{} are skipped; unsupported
// types yield nil.
func jsHeaderValues(raw interface{}) []string {
	switch vals := raw.(type) {
	case []string:
		return vals
	case string:
		return []string{vals}
	case []interface{}:
		out := make([]string, 0, len(vals))
		for _, v := range vals {
			if s, ok := v.(string); ok {
				out = append(out, s)
			}
		}
		return out
	}
	return nil
}
// Output records one text result item on the context.
//
// A map[int]interface{} item is converted through CreateItem, which names
// each value using the existing ItemFields of the rule. For a
// map[string]interface{} (or request.Temp) item every key is upserted as an
// ItemField of the rule. An empty ruleName defaults to the current rule; an
// unknown rule is logged and the item is dropped.
//
// Unless the spider sets NotDefaultField, the item is stored together with
// the context's URL, its Referer and the current time.
func (ctx *Context) Output(item interface{}, ruleName ...string) {
	name, rule, found := ctx.getRule(ruleName...)
	if !found {
		logs.Log().Error("spider %s: Output() called with non-existent rule name", ctx.spider.GetName())
		return
	}

	var fields map[string]interface{}
	switch typed := item.(type) {
	case map[int]interface{}:
		fields = ctx.CreateItem(typed, name)
	case request.Temp:
		for key := range typed {
			ctx.spider.UpsertItemField(rule, key)
		}
		fields = typed
	case map[string]interface{}:
		for key := range typed {
			ctx.spider.UpsertItemField(rule, key)
		}
		fields = typed
	}

	ctx.Lock()
	defer ctx.Unlock()

	var pageURL, referer, stamp string
	if !ctx.spider.NotDefaultField {
		pageURL = ctx.GetURL()
		referer = ctx.GetReferer()
		stamp = time.Now().Format("2006-01-02 15:04:05")
	}
	ctx.items = append(ctx.items, data.GetDataCell(name, fields, pageURL, referer, stamp))
}
// FileOutput reads the whole response body and records it as a file result.
//
// nameOrExt optionally gives a file name and/or extension. Whatever part it
// leaves out is taken from the last path segment of the context's URL (query
// string removed); when no extension can be found, ".html" is used.
// Errors are logged internally; no return value for JS VM compatibility.
func (ctx *Context) FileOutput(nameOrExt ...string) {
	if ctx.Response == nil || ctx.Response.Body == nil {
		logs.Log().Warning(" * [FileOutput]: Response or Body is nil for %s", ctx.GetURL())
		return
	}
	content, readErr := io.ReadAll(ctx.Response.Body)
	ctx.Response.Body.Close()
	if readErr != nil {
		logs.Log().Error(" * [FileOutput]: %v", readErr)
		return
	}

	// Name and extension implied by the URL.
	_, lastSegment := path.Split(ctx.GetURL())
	urlName, _, _ := strings.Cut(lastSegment, "?")
	urlExt := path.Ext(urlName)

	// Name and extension requested by the caller, if any.
	var baseName, ext string
	if len(nameOrExt) > 0 {
		dir, file := path.Split(nameOrExt[0])
		ext = path.Ext(file)
		if stem := strings.TrimSuffix(file, ext); stem != "" {
			baseName = dir + stem
		}
	}

	if baseName == "" {
		baseName = strings.TrimSuffix(urlName, urlExt)
	}
	switch {
	case ext != "":
		// The caller supplied the extension.
	case urlExt != "":
		ext = urlExt
	default:
		ext = ".html"
	}

	ctx.Lock()
	defer ctx.Unlock()
	ctx.files = append(ctx.files, data.GetFileCell(ctx.GetRuleName(), baseName+ext, content))
}
// CreateItem turns an index-keyed item into a name-keyed one, looking up each
// index in the ItemFields of ruleName. An empty ruleName defaults to the
// current rule. An unknown rule is logged and nil is returned.
func (ctx *Context) CreateItem(item map[int]interface{}, ruleName ...string) map[string]interface{} {
	_, rule, found := ctx.getRule(ruleName...)
	if !found {
		logs.Log().Error("spider %s: CreateItem() called with non-existent rule name", ctx.spider.GetName())
		return nil
	}
	named := make(map[string]interface{}, len(item))
	for idx, val := range item {
		named[ctx.spider.GetItemField(rule, idx)] = val
	}
	return named
}
// SetTemp stores temporary data in the current request under key and returns
// the context for chaining.
func (ctx *Context) SetTemp(key string, value interface{}) *Context {
ctx.Request.SetTemp(key, value)
return ctx
}
// SetURL overwrites the URL of the current request and returns the context
// for chaining.
func (ctx *Context) SetURL(url string) *Context {
ctx.Request.URL = url
return ctx
}
// SetReferer sets the Referer header of the current request and returns the
// context for chaining. The request's header map is created first when it is
// still nil, because writing to a nil http.Header panics.
func (ctx *Context) SetReferer(referer string) *Context {
	if ctx.Request.Header == nil {
		ctx.Request.Header = make(http.Header)
	}
	ctx.Request.Header.Set("Referer", referer)
	return ctx
}
// UpsertItemField registers field as a result field of the given rule and
// returns its index; an already registered field keeps its existing index.
// An empty ruleName defaults to the current rule. An unknown rule is logged
// and 0 is returned.
func (ctx *Context) UpsertItemField(field string, ruleName ...string) (index int) {
	if _, rule, found := ctx.getRule(ruleName...); found {
		return ctx.spider.UpsertItemField(rule, field)
	}
	logs.Log().Error("spider %s: UpsertItemField() called with non-existent rule name", ctx.spider.GetName())
	return 0
}
// Aid invokes the AidFunc of the specified rule and returns its result.
// An empty ruleName defaults to the current rule.
//
// nil is returned when the spider is stopping, when the rule cannot be
// resolved, or when the rule has no AidFunc; the latter two cases are logged.
func (ctx *Context) Aid(aid map[string]interface{}, ruleName ...string) interface{} {
	if ctx.spider.tryStop() != nil {
		return nil
	}

	name, rule, found := ctx.getRule(ruleName...)
	if !found {
		if len(ruleName) > 0 {
			logs.Log().Error("spider %s: Aid() called with non-existent rule: %s", ctx.spider.GetName(), ruleName[0])
		} else {
			logs.Log().Error("spider %s: Aid() called without specifying a rule name", ctx.spider.GetName())
		}
		return nil
	}
	if rule.AidFunc == nil {
		// Log the resolved name: ruleName may be empty when the current rule
		// was used, so indexing it here would panic.
		logs.Log().Error("spider %s: rule %s has no AidFunc defined", ctx.spider.GetName(), name)
		return nil
	}
	return rule.AidFunc(ctx, aid)
}
// Parse dispatches the response to the ParseFunc of the specified rule and
// returns the context for chaining.
//
// When the rule cannot be resolved, the rule tree's Root function is run
// instead. When this context holds a response, the request's rule name is
// updated to the resolved name before dispatching. Nothing runs when the
// spider is stopping; a rule without a ParseFunc is logged and skipped.
func (ctx *Context) Parse(ruleName ...string) *Context {
	if ctx.spider.tryStop() != nil {
		return ctx
	}

	name, rule, found := ctx.getRule(ruleName...)
	if ctx.Response != nil {
		ctx.Request.SetRuleName(name)
	}
	if !found {
		ctx.spider.RuleTree.Root(ctx)
		return ctx
	}
	if rule.ParseFunc == nil {
		// Log the resolved name: ruleName may be empty when the current rule
		// was used, so indexing it here would panic.
		logs.Log().Error("spider %s: rule %s has no ParseFunc defined", ctx.spider.GetName(), name)
		return ctx
	}
	rule.ParseFunc(ctx)
	return ctx
}
// SetKeyin sets the custom keyword/configuration input on the spider and
// returns the context for chaining.
func (ctx *Context) SetKeyin(keyin string) *Context {
ctx.spider.SetKeyin(keyin)
return ctx
}
// SetLimit sets the maximum number of items to crawl on the spider (max is
// converted to int64) and returns the context for chaining.
func (ctx *Context) SetLimit(max int) *Context {
ctx.spider.SetLimit(int64(max))
return ctx
}
// SetPausetime sets a custom pause interval (randomized: pause/2 ~ pause*2).
// Overrides the externally configured value. Only overwrites an existing value when runtime[0] is true.
// The call is forwarded to the spider; the context is returned for chaining.
func (ctx *Context) SetPausetime(pause int64, runtime ...bool) *Context {
ctx.spider.SetPausetime(pause, runtime...)
return ctx
}
// SetTimer configures a timer identified by id.
// When bell is nil, tol is a sleep duration (countdown timer).
// When bell is non-nil, tol specifies the wake-up point (the tol-th bell occurrence from now).
// The call is forwarded to the spider and its result is returned.
func (ctx *Context) SetTimer(id string, tol time.Duration, bell *Bell) bool {
return ctx.spider.SetTimer(id, tol, bell)
}
// RunTimer starts the timer identified by id and reports whether it can
// continue to be used. The call is forwarded to the spider.
func (ctx *Context) RunTimer(id string) bool {
return ctx.spider.RunTimer(id)
}
// ResetText replaces the downloaded text content and invalidates the DOM
// cache. It returns the context for chaining.
//
// For a non-empty body no copy is made: the stored bytes share the string's
// memory and must be treated as read-only. An empty body is stored as a
// non-nil empty slice, because GetText and initDom treat a nil ctx.text as
// "not loaded yet" and would otherwise load the response body again.
func (ctx *Context) ResetText(body string) *Context {
	ctx.dom = nil
	if body == "" {
		ctx.text = []byte{}
		return ctx
	}
	// Build a slice header (data, len, cap) from the string header (data, len).
	x := (*[2]uintptr)(unsafe.Pointer(&body))
	h := [3]uintptr{x[0], x[1], x[1]}
	ctx.text = *(*[]byte)(unsafe.Pointer(&h))
	return ctx
}
// --- Public Get Methods ---
// GetError returns the download error, or the spider's stop error if stopping.
// The stop error takes precedence over any error stored on the context.
func (ctx *Context) GetError() error {
if err := ctx.spider.tryStop(); err != nil {
return err
}
return ctx.err
}
// Log returns the global logger instance (the value of logs.Log()).
// It does not use the receiver.
func (*Context) Log() logs.Logs {
return logs.Log()
}
// GetSpider returns the spider bound to this context.
// It is nil after the context has been passed to PutContext.
func (ctx *Context) GetSpider() *Spider {
return ctx.spider
}
// GetResponse returns the HTTP response bound to this context, which may be
// nil when none has been set.
func (ctx *Context) GetResponse() *http.Response {
return ctx.Response
}
// GetStatusCode reports the status code of the bound HTTP response.
// It reports 0 when the context has no response.
func (ctx *Context) GetStatusCode() int {
	if resp := ctx.Response; resp != nil {
		return resp.StatusCode
	}
	return 0
}
// GetRequest returns the original request bound to this context (not a copy).
func (ctx *Context) GetRequest() *request.Request {
return ctx.Request
}
// CopyRequest returns a deep copy of the original request.
// NOTE(review): the result of Copy is unwrapped without an error check;
// presumably Unwrap panics when the copy failed — confirm.
func (ctx *Context) CopyRequest() *request.Request {
return ctx.Request.Copy().Unwrap()
}
// GetItemFields lists the result field names of the given rule.
// An empty ruleName is resolved through getRule like in the other accessors.
// An unknown rule is logged and nil is returned.
func (ctx *Context) GetItemFields(ruleName ...string) []string {
	if _, rule, found := ctx.getRule(ruleName...); found {
		return ctx.spider.GetItemFields(rule)
	}
	logs.Log().Error("spider %s: GetItemFields() called with non-existent rule name", ctx.spider.GetName())
	return nil
}
// GetItemField looks up the result field name stored at index for the given
// rule. An empty ruleName defaults to the current rule. An unknown rule is
// logged and "" is returned.
func (ctx *Context) GetItemField(index int, ruleName ...string) (field string) {
	if _, rule, found := ctx.getRule(ruleName...); found {
		return ctx.spider.GetItemField(rule, index)
	}
	logs.Log().Error("spider %s: GetItemField() called with non-existent rule name", ctx.spider.GetName())
	return ""
}
// GetItemFieldIndex returns the index of the given field name, or -1 if not found.
// An empty ruleName defaults to the current rule.
// An unknown rule is logged and also yields -1, so that the result is never
// mistaken for the valid index 0.
func (ctx *Context) GetItemFieldIndex(field string, ruleName ...string) (index int) {
	_, rule, found := ctx.getRule(ruleName...)
	if !found {
		logs.Log().Error("spider %s: GetItemFieldIndex() called with non-existent rule name", ctx.spider.GetName())
		return -1
	}
	return ctx.spider.GetItemFieldIndex(rule, field)
}
// PullItems hands over every data item collected so far and leaves the
// context with a fresh empty buffer. The swap happens under the context lock.
func (ctx *Context) PullItems() (ds []data.DataCell) {
	ctx.Lock()
	defer ctx.Unlock()
	ds, ctx.items = ctx.items, []data.DataCell{}
	return ds
}
// PullFiles hands over every file result collected so far and leaves the
// context with a fresh empty buffer. The swap happens under the context lock.
func (ctx *Context) PullFiles() (fs []data.FileCell) {
	ctx.Lock()
	defer ctx.Unlock()
	fs, ctx.files = ctx.files, []data.FileCell{}
	return fs
}
// GetKeyin returns the custom keyword/configuration input of the spider.
func (ctx *Context) GetKeyin() string {
return ctx.spider.GetKeyin()
}
// GetLimit returns the maximum number of items to crawl, converting the
// spider's limit to int.
func (ctx *Context) GetLimit() int {
return int(ctx.spider.GetLimit())
}
// GetName returns the name of the spider bound to this context.
func (ctx *Context) GetName() string {
return ctx.spider.GetName()
}
// GetRules returns the spider's full rule map, keyed by string.
func (ctx *Context) GetRules() map[string]*Rule {
return ctx.spider.GetRules()
}
// GetRule returns the spider's rule with the given name.
// getRule treats a nil result as "rule not found".
func (ctx *Context) GetRule(ruleName string) *Rule {
return ctx.spider.GetRule(ruleName)
}
// GetRuleName returns the current rule name stored on the request.
func (ctx *Context) GetRuleName() string {
return ctx.Request.GetRuleName()
}
// GetTemp retrieves temporary data from the request by key.
// defaultValue must not be a nil interface{}.
// The lookup is delegated to the request's GetTemp.
func (ctx *Context) GetTemp(key string, defaultValue interface{}) interface{} {
return ctx.Request.GetTemp(key, defaultValue)
}
// GetTemps returns all temporary data from the request.
// Use CopyTemps when an independent copy is needed.
func (ctx *Context) GetTemps() request.Temp {
return ctx.Request.GetTemps()
}
// CopyTemps builds a new request.Temp holding the same key/value pairs as the
// request's temporary data. The copy is shallow: values are not cloned.
func (ctx *Context) CopyTemps() request.Temp {
	src := ctx.Request.GetTemps()
	dup := make(request.Temp, len(src))
	for key, val := range src {
		dup[key] = val
	}
	return dup
}
// GetURL returns the URL from the original request, preserving the unencoded form.
// It reads Request.URL, not the URL of the response.
func (ctx *Context) GetURL() string {
return ctx.Request.URL
}
// GetMethod returns the HTTP method of the original request.
func (ctx *Context) GetMethod() string {
return ctx.Request.GetMethod()
}
// GetHost reports the host of the URL of the HTTP request attached to the
// response. It reports "" when the response, its request, or that request's
// URL is missing.
func (ctx *Context) GetHost() string {
	resp := ctx.Response
	switch {
	case resp == nil, resp.Request == nil, resp.Request.URL == nil:
		return ""
	}
	return resp.Request.URL.Host
}
// GetHeader gives the headers of the bound response, or a new empty header
// set when the context has no response.
func (ctx *Context) GetHeader() http.Header {
	if resp := ctx.Response; resp != nil {
		return resp.Header
	}
	return http.Header{}
}
// GetRequestHeader gives the headers of the HTTP request attached to the
// response, or a new empty header set when the response or its request is
// missing.
func (ctx *Context) GetRequestHeader() http.Header {
	if resp := ctx.Response; resp != nil && resp.Request != nil {
		return resp.Request.Header
	}
	return http.Header{}
}
// GetReferer reads the Referer header of the HTTP request attached to the
// response. It reports "" when the response or its request is missing.
func (ctx *Context) GetReferer() string {
	if resp := ctx.Response; resp != nil && resp.Request != nil {
		return resp.Request.Header.Get("Referer")
	}
	return ""
}
// GetCookie returns the response's "Set-Cookie" header, or "" when there is no
// response. The value is read with Header.Get, so only the first Set-Cookie
// value is returned when the response carries several.
func (ctx *Context) GetCookie() string {
	if resp := ctx.Response; resp != nil {
		return resp.Header.Get("Set-Cookie")
	}
	return ""
}
// GetDom returns the parsed document for the response, building it on first
// use via initDom and reusing ctx.dom afterwards.
// It returns nil (after logging) when there is no response or when initDom
// fails; in the failure case the error is also stored in ctx.err.
func (ctx *Context) GetDom() *goquery.Document {
	// Fast path: the document was already built by an earlier call.
	if ctx.dom != nil {
		return ctx.dom
	}
	if ctx.Response == nil {
		logs.Log().Warning(" * [GetDom]: Response is nil for %s", ctx.GetURL())
		return nil
	}
	doc, parseErr := ctx.initDom()
	if parseErr == nil {
		return doc
	}
	ctx.err = parseErr
	logs.Log().Error(" * [GetDom][%s]: %v", ctx.GetURL(), parseErr)
	return nil
}
// GetText returns the response body text, loading it on first use via initText
// and reusing ctx.text afterwards.
// It returns "" (after logging) when there is no response or when initText
// fails; in the failure case the error is also stored in ctx.err.
func (ctx *Context) GetText() string {
	// Fast path: the body was already read by an earlier call.
	if ctx.text != nil {
		return util.Bytes2String(ctx.text)
	}
	if ctx.Response == nil {
		logs.Log().Warning(" * [GetText]: Response is nil for %s", ctx.GetURL())
		return ""
	}
	readErr := ctx.initText()
	if readErr != nil {
		ctx.err = readErr
		logs.Log().Error(" * [GetText][%s]: %v", ctx.GetURL(), readErr)
		return ""
	}
	return util.Bytes2String(ctx.text)
}
// --- Private Methods ---
// getRule resolves a rule on the context's spider.
// With an explicit argument, the first element of ruleName is the name looked
// up. Without one, the current request's rule name is used, unless the context
// has no response, in which case zero values ("", nil, false) are returned.
// found reports whether the spider returned a non-nil rule.
func (ctx *Context) getRule(ruleName ...string) (name string, rule *Rule, found bool) {
	switch {
	case len(ruleName) > 0:
		name = ruleName[0]
	case ctx.Response == nil:
		return "", nil, false
	default:
		name = ctx.GetRuleName()
	}
	rule = ctx.spider.GetRule(name)
	found = rule != nil
	return name, rule, found
}
// initDom builds a goquery document from ctx.text, first loading the text via
// initText when it has not been read yet. On success the document is cached in
// ctx.dom and returned; on failure ctx.dom is left untouched.
func (ctx *Context) initDom() (*goquery.Document, error) {
	if ctx.text == nil {
		err := ctx.initText()
		if err != nil {
			return nil, err
		}
	}
	parsed := goquery.NewDocumentFromReader(bytes.NewReader(ctx.text))
	if !parsed.IsErr() {
		ctx.dom = parsed.Unwrap()
		return ctx.dom, nil
	}
	return nil, parsed.UnwrapErr()
}
// initText reads the whole response body into ctx.text and closes the body.
//
// A charset label is derived from the response's Content-Type header, falling
// back to the request's. Transcoding through convertEncoding is attempted only
// when the request's DownloaderID is request.SurfID and that label is not a
// UTF-8 label. If transcoding fails, a warning is logged and the raw bytes are
// kept. Only a body read failure is returned as an error.
func (ctx *Context) initText() error {
	raw, readErr := io.ReadAll(ctx.Response.Body)
	// The body is closed whether or not the read succeeded.
	ctx.Response.Body.Close()
	if readErr != nil {
		return readErr
	}

	label := detectCharset(
		ctx.Response.Header.Get("Content-Type"),
		ctx.Request.Header.Get("Content-Type"),
	)
	needsTranscode := ctx.Request.DownloaderID == request.SurfID && !isUTF8(label)
	if !needsTranscode {
		ctx.text = raw
		return nil
	}

	transcoded, convErr := convertEncoding(raw, label)
	if convErr != nil {
		logs.Log().Warning(" * [convert][%v]: %v (ignore transcoding)\n", ctx.GetURL(), convErr)
		ctx.text = raw
		return nil
	}
	ctx.text = transcoded
	return nil
}
// detectCharset returns the charset parameter of the first Content-Type value
// that parses as a media type and carries one, checking the response value
// before the request value. The result is trimmed and lower-cased; "" means
// neither value yielded a charset.
func detectCharset(responseContentType, requestContentType string) string {
	candidates := [2]string{responseContentType, requestContentType}
	for i := range candidates {
		_, params, err := mime.ParseMediaType(candidates[i])
		if err != nil {
			// Unparseable (or empty) header: try the next candidate.
			continue
		}
		label, present := params["charset"]
		if !present {
			continue
		}
		return strings.ToLower(strings.TrimSpace(label))
	}
	return ""
}
// isUTF8 reports whether charset is exactly one of the labels "utf8", "utf-8"
// or "unicode-1-1-utf-8". The comparison is case-sensitive and does no
// trimming, so callers are expected to pass a normalized label.
func isUTF8(charset string) bool {
	return charset == "utf8" ||
		charset == "utf-8" ||
		charset == "unicode-1-1-utf-8"
}
// convertEncoding decodes body through a reader obtained from the charset
// package and returns everything that reader yields.
// A non-empty charsetLabel selects the reader via charset.NewReaderLabel; an
// empty label uses charset.NewReader with an empty content type instead
// (presumably leaving detection to the library — confirm against its docs).
// An error from creating the reader or from reading it is returned as-is.
func convertEncoding(body []byte, charsetLabel string) ([]byte, error) {
	var (
		decoded io.Reader
		err     error
	)
	src := bytes.NewReader(body)
	switch charsetLabel {
	case "":
		decoded, err = charset.NewReader(src, "")
	default:
		decoded, err = charset.NewReaderLabel(charsetLabel, src)
	}
	if err != nil {
		return nil, err
	}
	return io.ReadAll(decoded)
}
================================================
FILE: app/spider/parsejs.go
================================================
package spider
import (
"encoding/xml"
"log"
"os"
"path"
"path/filepath"
"regexp"
"runtime/debug"
"strings"
"sync"
"github.com/robertkrimen/otto"
"github.com/andeya/gust/result"
"github.com/andeya/pholcus/config"
"github.com/andeya/pholcus/logs"
)
var (
	// scriptTagRe is the compiled pattern `(?s)()`: the (?s) flag followed by a
	// single empty capture group.
	// NOTE(review): an empty group matches only the empty string, so this
	// pattern looks truncated (e.g. tag markup lost in extraction) — confirm
	// against the repository copy before relying on it.
	scriptTagRe = regexp.MustCompile(`(?s)()`)
	// registerDynOnce is the sync.Once that RegisterDynamicSpiders uses to run
	// doRegisterDynamicSpiders at most once.
	registerDynOnce sync.Once
)
// evalScript evaluates script on vm and wraps the outcome in a result.Result.
//
// The deferred r.Catch() is registered before vm.Eval runs, and it operates on
// the named return value r.
// NOTE(review): result.Ret(val, err).Unwrap() presumably panics when err is
// non-nil so that Catch can turn it into an error Result — confirm against the
// gust result package documentation.
func evalScript(vm *otto.Otto, script string) (r result.Result[otto.Value]) {
defer r.Catch()
val, err := vm.Eval(script)
result.Ret(val, err).Unwrap()
// Reached with the evaluated value when Unwrap above did not abort the call.
return result.Ok(val)
}
// SpiderModle is the XML model for a dynamic (JavaScript-based) spider.
// Scalar settings are read from child elements of the same name; each script
// field is read from a <Script> element nested inside the named parent.
type SpiderModle struct {
	Name            string `xml:"Name"`
	Description     string `xml:"Description"`
	Pausetime       int64  `xml:"Pausetime"`
	EnableLimit     bool   `xml:"EnableLimit"`
	EnableKeyin     bool   `xml:"EnableKeyin"`
	EnableCookie    bool   `xml:"EnableCookie"`
	NotDefaultField bool   `xml:"NotDefaultField"`
	// Script sources, one per hook.
	Namespace    string `xml:"Namespace>Script"`
	SubNamespace string `xml:"SubNamespace>Script"`
	Root         string `xml:"Root>Script"`
	// Trunk collects every <Rule> child element.
	Trunk []RuleModle `xml:"Rule"`
}

// RuleModle is the XML model for one <Rule> node of a dynamic spider.
// Name comes from the node's "name" attribute; ParseFunc and AidFunc hold the
// text of the <Script> element nested in the element of the same name.
type RuleModle struct {
	Name      string `xml:"name,attr"`
	ParseFunc string `xml:"ParseFunc>Script"`
	AidFunc   string `xml:"AidFunc>Script"`
}
// RegisterDynamicSpiders registers the dynamic (JS-based) spiders by running
// doRegisterDynamicSpiders through registerDynOnce.
// It may be called repeatedly; only the first call does the registration work.
func RegisterDynamicSpiders() {
	registerDynOnce.Do(func() {
		doRegisterDynamicSpiders()
	})
}
// doRegisterDynamicSpiders builds one Spider per model returned by
// getSpiderModles and registers it.
//
// For every model it copies the scalar settings, sets Limit/Keyin when the
// model enables them, and wires each script (Namespace, SubNamespace, Root and
// every Rule's ParseFunc/AidFunc) to a closure that evaluates the script in a
// fresh otto VM. Script failures are logged and never propagated to the caller.
func doRegisterDynamicSpiders() {
	for _, _m := range getSpiderModles() {
		// Per-iteration copy so the closures below do not share the loop
		// variable (required before Go 1.22, harmless afterwards).
		m := _m
		var sp = &Spider{
			Name:            m.Name,
			Description:     m.Description,
			Pausetime:       m.Pausetime,
			EnableCookie:    m.EnableCookie,
			NotDefaultField: m.NotDefaultField,
			RuleTree:        &RuleTree{Trunk: map[string]*Rule{}},
		}
		if m.EnableLimit {
			sp.Limit = LIMIT
		}
		if m.EnableKeyin {
			sp.Keyin = KEYIN
		}

		// Namespace hook: only installed when the model provides a script.
		if m.Namespace != "" {
			sp.Namespace = func(self *Spider) string {
				vm := otto.New()
				vm.Set("self", self)
				r := evalScript(vm, m.Namespace)
				if r.IsErr() {
					logs.Log().Error(" * dynamic rule [Namespace]: %v\n", r.UnwrapErr())
					return ""
				}
				s, err := r.Unwrap().ToString()
				if err != nil {
					// Log the conversion failure instead of dropping it; the
					// returned string is unchanged from the previous behavior.
					logs.Log().Error(" * dynamic rule [Namespace]: %v\n", err)
				}
				return s
			}
		}

		// SubNamespace hook: only installed when the model provides a script.
		if m.SubNamespace != "" {
			sp.SubNamespace = func(self *Spider, dataCell map[string]interface{}) string {
				vm := otto.New()
				vm.Set("self", self)
				vm.Set("dataCell", dataCell)
				r := evalScript(vm, m.SubNamespace)
				if r.IsErr() {
					logs.Log().Error(" * dynamic rule [SubNamespace]: %v\n", r.UnwrapErr())
					return ""
				}
				s, err := r.Unwrap().ToString()
				if err != nil {
					// Log the conversion failure instead of dropping it; the
					// returned string is unchanged from the previous behavior.
					logs.Log().Error(" * dynamic rule [SubNamespace]: %v\n", err)
				}
				return s
			}
		}

		// Root is always installed, even when the model's Root script is empty.
		sp.RuleTree.Root = func(ctx *Context) {
			vm := otto.New()
			vm.Set("ctx", ctx)
			r := evalScript(vm, m.Root)
			if r.IsErr() {
				logs.Log().Error(" * dynamic rule [Root]: %v\n", r.UnwrapErr())
			}
		}

		for _, rule := range m.Trunk {
			r := new(Rule)
			// The script text is passed as an argument so each closure owns
			// its own copy rather than referring to the loop variable.
			r.ParseFunc = func(parse string) func(*Context) {
				return func(ctx *Context) {
					vm := otto.New()
					vm.Set("ctx", ctx)
					ev := evalScript(vm, parse)
					if ev.IsErr() {
						logs.Log().Error(" * dynamic rule [ParseFunc]: %v\n", ev.UnwrapErr())
					}
				}
			}(rule.ParseFunc)
			r.AidFunc = func(parse string) func(*Context, map[string]interface{}) interface{} {
				return func(ctx *Context, aid map[string]interface{}) interface{} {
					vm := otto.New()
					vm.Set("ctx", ctx)
					vm.Set("aid", aid)
					ev := evalScript(vm, parse)
					if ev.IsErr() {
						logs.Log().Error(" * dynamic rule [AidFunc]: %v\n", ev.UnwrapErr())
						return nil
					}
					// The evaluated otto value is handed back unconverted.
					return ev.Unwrap()
				}
			}(rule.AidFunc)
			sp.RuleTree.Trunk[rule.Name] = r
		}
		sp.Register()
	}
}
// wrapScriptCDATA wraps
Package testing
Package testing provides support for automated testing of Go packages.
It is intended to be used in concert with the “go test” command, which automates
execution of any function of the form
func TestXxx(*testing.T)
where Xxx can be any alphanumeric string (but the first letter must not be in
[a-z]) and serves to identify the test routine.
These TestXxx routines should be declared within the package they are testing.
Functions of the form
func BenchmarkXxx(*testing.B)
are considered benchmarks, and are executed by the "go test" command when
the -test.bench flag is provided.
A sample benchmark function looks like this:
func BenchmarkHello(b *testing.B) {
for i := 0; i < b.N; i++ {
fmt.Sprintf("hello")
}
}
The benchmark package will vary b.N until the benchmark function lasts
long enough to be timed reliably. The output
testing.BenchmarkHello 10000000 282 ns/op
means that the loop ran 10000000 times at a speed of 282 ns per loop.
If a benchmark needs some expensive setup before running, the timer
may be stopped:
func BenchmarkBigLen(b *testing.B) {
b.StopTimer()
big := NewBig()
b.StartTimer()
for i := 0; i < b.N; i++ {
big.Len()
}
}
The package also runs and verifies example code. Example functions may
include a concluding comment that begins with "Output:" and is compared with
the standard output of the function when the tests are run, as in these
examples of an example:
func ExampleHello() {
fmt.Println("hello")
// Output: hello
}
func ExampleSalutations() {
fmt.Println("hello, and")
fmt.Println("goodbye")
// Output:
// hello, and
// goodbye
}
Example functions without output comments are compiled but not executed.
The naming conventions for declaring examples for a function F, a type T, and
a method M on type T are:
func ExampleF() { ... }
func ExampleT() { ... }
func ExampleT_M() { ... }
Multiple example functions for a type/function/method may be provided by
appending a distinct suffix to the name. The suffix must start with a
lower-case letter.
func ExampleF_suffix() { ... }
func ExampleT_suffix() { ... }
func ExampleT_M_suffix() { ... }
The entire test file is presented as the example when it contains a single
example function, at least one other function, type, variable, or constant
declaration, and no test or benchmark functions.
Index
func Main(matchString func(pat, str string) (bool, error), tests []InternalTest, benchmarks []InternalBenchmark, examples []InternalExample)
func RunBenchmarks(matchString func(pat, str string) (bool, error), benchmarks []InternalBenchmark)
func RunExamples(matchString func(pat, str string) (bool, error), examples []InternalExample) (ok bool)
func RunTests(matchString func(pat, str string) (bool, error), tests []InternalTest) (ok bool)
func Short() bool
type B
func (c *B) Error(args ...interface{})
func (c *B) Errorf(format string, args ...interface{})
func (c *B) Fail()
func (c *B) FailNow()
func (c *B) Failed() bool
func (c *B) Fatal(args ...interface{})
func (c *B) Fatalf(format string, args ...interface{})
func (c *B) Log(args ...interface{})
func (c *B) Logf(format string, args ...interface{})
func (b *B) ResetTimer()
func (b *B) SetBytes(n int64)
func (b *B) StartTimer()
func (b *B) StopTimer()
type BenchmarkResult
func Benchmark(f func(b *B)) BenchmarkResult
func (r BenchmarkResult) NsPerOp() int64
func (r BenchmarkResult) String() string
type InternalBenchmark
type InternalExample
type InternalTest
type T
func (c *T) Error(args ...interface{})
func (c *T) Errorf(format string, args ...interface{})
func (c *T) Fail()
func (c *T) FailNow()
func (c *T) Failed() bool
func (c *T) Fatal(args ...interface{})
func (c *T) Fatalf(format string, args ...interface{})
func (c *T) Log(args ...interface{})
func (c *T) Logf(format string, args ...interface{})
func (t *T) Parallel()
Package files
benchmark.go
example.go
testing.go
func Main(matchString func(pat, str string) (bool, error), tests []InternalTest, benchmarks []InternalBenchmark, examples []InternalExample)
An internal function but exported because it is cross-package; part of the implementation
of the "go test" command.
func RunBenchmarks(matchString func(pat, str string) (bool, error), benchmarks []InternalBenchmark)
An internal function but exported because it is cross-package; part of the implementation
of the "go test" command.
func RunExamples(matchString func(pat, str string) (bool, error), examples []InternalExample) (ok bool)
func RunTests(matchString func(pat, str string) (bool, error), tests []InternalTest) (ok bool)
func Short() bool
Short reports whether the -test.short flag is set.
type B
type B struct {
N int
}
B is a type passed to Benchmark functions to manage benchmark
timing and to specify the number of iterations to run.
func (c *B) Error(args ...interface{})
Error is equivalent to Log() followed by Fail().
func (c *B) Errorf(format string, args ...interface{})
Errorf is equivalent to Logf() followed by Fail().
func (*B) Fail
func (c *B) Fail()
Fail marks the function as having failed but continues execution.
func (c *B) FailNow()
FailNow marks the function as having failed and stops its execution.
Execution will continue at the next test or benchmark.
func (c *B) Failed() bool
Failed returns whether the function has failed.
func (c *B) Fatal(args ...interface{})
Fatal is equivalent to Log() followed by FailNow().
func (c *B) Fatalf(format string, args ...interface{})
Fatalf is equivalent to Logf() followed by FailNow().
func (*B) Log
func (c *B) Log(args ...interface{})
Log formats its arguments using default formatting, analogous to Println(),
and records the text in the error log.
func (*B) Logf
func (c *B) Logf(format string, args ...interface{})
Logf formats its arguments according to the format, analogous to Printf(),
and records the text in the error log.
func (b *B) ResetTimer()
ResetTimer sets the elapsed benchmark time to zero.
It does not affect whether the timer is running.
func (b *B) SetBytes(n int64)
SetBytes records the number of bytes processed in a single operation.
If this is called, the benchmark will report ns/op and MB/s.
func (b *B) StartTimer()
StartTimer starts timing a test. This function is called automatically
before a benchmark starts, but it can also be used to resume timing after
a call to StopTimer.
func (b *B) StopTimer()
StopTimer stops timing a test. This can be used to pause the timer
while performing complex initialization that you don't
want to measure.
type BenchmarkResult struct {
N int
T time.Duration
Bytes int64
}
The results of a benchmark run.
func Benchmark(f func(b *B)) BenchmarkResult
Benchmark benchmarks a single function. Useful for creating
custom benchmarks that do not use the "go test" command.
func (BenchmarkResult) NsPerOp
func (r BenchmarkResult) NsPerOp() int64
func (BenchmarkResult) String
func (r BenchmarkResult) String() string
type InternalBenchmark struct {
Name string
F func(b *B)
}
An internal type but exported because it is cross-package; part of the implementation
of the "go test" command.
type InternalExample struct {
Name string
F func()
Output string
}
type InternalTest struct {
Name string
F func(*T)
}
An internal type but exported because it is cross-package; part of the implementation
of the "go test" command.
type T
type T struct {
}
T is a type passed to Test functions to manage test state and support formatted test logs.
Logs are accumulated during execution and dumped to standard error when done.
func (c *T) Error(args ...interface{})
Error is equivalent to Log() followed by Fail().
func (c *T) Errorf(format string, args ...interface{})
Errorf is equivalent to Logf() followed by Fail().
func (*T) Fail
func (c *T) Fail()
Fail marks the function as having failed but continues execution.
func (c *T) FailNow()
FailNow marks the function as having failed and stops its execution.
Execution will continue at the next test or benchmark.
func (c *T) Failed() bool
Failed returns whether the function has failed.
func (c *T) Fatal(args ...interface{})
Fatal is equivalent to Log() followed by FailNow().
func (c *T) Fatalf(format string, args ...interface{})
Fatalf is equivalent to Logf() followed by FailNow().
func (*T) Log
func (c *T) Log(args ...interface{})
Log formats its arguments using default formatting, analogous to Println(),
and records the text in the error log.
func (*T) Logf
func (c *T) Logf(format string, args ...interface{})
Logf formats its arguments according to the format, analogous to Printf(),
and records the text in the error log.
func (t *T) Parallel()
Parallel signals that this test is to be run in parallel with (and only with)
other parallel tests in this CPU group.
Subdirectories
Name
Synopsis
..
iotest
Package iotest implements Readers and Writers useful mainly for testing.
quick
Package quick implements utility functions to help with black box testing.