Repository: DeanThompson/zhihu-go Branch: master Commit: 97580e6df9fe Files: 17 Total size: 81.5 KB Directory structure: gitextract_l8vbcfjv/ ├── .gitignore ├── LICENSE ├── README.md ├── answer.go ├── collection.go ├── examples/ │ ├── config-example.json │ └── example.go ├── log.go ├── log_test.go ├── question.go ├── question_test.go ├── session.go ├── session_test.go ├── topic.go ├── user.go ├── util.go └── util_test.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Created by .ignore support plugin (hsz.mobi) ### Go template # Compiled Object files, Static and Dynamic libs (Shared Objects) *.o *.a *.so # Folders _obj _test # Architecture specific extensions/prefixes *.[568vq] [568vq].out *.cgo1.go *.cgo2.c _cgo_defun.c _cgo_gotypes.go _cgo_export.* _testmain.go *.exe *.test *.prof verify.gif examples/config.json ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2016 Yangliang Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ zhihu-go:知乎非官方 API 库 with Go ================================= [![GoDoc](https://godoc.org/github.com/DeanThompson/zhihu-go?status.svg)](https://godoc.org/github.com/DeanThompson/zhihu-go) 这是一个非官方的 [知乎](https://www.zhihu.com/) API 库,用 Go 实现。 本项目基本上是把 [zhihu-python](https://github.com/egrcc/zhihu-python) 和 [zhihu-py3](https://github.com/7sDream/zhihu-py3) 从 Python 移植到了 Go. 相比之下,比 zhihu-python 的 API 更丰富,比 zhihu-py3 少了活动相关的 API. **注意:知乎的 API、前端等都可能随时会更新,所以本项目的接口可能会有过时的情况。如果遇到此类问题,欢迎提交 issue 或 pull requests.** ## Table of Contents * [Table of Contents](#table-of-contents) * [Install](#install) * [Documentation](#documentation) * [Usage](#usage) * [Login:登录](#login) * [User:获取用户信息](#user) * [Question:获取问题信息](#question) * [Answer:获取答案信息](#answer) * [Collection:获取收藏夹信息](#collection) * [Topic:获取话题信息](#topic) * [Known Issues](#known-issues) * [TODO](#todo) * [LICENSE](#license) ## Install 直接使用 `go get`: ```bash go get github.com/DeanThompson/zhihu-go ``` 依赖以下第三方库: * [goquery](https://github.com/PuerkitoBio/goquery): 用于解析 HTML,语法操作类似 jQuery * [color](https://github.com/fatih/color):用于输出带颜色的日志 * [persistent-cookiejar](https://github.com/juju/persistent-cookiejar):用于维护一个持久化的 cookiejar,实现保持登录 ## Documentation 请点击链接前往 GoDoc 查看:[zhihu-go](https://godoc.org/github.com/DeanThompson/zhihu-go) ## Usage 目前已经实现了用户(User),问题(Question),回答(Answer),收藏夹(Collection),话题(Topic)相关的 API,都是信息获取类的,暂无操作类的。 zhihu-go 包名为 `zhihu`,使用前需要先 import: ```go import "github.com/DeanThompson/zhihu-go" ``` ### Login 调用 API 之前需要先登录。在 zhihu-go 内部,使用一个全局的 session 来访问所有页面,并自动处理 cookies. 
创建一个 JSON 格式的配置文件,提供一个账号和密码,格式如 [config-example.json](examples/config-example.json). 登录(初始化 session): ```go zhihu.Init("/path/to/config.json") ``` 第一次登录会调用图像界面打开验证码文件,需要手动输入验证码到控制台。如果登录成功,后续的请求会沿用此次登录的 cookie, 不需要重复登录。 ### User `zhihu.User` 表示一个知乎用户,可以用于获取一个用户的各种数据。 创建一个 `User` 对象需要传入用户主页的 URL 及其知乎 ID(用户名),如: ```go link := "https://www.zhihu.com/people/jixin" userID := "黄继新" user := zhihu.NewUser(link, userID) ``` 获取用户的数据(代码见:[example.go](examples/example.go#L159)): ```go func showUser(user *zhihu.User) { logger.Info("User fields:") logger.Info(" is anonymous: %v", user.IsAnonymous()) // 是否匿名用户:false logger.Info(" userId: %s", user.GetUserID()) // 知乎ID:黄继新 logger.Info(" dataId: %s", user.GetDataID()) // hash ID:b6f80220378c8b0b78175dd6a0b9c680 logger.Info(" bio: %s", user.GetBio()) // BIO:和知乎在一起 logger.Info(" location: %s", user.GetLocation()) // 位置:北京 logger.Info(" business: %s", user.GetBusiness()) // 行业:互联网 logger.Info(" gender: %s", user.GetGender()) // 性别:male logger.Info(" education: %s", user.GetEducation()) // 学校:北京第二外国语学院 logger.Info(" followers num: %d", user.GetFollowersNum()) // 粉丝数:756632 logger.Info(" followees num: %d", user.GetFolloweesNum()) // 关注的人数: 9249 logger.Info(" followed columns num: %d", user.GetFollowedColumnsNum()) // 关注的专栏数:631 logger.Info(" followed topics num: %d", user.GetFollowedTopicsNum()) // 关注的话题数:131 logger.Info(" agree num: %d", user.GetAgreeNum()) // 获得的赞同数:68557 logger.Info(" thanks num: %d", user.GetThanksNum()) // 获得的感谢数:17651 logger.Info(" asks num: %d", user.GetAsksNum()) // 提问数:1336 logger.Info(" answers num: %d", user.GetAnswersNum()) // 回答数:785 logger.Info(" posts num: %d", user.GetPostsNum()) // 专栏文章数:92 logger.Info(" collections num: %d", user.GetCollectionsNum()) // 收藏夹数量:44 logger.Info(" logs num: %d", user.GetLogsNum()) // 公共编辑数:51596 // // // // // for i, topic := range user.GetFollowedTopicsN(5) { logger.Info(" top followed topic-%d: %s", i+1, topic.String()) } // // // // // for i, follower := range 
user.GetFollowersN(5) { logger.Info(" top follower-%d: %s", i+1, follower.String()) } // // // // // for i, followee := range user.GetFolloweesN(5) { logger.Info(" top followee-%d: %s", i+1, followee.String()) } // // // // // for i, ask := range user.GetAsksN(5) { logger.Info(" top ask-%d: %s", i+1, ask.String()) } // - https://www.zhihu.com/question/40394171/answer/86692178> // - https://www.zhihu.com/question/19952708/answer/84561308> // - https://www.zhihu.com/question/35987345/answer/72981016> // - https://www.zhihu.com/question/24980451/answer/29789141> // - https://www.zhihu.com/question/24816698/answer/29229733> for i, answer := range user.GetAnswersN(5) { logger.Info(" top answer-%d: %s", i+1, answer.String()) } // // // // // for i, collection := range user.GetCollectionsN(5) { logger.Info(" top collection-%d: %s", i+1, collection.String()) } for i, like := range user.GetLikes() { logger.Info(" like-%d: %s", i+1, like.String()) } } ``` ### Question `zhihu.Question` 表示一个知乎问题,用于获取问题相关的数据。初始化需要提供 url 和标题(可为空): ```go link := "https://www.zhihu.com/question/28966220" title := "Python 编程,应该养成哪些好的习惯?" question := zhihu.NewQuestion(link, title) ``` 获取问题数据:(代码见:[example.go](examples/example.go#L51)) ```go func showQuestion(question *zhihu.Question) { logger.Info("Question fields:") // 链接:https://www.zhihu.com/question/28966220 logger.Info(" url: %s", question.Link) // 标题:Python 编程,应该养成哪些好的习惯? 
logger.Info(" title: %s", question.GetTitle()) // 描述:我以为编程习惯很重要的,一开始就养成这些习惯,不仅可以提高编程速度,还可以减少 bug 出现的概率。希望各位分享好的编程习惯。 logger.Info(" detail: %s", question.GetDetail()) logger.Info(" answers num: %d", question.GetAnswersNum()) // 回答数:15 logger.Info(" followers num: %d", question.GetFollowersNum()) // 关注者数量:1473 // // // // for i, topic := range question.GetTopics() { logger.Info(" topic-%d: %s", i+1, topic.String()) } // // // // // for i, follower := range question.GetFollowersN(5) { logger.Info(" top follower-%d: %s", i+1, follower.String()) } for i, follower := range question.GetFollowers() { // 关注者列表 logger.Info(" follower-%d: %s", i+1, follower.String()) if i >= 10 { logger.Info(" %d followers not shown.", question.GetFollowersNum()-i-1) break } } allAnswers := question.GetAllAnswers() // 所有回答 for i, answer := range allAnswers { logger.Info(" answer-%d: %s", i+1, answer.String()) filename := fmt.Sprintf("/tmp/%s-%s的回答.html", question.GetTitle(), answer.GetAuthor().GetUserID()) dumpAnswerHTML(filename, answer) if i >= 10 { logger.Info(" %d answers not shown.", len(allAnswers)-i-1) break } } topXAnswers := question.GetTopXAnswers(25) // 前 25 个回答 for i, answer := range topXAnswers { logger.Info(" top-%d answer: %s", i+1, answer.String()) } // 排名第一的回答 // - https://www.zhihu.com/question/28966220/answer/43346747> logger.Info(" top-1 answer: %s", question.GetTopAnswer().String()) logger.Info(" visit times: %d", question.GetVisitTimes()) // 查看次数:32942 } ``` ### Answer `zhihu.Answer` 表示一个知乎答案,初始化时需要指定页面链接,也支持指定对应的问题(`*Question`,可以为 `nil`)和作者(`*User`,可以为 `nil`): ```go // 龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」?豆子 的答案 answer := zhihu.NewAnswer("https://www.zhihu.com/question/23759686/answer/41997389", nil, nil) ``` 获取回答数据:(代码见:[example.go](examples/example.go#L95)) ```go func showAnswer(answer *zhihu.Answer) { logger.Info("Answer fields:") // 链接:https://www.zhihu.com/question/23759686/answer/41997389 logger.Info(" url: %s", answer.Link) // 所属问题 // 
链接:https://www.zhihu.com/question/23759686 // 标题:龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」? question := answer.GetQuestion() logger.Info(" question url: %s", question.Link) logger.Info(" question title: %s", question.GetTitle()) // 作者: logger.Info(" author: %s", answer.GetAuthor().String()) logger.Info(" upvote num: %d", answer.GetUpvote()) // 赞同数:26486 logger.Info(" comments num: %d", answer.GetCommentsNum()) // 评论数:20 logger.Info(" collected num: %d", answer.GetCollectedNum()) // 被收藏次数:22929 logger.Info(" data ID: %d", answer.GetID()) // 数字 ID:12191779 // 点赞的用户 voters := answer.GetVoters() for i, voter := range voters { logger.Info(" voter-%d: %s", i+1, voter.String()) if i >= 10 { remain := len(voters) - i - 1 logger.Info(" %d votes not shown.", remain) break } } } ``` ### Collection `zhihu.Collection` 表示一个收藏夹,初始化时必须指定页面 url,支持指定名称(`string` 可以为 `""`)和创建者(`creator *User`,可以为 `nil`): ```go // 黄继新 A4U collection := zhihu.NewCollection("https://www.zhihu.com/collection/19677733", "", nil) ``` 获取收藏夹数据:(代码见:[example.go](examples/example.go#L124)) ```go func showCollection(collection *zhihu.Collection) { logger.Info("Collection fields:") // 链接:https://www.zhihu.com/collection/19677733 logger.Info(" url: %s", collection.Link) // 名称:A4U logger.Info(" name: %s", collection.GetName()) // 作者: logger.Info(" creator: %s", collection.GetCreator().String()) logger.Info(" followers num: %d", collection.GetFollowersNum()) // 关注者数量:29 // 获取 5 个关注者 for i, follower := range collection.GetFollowersN(5) { logger.Info(" top follower-%d: %s", i+1, follower.String()) } // 获取 5 个问题 for i, question := range collection.GetQuestionsN(5) { logger.Info(" top question-%d: %s", i+1, question.String()) } // 获取 5 个回答 for i, answer := range collection.GetAnswersN(5) { logger.Info(" top answer-%d: %s", i+1, answer.String()) } } ``` ### Topic `zhihu.Topic` 表示一个话题,初始化时必须指定页面 url,支持指定名称(`string` 可以为 `""`): ```go // Python topic := zhihu.NewTopic("https://www.zhihu.com/topic/19552832", "") ``` 
获取话题数据:(代码见:[example.go](examples/example.go#L237)) ```go func showTopic(topic *zhihu.Topic) { logger.Info("Topic fields:") // 链接:https://www.zhihu.com/topic/19552832 logger.Info(" url: %s", topic.Link) // 名称:Python logger.Info(" name: %s", topic.GetName()) // 描述:Python 是一种面向对象的解释型计算机程序设计语言,在设计中注重代码的可读性,同时也是一种功能强大的通用型语言。 logger.Info(" description: %s", topic.GetDescription()) // 关注者数量:82805 logger.Info(" followers num: %d", topic.GetFollowersNum()) // 最佳答主,一般为 5 个 // // // // // for i, author := range topic.GetTopAuthors() { logger.Info(" top-%d author: %s", i+1, author.String()) } } ``` ## Known Issues 无,欢迎 [提交 issues](https://github.com/DeanThompson/zhihu-go/issues) ## TODO 按优先级降序排列: * [X] 获取回答的收藏数 * [X] 获取收藏夹的答案数量 * [X] 获取用户的头像 * [X] 获取用户的微博地址 * [ ] 把答案导出到 markdown 文件 * [ ] 更多的登录方式,不需要依赖图形界面打开验证码文件 * [ ] 增加评论相关的 API * [ ] 增加活动相关的 API * [ ] 增加专栏相关的 API * [ ] test(暂时没想好怎么做) 很可能不会做: * [ ] 增加用户的操作,如点赞、关注等 欢迎 [提交 pull requests](https://github.com/DeanThompson/zhihu-go/pulls) ## LICENSE [The MIT license](LICENSE). 
================================================ FILE: answer.go ================================================ package zhihu import ( "fmt" "net/url" "strconv" "strings" "github.com/PuerkitoBio/goquery" ) // Answer 是一个知乎的答案 type Answer struct { *Page // question 是该答案对应的问题 question *Question // author 是该答案的作者 author *User } // NewAnswer 用于创建一个 Answer 对象,其中 link 是必传的,question, author 可以为 nil func NewAnswer(link string, question *Question, author *User) *Answer { return &Answer{ Page: newZhihuPage(link), question: question, author: author, } } // GetID 返回该答案的数字 ID func (a *Answer) GetID() int { if got, ok := a.getIntField("data-aid"); ok { return got } doc := a.Doc() text, _ := doc.Find("div.zm-item-answer.zm-item-expanded").Attr("data-aid") aid, _ := strconv.Atoi(text) a.setField("data-aid", aid) return aid } // GetQuestion 返回该回答所属的问题,如果 NewAnswer 时 question 不为 nil,则直接返回该值; // 否则会抓取页面并分析得到问题的链接和标题,再新建一个 Question 对象 func (a *Answer) GetQuestion() *Question { if a.question != nil { return a.question } doc := a.Doc() href, _ := doc.Find("h2.zm-item-title>a").Attr("href") link := makeZhihuLink(href) title := strip(doc.Find("h2.zm-item-title").First().Text()) return NewQuestion(link, title) } // Author 返回该答案的作者 func (a *Answer) GetAuthor() *User { if a.author != nil { return a.author } doc := a.Doc() sel := doc.Find("div.zm-item-answer-author-info").First() return newUserFromAnswerAuthorTag(sel) } // GetUpvote 返回赞同数 func (a *Answer) GetUpvote() int { if got, ok := a.getIntField("upvote"); ok { return got } doc := a.Doc() text := strip(doc.Find("span.count").First().Text()) upvote := upvoteTextToNum(text) a.setField("upvote", upvote) return upvote } // ToMarkdown 把回答导出到 markdown 文件 func (a *Answer) ToMarkdown(filename string) error { if !strings.HasSuffix(filename, ".md") && !strings.HasSuffix(filename, ".markdown") { filename += ".md" } // TODO convert to markdown md := "" return saveString(filename, md) } // ToHtml 把网页源码导出到 html 文件 func (a *Answer) ToHtml(filename 
string) error { if !strings.HasSuffix(filename, ".html") { filename += ".html" } html, err := a.Doc().Html() if err != nil { return err } return saveString(filename, html) } // GetContent 返回回答的内容,HTML 格式 func (a *Answer) GetContent() string { if got, ok := a.getStringField("content"); ok { return got } sel := a.Doc().Find("div#zh-question-answer-wrap").Find("div.zm-editable-content") content, err := answerSelectionToHtml(sel) if err != nil { logger.Error("导出 HTML 失败:%s", err.Error()) return "" } a.setField("content", content) return content } // GetVotersN 返回 n 个点赞的用户,如果 n < 0,返回所有点赞的用户 func (a *Answer) GetVotersN(n int) []*User { if n == 0 { return nil } querystring := fmt.Sprintf(`params={"answer_id":"%d"}`, a.GetID()) url := makeZhihuLink("/node/AnswerFullVoteInfoV2" + "?" + querystring) doc, err := newDocumentFromURL(url) if err != nil { return nil } sel := doc.Find(".voters span") capacity := n if capacity < 0 || capacity > sel.Length() { capacity = sel.Length() } voters := make([]*User, 0, capacity) sel.EachWithBreak(func(index int, span *goquery.Selection) bool { userId := strings.Trim(strip(span.Text()), "、") var userLink string if !(userId == "匿名用户" || userId == "知乎用户") { path, _ := span.Find("a").Attr("href") userLink = makeZhihuLink(path) } voters = append(voters, NewUser(userLink, userId)) if n > 0 && len(voters) == n { return false } return true }) return voters } // GetVoters 返回点赞的用户 func (a *Answer) GetVoters() []*User { return a.GetVotersN(-1) } // GetCommentsNum 返回评论数量 func (a *Answer) GetCommentsNum() int { if value, ok := a.getIntField("comment-num"); ok { return value } doc := a.Doc() text := strip(doc.Find("a.meta-item.toggle-comment").Text()) rv := reMatchInt(text) a.setField("comment-num", rv) return rv } // GetCollectedNum 返回被收藏次数 func (a *Answer) GetCollectedNum() int { if value, ok := a.getIntField("collected-num"); ok { return value } text := strip(a.Doc().Find(`a[data-za-l="sidebar_answer_collected_count"]`).Text()) value, _ := 
// upvoteTextToNum converts the upvote text shown on a page into a number.
//
// Three forms are understood:
//   - plain digits, e.g. "123"  -> 123
//   - a "K" suffix,  e.g. "26K" -> 26000
//   - a "W" suffix,  e.g. "3W"  -> 30000
//
// Any text whose numeric part cannot be parsed (including "") yields 0.
//
// The unit is always the last character, so both units are detected with
// HasSuffix. The previous code tested "W" with HasPrefix while still stripping
// the last character, which made every "<n>W" value come out as 0.
func upvoteTextToNum(text string) int {
	multiplier := 1
	switch {
	case strings.HasSuffix(text, "K"):
		multiplier = 1000
	case strings.HasSuffix(text, "W"):
		multiplier = 10000
	}
	if multiplier != 1 {
		// Drop the one-byte unit suffix before parsing the digits.
		text = text[:len(text)-1]
	}

	num, err := strconv.Atoi(text)
	if err != nil {
		return 0
	}
	return num * multiplier
}
") } }) // 修复 a 标签的 href,因为知乎的外链都是这种形式:https://link.zhihu.com/?target=xxx sel.Find("a").Each(func(_ int, tag *goquery.Selection) { href, _ := tag.Attr("href") if strings.Contains(href, "target=") { link, err := url.Parse(href) if err != nil { return } target := link.Query().Get("target") tag.SetAttr("href", target) } }) wrapper := `` doc, _ := goquery.NewDocumentFromReader(strings.NewReader(wrapper)) doc.Find("body").AppendSelection(sel) return doc.Html() } func newUserFromAnswerAuthorTag(sel *goquery.Selection) *User { if strip(sel.Text()) == "匿名用户" { return ANONYMOUS } node := sel.Find("a.author-link") userId := strip(node.Text()) urlPath, _ := node.Attr("href") userLink := makeZhihuLink(urlPath) return NewUser(userLink, userId) } ================================================ FILE: collection.go ================================================ package zhihu import ( "encoding/json" "fmt" "net/url" "strconv" "strings" "github.com/PuerkitoBio/goquery" ) // Collection 是一个知乎的收藏夹页面 type Collection struct { *Page // creator 是该收藏夹的创建者 creator *User // name 是该收藏夹的名称 name string } // NewCollection 创建一个收藏夹对象,返回 *Collection func NewCollection(link string, name string, creator *User) *Collection { if !validCollectionURL(link) { panic("收藏夹链接不正确:" + link) } return &Collection{ Page: newZhihuPage(link), creator: creator, name: name, } } // GetName 返回收藏夹的名字 func (c *Collection) GetName() string { if c.name != "" { return c.name } doc := c.Doc() //

// 恩恩恩 大力一点,不要停~ //

c.name = strip(doc.Find("h2#zh-fav-head-title").Text()) return c.name } // GetCreator 返回收藏夹的创建者 func (c *Collection) GetCreator() *User { if c.creator != nil { return c.creator } doc := c.Doc() //

// 李阳良 //

sel := doc.Find("h2.zm-list-content-title a") userId := strip(sel.Text()) linkPath, _ := sel.Attr("href") c.creator = NewUser(makeZhihuLink(linkPath), userId) return c.creator } // GetFollowersNum 返回收藏夹的关注者数量 func (c *Collection) GetFollowersNum() int { if got, ok := c.getIntField("followers-num"); ok { return got } doc := c.Doc() // // 7516 // text := strip(doc.Find(`a[data-za-a="visit_collection_followers"]`).Text()) num, _ := strconv.Atoi(text) c.setField("followers-num", num) return num } // GetFollowersN 返回 n 个关注该收藏夹的用户,如果 n < 0,返回所有关注者 func (c *Collection) GetFollowersN(n int) []*User { var ( link = urlJoin(c.Link, "/followers") xsrf = c.GetXSRF() ) users, err := ajaxGetFollowers(link, xsrf, n) if err != nil { return nil } return users } // GetFollowers 返回关注该收藏夹的用户 func (c *Collection) GetFollowers() []*User { return c.GetFollowersN(c.GetFollowersNum()) } // GetQuestionsN 返回前 n 个问题,如果 n < 0,返回所有问题 func (c *Collection) GetQuestionsN(n int) []*Question { if n == 0 { return nil } // 先获取第一页的问题 questions := getQuestionsFromDoc(c.Doc()) totalPages := c.totalPages() if totalPages == 1 { if n < 0 || n > len(questions) { return questions } return questions[0:n] } // 再分页查询其他问题 currentPage := 2 for currentPage <= totalPages { link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) doc, err := newDocumentFromURL(link) if err != nil { logger.Error("解析页面失败:%s, %s", link, err.Error()) return nil } newQuestions := getQuestionsFromDoc(doc) questions = append(questions, newQuestions...) 
if n > 0 && len(questions) >= n { return questions[0:n] } currentPage++ } return questions } // GetQuestions 返回收藏夹里所有的问题 func (c *Collection) GetQuestions() []*Question { return c.GetQuestionsN(-1) } // GetAnswersN 返回 n 个回答,如果 n < 0,返回所有回答 func (c *Collection) GetAnswersN(n int) []*Answer { if n == 0 { return nil } // 先获取第一页的回答 answers := getAnswersFromDoc(c.Doc()) totalPages := c.totalPages() if totalPages == 1 { if n < 0 || n > len(answers) { return answers } return answers[0:n] } // 在分页查询 currentPage := 2 for currentPage <= totalPages { link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) doc, err := newDocumentFromURL(link) if err != nil { logger.Error("解析页面失败:%s, %s", link, err.Error()) return nil } newAnswers := getAnswersFromDoc(doc) answers = append(answers, newAnswers...) if n > 0 && len(answers) >= n { return answers[0:n] } currentPage++ } return answers } // GetAnswers 返回收藏夹里所有的回答 func (c *Collection) GetAnswers() []*Answer { return c.GetAnswersN(-1) } // GetQuestionsNum 返回收藏夹的问题数量 func (c *Collection) GetQuestionsNum() int { if value, ok := c.getIntField("question-num"); ok { return value } // 根据分页情况来计算问题数量 // 收藏夹页面,每一页固定 10 个问题,每个问题下可能有多个答案; totalPages := c.totalPages() lastPage := c.Doc() if totalPages > 1 { lp, err := newDocumentFromURL(fmt.Sprintf("%s?page=%d", c.Link, totalPages)) if err != nil { logger.Error("获取收藏夹最后一页失败:%s", err.Error()) return 0 } lastPage = lp } numOnLastPage := lastPage.Find("#zh-list-answer-wrap h2.zm-item-title").Size() rv := (totalPages-1)*10 + numOnLastPage c.setField("question-num", rv) return rv } // GetAnswersNum 返回收藏夹的答案数量 // 获取答案数量有这几种方式: // 1. 在收藏夹页面(/collections/1234567),遍历每一页,累计每页的回答数量。总请求数等于分页数。 // 2. 
在收藏夹创建者的个人主页,收藏夹栏目(people/xxyy/collections),有每个收藏夹的简介, // 其中就有回答数。遍历每一页(20个/页),找到对应的收藏夹,然后获取回答数。 // 总请求数不确定,最好情况下 1 次;但考虑到每个用户的收藏夹并不会很多(如达到100个),可以认为最坏情况下需要 5 次。 // 最终的方案可以综合以上两种方式,以收藏夹页面分页数做依据: // 如果页数大于 3(经验值),则采用方法 2;否则用方法 1 // 希望能通过这样的方式来减少请求数,获得更好的性能。 func (c *Collection) GetAnswersNum() int { if value, ok := c.getIntField("answer-num"); ok { return value } rv := 0 totalPages := c.totalPages() if totalPages > 3 { // 从个人主页上获取 page := 1 linkFmt := urlJoin(c.GetCreator().Link, "/collections?page=%d") collectionHref := strings.Split(c.Link, "zhihu.com")[1] selector := fmt.Sprintf(`a.zm-profile-fav-item-title[href="%s"]`, collectionHref) for { creatorCollectionLink := fmt.Sprintf(linkFmt, page) doc, err := newDocumentFromURL(creatorCollectionLink) if err != nil { logger.Error("获取用户的收藏夹主页失败:%s", err.Error()) return 0 } titleTag := doc.Find(selector).First() if titleTag.Size() == 1 { rv = reMatchInt(titleTag.Parent().Next().Contents().Eq(0).Text()) break } else { // 本页没找到,下一页 if doc.Find("div.border-pager").Size() == 0 { return 0 } else { pages := getTotalPages(doc) if page == pages { return 0 } page++ } } } } else { selector := "#zh-list-answer-wrap div.zm-item-fav" rv = c.Doc().Find(selector).Size() currentPage := 2 for currentPage <= totalPages { link := fmt.Sprintf("%s?page=%d", c.Link, currentPage) doc, err := newDocumentFromURL(link) if err != nil { logger.Error("解析页面失败:%s, %s", link, err.Error()) return 0 } rv += doc.Find(selector).Size() currentPage++ } } c.setField("answer-num", rv) return rv } // GetCommentsNum 返回评论数量 func (c *Collection) GetCommentsNum() int { if value, ok := c.getIntField("comment-num"); ok { return value } doc := c.Doc() text := strip(doc.Find("div#zh-list-meta-wrap a.toggle-comment").Text()) rv := reMatchInt(text) c.setField("comment-num", rv) return rv } func (c *Collection) String() string { return fmt.Sprintf("", c.GetName(), c.Link) } func ajaxGetFollowers(link string, xsrf string, total int) ([]*User, error) { if total == 0 { 
return nil, nil } var ( offset = 0 gotDataNum = pageSize initCap = total ) if initCap < 0 { initCap = pageSize } users := make([]*User, 0, initCap) form := url.Values{} form.Set("_xsrf", xsrf) for gotDataNum == pageSize { form.Set("offset", strconv.Itoa(offset)) doc, dataNum, err := newDocByNormalAjax(link, form) if err != nil { return nil, err } doc.Find("div.zm-profile-card").Each(func(index int, sel *goquery.Selection) { thisUser := newUserFromSelector(sel) users = append(users, thisUser) }) if total > 0 && len(users) >= total { return users[:total], nil } gotDataNum = dataNum offset += gotDataNum } return users, nil } func newDocByNormalAjax(link string, form url.Values) (*goquery.Document, int, error) { gotDataNum := 0 body := strings.NewReader(form.Encode()) resp, err := gSession.Ajax(link, body, link) if err != nil { logger.Error("查询关注的话题失败, 链接:%s, 参数:%s,错误:%s", link, form.Encode(), err.Error()) return nil, gotDataNum, err } defer resp.Body.Close() result := normalAjaxResult{} err = json.NewDecoder(resp.Body).Decode(&result) if err != nil { logger.Error("解析返回值 json 失败:%s", err.Error()) return nil, gotDataNum, err } topicsHtml := result.Msg[1].(string) doc, err := goquery.NewDocumentFromReader(strings.NewReader(topicsHtml)) if err != nil { logger.Error("解析返回的 HTML 失败:%s", err.Error()) return nil, gotDataNum, err } gotDataNum = int(result.Msg[0].(float64)) return doc, gotDataNum, err } func getQuestionsFromDoc(doc *goquery.Document) []*Question { questions := make([]*Question, 0, pageSize) items := doc.Find("div#zh-list-answer-wrap").Find("h2.zm-item-title") items.Each(func(index int, sel *goquery.Selection) { a := sel.Find("a") qTitle := strip(a.Text()) qHref, _ := a.Attr("href") thisQuestion := NewQuestion(makeZhihuLink(qHref), qTitle) questions = append(questions, thisQuestion) }) return questions } func getAnswersFromDoc(doc *goquery.Document) []*Answer { var answers []*Answer var lastQuestion *Question doc.Find("div.zm-item").Each(func(index int, sel 
*goquery.Selection) { // 回答 contentTag := sel.Find("div.zm-item-rich-text") if contentTag.Size() == 0 { // 回答被建议修改 reason := strip(sel.Find("div.answer-status").Text()) logger.Warn("忽略一个问题,原因:%s", reason) return } // 获取问题,如果同一个问题下收藏了多个回答,则除了第一个外,后面的回答的 HTML 部分, // 也就是 div.zm-item 里面不会有该问题的链接(a 标签),所以用 lastQuestion 标记 // 最近的一个问题 var thisQuestion *Question if qTag := sel.Find("h2.zm-item-title").Find("a"); qTag.Size() > 0 { qTitle := strip(qTag.Text()) qHref, _ := qTag.Attr("href") thisQuestion = NewQuestion(makeZhihuLink(qHref), qTitle) lastQuestion = thisQuestion } else { thisQuestion = lastQuestion } // 答主 author := newUserFromAnswerAuthorTag(sel.Find("div.zm-item-answer-author-info")) answerHref, _ := contentTag.Attr("data-entry-url") voteText, _ := sel.Find("a.zm-item-vote-count").Attr("data-votecount") vote, _ := strconv.Atoi(voteText) thisAnswer := NewAnswer(makeZhihuLink(answerHref), thisQuestion, author) thisAnswer.setUpvote(vote) answers = append(answers, thisAnswer) }) return answers } ================================================ FILE: examples/config-example.json ================================================ { "account": "email-or-phonenum", "password": "your-password-here" } ================================================ FILE: examples/example.go ================================================ package main import ( "fmt" "io/ioutil" "github.com/DeanThompson/zhihu-go" ) var ( logger = zhihu.Logger{true} ) func main() { zhihu.Init("./config.json") // 黄继新,和知乎在一起 user := zhihu.NewUser("https://www.zhihu.com/people/jixin", "") showUser(user) logger.Success("========== split ==========") // Python 编程,应该养成哪些好的习惯? 
questionUrl := "https://www.zhihu.com/question/28966220" question := zhihu.NewQuestion(questionUrl, "") showQuestion(question) logger.Success("========== split ==========") // 龙有九个儿子,是跟谁生的?为什么「龙生九子,各不成龙」?豆子 的答案 answer := zhihu.NewAnswer("https://www.zhihu.com/question/23759686/answer/41997389", nil, nil) showAnswer(answer) logger.Success("========== split ==========") // 程序员为了期权加入创业公司,值得吗? 匿名用户的答案 answer2 := zhihu.NewAnswer("https://www.zhihu.com/question/28023819/answer/49723406", nil, nil) showAnswer(answer2) logger.Success("========== split ==========") // 黄继新 A4U collection := zhihu.NewCollection("https://www.zhihu.com/collection/19677733", "", nil) showCollection(collection) // Python topic := zhihu.NewTopic("https://www.zhihu.com/topic/19552832", "") showTopic(topic) } func showQuestion(question *zhihu.Question) { logger.Info("Question fields:") logger.Info(" url: %s", question.Link) logger.Info(" title: %s", question.GetTitle()) logger.Info(" detail: %s", question.GetDetail()) logger.Info(" answers num: %d", question.GetAnswersNum()) logger.Info(" followers num: %d", question.GetFollowersNum()) logger.Info(" comments num: %d", question.GetCommentsNum()) for i, topic := range question.GetTopics() { logger.Info(" topic-%d: %s", i+1, topic.String()) } for i, follower := range question.GetFollowersN(5) { logger.Info(" top follower-%d: %s", i+1, follower.String()) } for i, follower := range question.GetFollowers() { logger.Info(" follower-%d: %s", i+1, follower.String()) if i >= 10 { logger.Info(" %d followers not shown.", question.GetFollowersNum()-i-1) break } } allAnswers := question.GetAllAnswers() for i, answer := range allAnswers { logger.Info(" answer-%d: %s", i+1, answer.String()) filename := fmt.Sprintf("/tmp/%s-%s的回答.html", question.GetTitle(), answer.GetAuthor().GetUserID()) dumpAnswerHTML(filename, answer) if i >= 10 { logger.Info(" %d answers not shown.", len(allAnswers)-i-1) break } } topXAnswers := question.GetTopXAnswers(25) for i, answer := range 
topXAnswers { logger.Info(" top-%d answer: %s", i+1, answer.String()) } logger.Info(" top-1 answer: %s", question.GetTopAnswer().String()) logger.Info(" visit times: %d", question.GetVisitTimes()) } func showAnswer(answer *zhihu.Answer) { logger.Info("Answer fields:") logger.Info(" url: %s", answer.Link) question := answer.GetQuestion() logger.Info(" question url: %s", question.Link) logger.Info(" question title: %s", question.GetTitle()) logger.Info(" author: %s", answer.GetAuthor().String()) logger.Info(" upvote num: %d", answer.GetUpvote()) logger.Info(" comments num: %d", answer.GetCommentsNum()) logger.Info(" collected num: %d", answer.GetCollectedNum()) logger.Info(" data ID: %d", answer.GetID()) // dump content filename := fmt.Sprintf("/tmp/answer_%d.html", answer.GetID()) dumpAnswerHTML(filename, answer) voters := answer.GetVoters() for i, voter := range voters { logger.Info(" voter-%d: %s", i+1, voter.String()) if i >= 10 { remain := len(voters) - i - 1 logger.Info(" %d votes not shown.", remain) break } } } func showCollection(collection *zhihu.Collection) { logger.Info("Collection fields:") logger.Info(" url: %s", collection.Link) logger.Info(" name: %s", collection.GetName()) logger.Info(" creator: %s", collection.GetCreator().String()) logger.Info(" followers num: %d", collection.GetFollowersNum()) logger.Info(" comments num: %d", collection.GetCommentsNum()) logger.Info(" questions num: %d", collection.GetQuestionsNum()) logger.Info(" answers num: %d", collection.GetAnswersNum()) for i, follower := range collection.GetFollowersN(5) { logger.Info(" top follower-%d: %s", i+1, follower.String()) } for i, follower := range collection.GetFollowers() { logger.Info(" follower-%d: %s", i+1, follower.String()) } for i, question := range collection.GetQuestionsN(5) { logger.Info(" top question-%d: %s", i+1, question.String()) } for i, question := range collection.GetQuestions() { logger.Info(" question-%d: %s", i+1, question.String()) } for i, answer := range 
collection.GetAnswersN(5) {
		logger.Info(" top answer-%d: %s", i+1, answer.String())
	}

	for i, answer := range collection.GetAnswers() {
		logger.Info(" answer-%d: %s", i+1, answer.String())
	}
}

// showUser logs the profile fields and counters of a user, then the first
// five entries of each list-valued getter. The loops over the complete lists
// are intentionally left commented out.
func showUser(user *zhihu.User) {
	logger.Info("User fields:")
	logger.Info(" is anonymous: %v", user.IsAnonymous())
	logger.Info(" userId: %s", user.GetUserID())
	logger.Info(" dataId: %s", user.GetDataID())
	logger.Info(" avatar: %s", user.GetAvatar())
	logger.Info(" avatar with size hd: %s", user.GetAvatarWithSize("hd"))
	logger.Info(" bio: %s", user.GetBio())
	logger.Info(" location: %s", user.GetLocation())
	logger.Info(" business: %s", user.GetBusiness())
	logger.Info(" education: %s", user.GetEducation())
	logger.Info(" gender: %s", user.GetGender())
	logger.Info(" weibo: %s", user.GetWeiboURL())
	logger.Info(" followers num: %d", user.GetFollowersNum())
	logger.Info(" followees num: %d", user.GetFolloweesNum())
	logger.Info(" followed columns num: %d", user.GetFollowedColumnsNum())
	logger.Info(" followed topics num: %d", user.GetFollowedTopicsNum())
	logger.Info(" agree num: %d", user.GetAgreeNum())
	logger.Info(" thanks num: %d", user.GetThanksNum())
	logger.Info(" asks num: %d", user.GetAsksNum())
	logger.Info(" answers num: %d", user.GetAnswersNum())
	logger.Info(" posts num: %d", user.GetPostsNum())
	logger.Info(" collections num: %d", user.GetCollectionsNum())
	logger.Info(" logs num: %d", user.GetLogsNum())

	for i, topic := range user.GetFollowedTopicsN(5) {
		logger.Info(" top followed topic-%d: %s", i+1, topic.String())
	}

	// for i, topic := range user.GetFollowedTopics() {
	// logger.Info(" followed topic-%d: %s", i+1, topic.String())
	// }

	for i, follower := range user.GetFollowersN(5) {
		logger.Info(" top follower-%d: %s", i+1, follower.String())
	}

	// for i, follower := range user.GetFollowers() {
	// logger.Info(" follower-%d: %s", i+1, follower.String())
	// }

	for i, followee := range user.GetFolloweesN(5) {
		logger.Info(" top followee-%d: %s", i+1, followee.String())
	}

	// for i, followee := range user.GetFollowees() {
	// logger.Info(" followee-%d: %s", i+1, followee.String())
	// }

	for i, ask := range user.GetAsksN(5) {
		logger.Info(" top ask-%d: %s", i+1, ask.String())
	}

	// for i, ask := range user.GetAsks() {
	// logger.Info(" ask-%d: %s", i+1, ask.String())
	// }

	for i, answer := range user.GetAnswersN(5) {
		logger.Info(" top answer-%d: %s", i+1, answer.String())
	}

	// for i, answer := range user.GetAnswers() {
	// logger.Info(" answer-%d: %s", i+1, answer.String())
	// }

	for i, collection := range user.GetCollectionsN(5) {
		logger.Info(" top collection-%d: %s", i+1, collection.String())
	}

	// for i, collection := range user.GetCollections() {
	// logger.Info(" collection-%d: %s", i+1, collection.String())
	// }

	for i, like := range user.GetLikes() {
		logger.Info(" like-%d: %s", i+1, like.String())
	}
}

// showTopic logs the fields of a topic and its top authors.
func showTopic(topic *zhihu.Topic) {
	logger.Info("Topic fields:")
	logger.Info(" url: %s", topic.Link)
	logger.Info(" name: %s", topic.GetName())
	logger.Info(" description: %s", topic.GetDescription())
	logger.Info(" followers num: %d", topic.GetFollowersNum())

	for i, author := range topic.GetTopAuthors() {
		logger.Info(" top-%d author: %s", i+1, author.String())
	}
}

// dumpAnswerHTML writes the answer's content to filename (mode 0666) and logs
// the destination on success. The write error, if any, is returned.
func dumpAnswerHTML(filename string, answer *zhihu.Answer) error {
	err := ioutil.WriteFile(filename, []byte(answer.GetContent()), 0666)
	if err == nil {
		logger.Info(" content dumped to %s", filename)
	}
	return err
}

================================================
FILE: log.go
================================================
package zhihu

import (
	"fmt"

	"github.com/fatih/color"
)

// Logger is a simple output helper that prints messages in different colors.
// TODO simple level
type Logger struct {
	// Enabled switches all output on or off.
	Enabled bool
}

// log prints its arguments with fmt.Println, but only when the logger is enabled.
func (logger *Logger) log(a ...interface{}) {
	if logger.Enabled {
		fmt.Println(a...)
}
}

// Error prints an error-level message in red, prefixed with "ERROR: ".
// msg is a format string; a holds its arguments.
func (logger *Logger) Error(msg string, a ...interface{}) {
	logger.log(color.RedString("ERROR: "+msg, a...))
}

// Warn prints a warning-level message in yellow, prefixed with "WARN: ".
func (logger *Logger) Warn(msg string, a ...interface{}) {
	logger.log(color.YellowString("WARN: "+msg, a...))
}

// Warning is an alias for Warn.
func (logger *Logger) Warning(msg string, a ...interface{}) {
	logger.Warn(msg, a...)
}

// Info prints an info-level message in blue, prefixed with "INFO: ".
func (logger *Logger) Info(msg string, a ...interface{}) {
	logger.log(color.BlueString("INFO: "+msg, a...))
}

// Debug prints a debug-level message in white, prefixed with "DEBUG: ".
func (logger *Logger) Debug(msg string, a ...interface{}) {
	logger.log(color.WhiteString("DEBUG: "+msg, a...))
}

// Success prints a success message; it works like Info except that it is
// green and prefixed with "SUCCESS: ".
func (logger *Logger) Success(msg string, a ...interface{}) {
	logger.log(color.GreenString("SUCCESS: "+msg, a...))
}

================================================
FILE: log_test.go
================================================
package zhihu

import (
	"testing"
)

// Test_Error prints two error messages (plain and formatted); it makes no assertions.
func Test_Error(t *testing.T) {
	var logger = Logger{Enabled: true}
	logger.Error("测试:输出一条 ERROR 的信息")
	logger.Error("测试:从 1 到 5 分别是:%d, %d, %d, %d, %d", 1, 2, 3, 4, 5)
}

// Test_Info prints two info messages (plain and formatted); it makes no assertions.
func Test_Info(t *testing.T) {
	var logger = Logger{Enabled: true}
	logger.Info("测试:输出一条 INFO 的信息")
	logger.Info("测试:从 1 到 5 分别是:%d, %d, %d, %d, %d", 1, 2, 3, 4, 5)
}

================================================
FILE: question.go
================================================
package zhihu

import (
	"encoding/json"
	"errors"
	"fmt"
	"net/url"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// Question represents a Zhihu question and gives access to its title,
// detail, answers and related information.
type Question struct {
	*Page

	// title is the question's title.
	title string
}

// NewQuestion creates a Question for the given URL. title may be empty, in
// which case GetTitle parses it from the page. It panics when link does not
// pass validQuestionURL.
func NewQuestion(link string, title string) *Question {
	if !validQuestionURL(link) {
		panic("问题链接不正确: " + link)
	}

	return &Question{
		Page:  newZhihuPage(link),
		title: title,
	}
}

// GetTitle returns the question title, parsing and caching it on first use.
func (q *Question) GetTitle() string {
	if q.title != "" {
		return q.title
	}

	doc := q.Doc()
	q.title =
strip(doc.Find("h2.zm-item-title").First().Text()) return q.title } // GetDetail 获取问题描述 func (q *Question) GetDetail() string { if got, ok := q.getStringField("detail"); ok { return got } doc := q.Doc() detail := strip(doc.Find("div#zh-question-detail").First().Text()) q.setField("detail", detail) return detail } // GetAnswersNum 获取问题回答数量 func (q *Question) GetAnswersNum() int { if got, ok := q.getIntField("answers-num"); ok { return got } doc := q.Doc() data, exists := doc.Find("h3#zh-question-answer-num").Attr("data-num") answerNum := 0 if exists { answerNum, _ = strconv.Atoi(data) } q.setField("answers-num", answerNum) return answerNum } // GetFollowersNum 获取问题关注数量 func (q *Question) GetFollowersNum() int { if got, ok := q.getIntField("followers-num"); ok { return got } doc := q.Doc() text := doc.Find("div.zg-gray-normal>a>strong").Text() followersNum, _ := strconv.Atoi(text) q.setField("followers-num", followersNum) return followersNum } // GetTopics 获取问题的话题列表 func (q *Question) GetTopics() []*Topic { var topics []*Topic q.Doc().Find("a.zm-item-tag").Each(func(index int, sel *goquery.Selection) { name := strip(sel.Text()) href, _ := sel.Attr("href") thisTopic := NewTopic(makeZhihuLink(href), name) topics = append(topics, thisTopic) }) return topics } // GetFollowersN 返回 n 个关注者,如果 n < 0,返回所有关注者 func (q *Question) GetFollowersN(n int) []*User { var ( link = urlJoin(q.Link, "/followers") xsrf = q.GetXSRF() ) users, err := ajaxGetFollowers(link, xsrf, n) if err != nil { return nil } return users } // GetFollowers 获取关注该问题的用户 func (q *Question) GetFollowers() []*User { return q.GetFollowersN(q.GetFollowersNum()) } // GetAllAnswers 获取问题的所有答案 func (q *Question) GetAllAnswers() []*Answer { return q.GetTopXAnswers(q.GetAnswersNum()) } // GetTopXAnswers 获取问题 Top X 的答案 func (q *Question) GetTopXAnswers(x int) []*Answer { if x < 0 || x > q.GetAnswersNum() { x = q.GetAnswersNum() } // 1. 首页的回答 answers := q.getAnswersOnIndex() if x < len(answers) { return answers[:x] } // 2. 
"更多",调用 Ajax 接口 moreCount := x - pageSize if moreCount > 0 { answers = append(answers, q.getMoreAnswers(moreCount)...) } return answers } // GetTopAnswer 获取问题排名第一的答案 func (q *Question) GetTopAnswer() *Answer { topAnswers := q.GetTopXAnswers(1) if len(topAnswers) >= 1 { return topAnswers[0] } return nil } // GetCommentsNum 返回问题的评论数量 func (q *Question) GetCommentsNum() int { if value, ok := q.getIntField("comment-num"); ok { return value } doc := q.Doc() text := doc.Find("div.zm-meta-panel a.toggle-comment").Text() rv := reMatchInt(strip(text)) q.setField("comment-num", rv) return rv } // GetVisitTimes 获取问题的访问次数 func (q *Question) GetVisitTimes() int { if got, ok := q.getIntField("visit-times"); ok { return got } doc := q.Doc() content, exists := doc.Find(`meta[itemprop="visitsCount"]`).Attr("content") visitTimes := 0 if exists { visitTimes, _ = strconv.Atoi(content) } q.setField("visit-times", visitTimes) return visitTimes } func (q *Question) String() string { return fmt.Sprintf("", q.GetTitle(), q.Link) } // getAnswersOnIndex 解析问题页面,返回第一页的回答 func (q *Question) getAnswersOnIndex() []*Answer { totalNum := q.GetAnswersNum() answers := make([]*Answer, 0, minInt(pageSize, totalNum)) doc := q.Doc() doc.Find("div.zm-item-answer").Each(func(index int, sel *goquery.Selection) { answers = append(answers, q.processSingleAnswer(sel)) }) return answers } // getAnswersByAjax 处理 “更多” 回答,调用 Ajax 接口 func (q *Question) getAnswersByAjax(page int) ([]*Answer, error) { offset := page * pageSize if offset > q.GetAnswersNum() { return nil, errors.New("No more answers.") } // 如果 URL 是 https://www.zhihu.com/question/23759686,则 urlToken 是 23759686 urlToken, _ := strconv.Atoi(q.Link[len(q.Link)-8 : len(q.Link)]) form := url.Values{} form.Set("_xsrf", q.GetXSRF()) form.Set("method", "next") form.Set("params", fmt.Sprintf(`{"url_token":%d,"pagesize":%d,"offset":%d}`, urlToken, pageSize, offset)) link := makeZhihuLink("/node/QuestionAnswerListV2") body := strings.NewReader(form.Encode()) resp, 
err := gSession.Ajax(link, body, q.Link)
	if err != nil {
		return nil, err
	}

	defer resp.Body.Close()

	// The response is JSON; each element of result.Msg is the HTML of one answer.
	result := nodeListResult{}
	err = json.NewDecoder(resp.Body).Decode(&result)
	if err != nil {
		return nil, err
	}

	answers := make([]*Answer, 0, len(result.Msg))
	for _, answerHtml := range result.Msg {
		doc, err := goquery.NewDocumentFromReader(strings.NewReader(answerHtml))
		if err != nil {
			return nil, err
		}
		thisAnswer := q.processSingleAnswer(doc.Selection)
		answers = append(answers, thisAnswer)
	}

	return answers, nil
}

// getMoreAnswers runs the "more" request repeatedly until enough pages for
// limit answers have been requested. A page that fails to load is logged and
// skipped, so the result may contain fewer answers than requested; because
// whole pages are appended it may also contain more than limit.
func (q *Question) getMoreAnswers(limit int) []*Answer {
	answers := make([]*Answer, 0, limit)
	index := 0
	// Number of pages needed for limit answers, rounded up.
	totalPage := (limit + pageSize - 1) / pageSize
	for index < totalPage {
		// Ajax pages are requested starting at 1.
		page := index + 1
		moreAnswers, err := q.getAnswersByAjax(page)
		if err != nil {
			logger.Error("加载第 %d 页回答失败,问题:%s,错误:%s", page, q.Link, err.Error())
		} else {
			answers = append(answers, moreAnswers...)
		}
		index++
	}
	return answers
}

// processSingleAnswer turns the HTML fragment of one answer into an Answer.
// The fragment may come from the question page or from the Ajax endpoint.
func (q *Question) processSingleAnswer(sel *goquery.Selection) *Answer {
	// 1. Extract the answer link.
	answerHref, _ := sel.Find("a.answer-date-link").Attr("href")
	answerLink := makeZhihuLink(answerHref)

	// 2. Extract the author.
	authorSel := sel.Find("div.zm-item-answer-author-info")
	var author *User
	if authorSel.Find("a.author-link").Size() == 0 {
		// Anonymous user: there is no author link.
		author = ANONYMOUS
	} else {
		// Named user.
		x := authorSel.Find("a.author-link")
		userID := strip(x.Text())
		userHref, _ := x.Attr("href")
		author = NewUser(makeZhihuLink(userHref), userID)
	}

	answer := NewAnswer(answerLink, q, author)

	// 3. Extract the upvote count.
	dataIsOwner, _ := sel.Attr("data-isowner")
	isOwner := dataIsOwner == "1" // whether this is the logged-in user's own answer
	var voteText string
	if isOwner {
		voteText = strip(sel.Find("a.zm-item-vote-count").Text())
	} else {
		voteText = strip(sel.Find("div.zm-votebar").Find("span.count").Text())
	}
	answer.setUpvote(upvoteTextToNum(voteText))

	// 4.
获取内容 content, _ := answerSelectionToHtml(sel.Find("div.zm-editable-content")) answer.setContent(content) return answer } func (q *Question) setFollowersNum(value int) { q.setField("followers-num", value) } func (q *Question) setAnswersNum(value int) { q.setField("answers-num", value) } func (q *Question) setVisitTimes(value int) { q.setField("visit-times", value) } ================================================ FILE: question_test.go ================================================ package zhihu import "testing" func init_session() { Init("./examples/config.json") } func Test_GetTitle(t *testing.T) { init_session() question := NewQuestion("https://www.zhihu.com/question/41171543", "") got := question.GetTitle() want := "如何评价第一局比赛 AlphaGo 战胜李世石?" logger.Info("got title: %s", got) logger.Info("expected title: %s", want) if got != want { t.Error("GetTitle() returns error result") } } func Test_GetDetail(t *testing.T) { init_session() question := NewQuestion("https://www.zhihu.com/question/41171543", "") got := question.GetDetail() want := "本题已收录至知乎圆桌 » 对弈人工智能,更多关于李世石对战人工智能的解读欢迎关注讨论。" logger.Info("got detail: %s", got) logger.Info("expected detail: %s", want) if got != want { t.Error("GetDetail() returns error result") } } ================================================ FILE: session.go ================================================ package zhihu import ( "encoding/json" "fmt" "io" "io/ioutil" "net/http" "net/url" "os" "path/filepath" "regexp" "strconv" "strings" "time" "github.com/juju/persistent-cookiejar" ) // Auth 是用于登录的信息,保存了用户名和密码 type Auth struct { Account string `json:"account"` Password string `json:"password"` loginType string // phone_num 或 email loginURL string // 通过 Account 判断 } // isEmail 判断是否通过邮箱登录 func (auth *Auth) isEmail() bool { return isEmail(auth.Account) } // isPhone 判断是否通过手机号登录 func (auth *Auth) isPhone() bool { return regexp.MustCompile(`^1[0-9]{10}$`).MatchString(auth.Account) } func (auth *Auth) toForm() url.Values { if auth.isEmail() { 
auth.loginType = "email" auth.loginURL = makeZhihuLink("/login/email") } else if auth.isPhone() { auth.loginType = "phone_num" auth.loginURL = makeZhihuLink("/login/phone_num") } else { panic("无法判断登录类型: " + auth.Account) } values := url.Values{} logger.Info("登录类型:%s, 登录地址:%s", auth.loginType, auth.loginURL) values.Set(auth.loginType, auth.Account) values.Set("password", auth.Password) values.Set("remember_me", "true") // import! return values } // Session 保持和知乎服务器的会话,用于向服务器发起请求获取 HTML 或 JSON 数据 type Session struct { auth *Auth client *http.Client } type loginResult struct { R int `json:"r"` Msg string `json:"msg"` ErrorCode int `json:"errcode"` Data interface{} `json:"data"` } // NewSession 创建并返回一个 *Session 对象, // 这里没有初始化登录账号信息,账号信息用 `LoadConfig` 通过配置文件进行设置 func NewSession() *Session { s := new(Session) cookieJar, _ := cookiejar.New(nil) s.client = &http.Client{ Jar: cookieJar, } return s } // LoadConfig 从配置文件中读取账号信息 // 配置文件 是 JSON 格式: // { // "account": "xyz@example.com", // "password": "p@ssw0rd" // } func (s *Session) LoadConfig(cfg string) { fd, err := os.Open(cfg) if err != nil { panic("无法打开配置文件 config.json: " + err.Error()) } defer fd.Close() auth := new(Auth) err = json.NewDecoder(fd).Decode(&auth) if err != nil { panic("解析配置文件出错: " + err.Error()) } s.auth = auth // TODO 如果设置了与上一次不一样的账号,最好把 cookies 重置 } // Login 登录并保存 cookies func (s *Session) Login() error { if s.authenticated() { logger.Success("已经是登录状态,不需要重复登录") return nil } form := s.buildLoginForm().Encode() body := strings.NewReader(form) req, err := http.NewRequest("POST", s.auth.loginURL, body) if err != nil { logger.Error("构造登录请求失败:%s", err.Error()) return err } headers := newHTTPHeaders(true) headers.Set("Content-Length", strconv.Itoa(len(form))) headers.Set("Content-Type", "application/x-www-form-urlencoded") headers.Set("Referer", baseZhihuURL) req.Header = headers logger.Info("登录中,用户名:%s", s.auth.Account) resp, err := s.client.Do(req) if err != nil { logger.Error("登录失败:%s", err.Error()) return 
err } if strings.ToLower(resp.Header.Get("Content-Type")) != "application/json" { logger.Error("服务器没有返回 json 数据") return fmt.Errorf("未知的 Content-Type: %s", resp.Header.Get("Content-Type")) } defer resp.Body.Close() result := loginResult{} content, err := ioutil.ReadAll(resp.Body) if err != nil { logger.Error("读取响应内容失败:%s", err.Error()) } logger.Info("登录响应内容:%s", strings.Replace(string(content), "\n", "", -1)) err = json.Unmarshal(content, &result) if err != nil { logger.Error("JSON 解析失败:%s", err.Error()) return err } if result.R == 0 { logger.Success("登录成功!") s.client.Jar.(*cookiejar.Jar).Save() return nil } if result.R == 1 { logger.Warn("登录失败!原因:%s", result.Msg) return fmt.Errorf("登录失败!原因:%s", result.Msg) } logger.Error("登录出现未知错误:%s", string(content)) return fmt.Errorf("登录失败,未知错误:%s", string(content)) } // Get 发起一个 GET 请求,自动处理 cookies func (s *Session) Get(url string) (*http.Response, error) { logger.Info("GET %s", url) req, err := http.NewRequest("GET", url, nil) if err != nil { logger.Error("NewRequest failed with URL: %s", url) return nil, err } req.Header = newHTTPHeaders(false) return s.client.Do(req) } // Post 发起一个 POST 请求,自动处理 cookies func (s *Session) Post(url string, bodyType string, body io.Reader) (*http.Response, error) { logger.Info("POST %s, %s", url, bodyType) req, err := http.NewRequest("POST", url, body) if err != nil { return nil, err } headers := newHTTPHeaders(false) headers.Set("Content-Type", bodyType) req.Header = headers return s.client.Do(req) } // Ajax 发起一个 Ajax 请求,自动处理 cookies func (s *Session) Ajax(url string, body io.Reader, referer string) (*http.Response, error) { logger.Info("AJAX %s, referrer %s", url, referer) req, err := http.NewRequest("POST", url, body) if err != nil { return nil, err } headers := newHTTPHeaders(true) headers.Set("Content-Type", "application/x-www-form-urlencoded") headers.Set("Referer", referer) req.Header = headers return s.client.Do(req) } // authenticated 检查是否已经登录(cookies 没有失效) func (s *Session) 
authenticated() bool { originURL := makeZhihuLink("/settings/profile") resp, err := s.Get(originURL) if err != nil { logger.Error("访问 profile 页面出错: %s", err.Error()) return false } // 如果没有登录,会跳转到 http://www.zhihu.com/?next=%2Fsettings%2Fprofile lastURL := resp.Request.URL.String() logger.Info("获取 profile 的请求,跳转到了:%s", lastURL) return lastURL == originURL } func (s *Session) buildLoginForm() url.Values { values := s.auth.toForm() values.Set("_xsrf", s.searchXSRF()) values.Set("captcha", s.downloadCaptcha()) return values } // 从 cookies 获取 _xsrf 用于 POST 请求 func (s *Session) searchXSRF() string { resp, err := s.Get(baseZhihuURL) if err != nil { panic("获取 _xsrf 失败:" + err.Error()) } // retrieve from cookies for _, cookie := range resp.Cookies() { if cookie.Name == "_xsrf" { return cookie.Value } } return "" } // downloadCaptcha 获取验证码,用于登录 func (s *Session) downloadCaptcha() string { url := makeZhihuLink(fmt.Sprintf("/captcha.gif?r=%d&type=login", 1000*time.Now().Unix())) logger.Info("获取验证码:%s", url) resp, err := s.Get(url) if err != nil { panic("获取验证码失败:" + err.Error()) } if resp.StatusCode != http.StatusOK { panic(fmt.Sprintf("获取验证码失败,StatusCode = %d", resp.StatusCode)) } defer resp.Body.Close() fileExt := strings.Split(resp.Header.Get("Content-Type"), "/")[1] verifyImg := filepath.Join(getCwd(), "verify."+fileExt) fd, err := os.OpenFile(verifyImg, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0777) if err != nil { panic("打开验证码文件失败:" + err.Error()) } defer fd.Close() io.Copy(fd, resp.Body) // 保存验证码文件 openCaptchaFile(verifyImg) // 调用外部程序打开 captcha := readCaptchaInput() // 读取用户输入 return captcha } var ( gSession = NewSession() // 全局的 Session,调用 Init() 初始化 ) // Init 用于传入配置文件,配置全局的 Session func Init(cfgFile string) { // 配置账号信息 gSession.LoadConfig(cfgFile) // 登录 gSession.Login() } // SetSession 用于替换默认的 session func SetSession(s *Session) { gSession = s } ================================================ FILE: session_test.go ================================================ package 
zhihu import ( "testing" ) const cfgFile = "./examples/config.json" func Test_searchXsrf(t *testing.T) { s := NewSession() logger.Debug("_xsrf: %s", s.searchXSRF()) } //func Test_downloadCaptcha(t *testing.T) { // s := NewSession("./example/config.json") // s.downloadCaptcha() //} //func Test_buildLoginForm(t *testing.T) { // s := &Session{} // s.LoadConfig() // values := s.buildLoginForm() // fmt.Println(values.Encode()) //} ================================================ FILE: topic.go ================================================ package zhihu import ( "fmt" "strconv" "github.com/PuerkitoBio/goquery" ) type Topic struct { *Page // name 是改话题的名称 name string } func NewTopic(link string, name string) *Topic { if !validTopicURL(link) { panic("非法的 Topic 链接:%s" + link) } return &Topic{ Page: newZhihuPage(link), name: name, } } // GetName 返回话题名称 func (t *Topic) GetName() string { if t.name != "" { return t.name } //

Python

t.name = strip(t.Doc().Find("h1.zm-editable-content").Text()) return t.name } // GetDescription 返回话题的描述 func (t *Topic) GetDescription() string { if got, ok := t.getStringField("description"); ok { return got } //
// Python 是一种面向对象的解释型计算机程序设计语言,在设计中注重代码的可读性,同时也是一种功能强大的通用型语言。 // // 修改 // //
description := strip(t.Doc().Find("div.zm-editable-content").Text()) t.setField("description", description) return description } // GetFollowersNum 返回关注者数量 func (t *Topic) GetFollowersNum() int { if got, ok := t.getIntField("followers-num"); ok { return got } //
// // 82155 // 人关注了该话题 //
text := strip(t.Doc().Find("div.zm-topic-side-followers-info strong").Text()) num, _ := strconv.Atoi(text) t.setField("followers-num", num) return num } // GetTopAuthors 返回最佳回答者,一般来说是 5 个 func (t *Topic) GetTopAuthors() []*User { authors := make([]*User, 0, 5) div := t.Doc().Find("div#zh-topic-top-answerer") div.Find("div.zm-topic-side-person-item-content").Each(func(index int, sel *goquery.Selection) { tag := sel.Find("a").First() uHref, _ := tag.Attr("href") uId := strip(tag.Text()) thisAuthor := NewUser(makeZhihuLink(uHref), uId) bio, _ := sel.Find("div.zm-topic-side-bio").Attr("title") thisAuthor.setBio(bio) authors = append(authors, thisAuthor) }) return authors } func (t *Topic) String() string { return fmt.Sprintf("", t.GetName(), t.Link) } ================================================ FILE: user.go ================================================ package zhihu import ( "encoding/json" "fmt" "net/url" "strconv" "strings" "github.com/PuerkitoBio/goquery" ) var ( ANONYMOUS = NewUser("", "匿名用户") ) // User 表示一个知乎用户 type User struct { *Page // userId 表示用户的知乎 ID(用户名) userID string } // NewUser 创建一个用户对象。 // link 为空的时候表示匿名用户,此时 userId 仅允许 "匿名用户" 或 "知乎用户"; // userId 可以为空,这种情况下调用 GetUserID 会去解析用户主页 func NewUser(link string, userID string) *User { if link == "" && !isAnonymous(userID) { panic("调用 NewUser 的参数不合法") } return &User{ Page: newZhihuPage(link), userID: userID, } } // GetUserID 返回用户的知乎 ID func (user *User) GetUserID() string { if user.userID != "" { return user.userID } doc := user.Doc() //
// 黄继新, // 和知乎在一起 //
user.userID = strip(doc.Find("div.title-section.ellipsis").Find("span.name").Text()) return user.userID } // GetDataID 返回用户的 data-id func (user *User) GetDataID() string { if user.IsAnonymous() { return "" } if got, ok := user.getStringField("data-id"); ok { return got } doc := user.Doc() // 分两种情况:自己和其他用户 // 1. 其他用户 //
// //
// // 2. 自己 // var dataID string btns := doc.Find("div.zm-profile-header-op-btns") if btns.Size() > 0 { // 1. 其他用户 dataID, _ = btns.Find("button").Attr("data-id") } else { // 2. 自己 script := doc.Find(`script[data-name="ga_vars"]`).Text() data := make(map[string]interface{}) json.Unmarshal([]byte(script), &data) dataID = data["user_hash"].(string) } user.setField("data-id", dataID) return dataID } // GetBio 返回用户的 BIO func (user *User) GetBio() string { if user.IsAnonymous() { return "" } if got, ok := user.getStringField("bio"); ok { return got } doc := user.Doc() // 程序员,用 Python 和 Go 做服务端开发。 bio := strip(doc.Find("span.bio").Eq(0).Text()) user.setField("bio", bio) return bio } // GetLocation 返回用户所在地 func (user *User) GetLocation() string { return user.getProfile("location") } // GetBusiness 返回用户的所在行业 func (user *User) GetBusiness() string { return user.getProfile("business") } // GetEducation 返回用户的教育信息 func (user *User) GetEducation() string { return user.getProfile("education") } // GetGender 返回用户的性别(male/female/unknown) func (user *User) GetGender() string { gender := "unknown" if user.IsAnonymous() { return gender } if got, ok := user.getStringField("gender"); ok { return got } doc := user.Doc() // sel := doc.Find("span.gender").Find("i") if sel.HasClass("icon-profile-male") { gender = "male" } else { gender = "female" } user.setField("gender", gender) return gender } // GetAvatar 返回用户的头像 URL,默认的尺寸 func (user *User) GetAvatar() string { if user.IsAnonymous() { return "" } if got, ok := user.getStringField("avatar"); ok { return got } img := user.Doc().Find("div.body").Find("img.Avatar").First() avatar, _ := img.Attr("src") user.setField("avatar", avatar) return avatar } // GetAvatarWithSize 返回指定尺寸的的头像 URL,size 支持的值:s, xs, m, l, xl, hd, "" func (user *User) GetAvatarWithSize(size string) string { defaultAvatar := user.GetAvatar() if defaultAvatar == "" { return defaultAvatar } if !validateAvatarSize(size) { return defaultAvatar } return 
replaceAvatarSize(defaultAvatar, size) } // GetWeiboURL 返回用户的微博主页 URL func (user *User) GetWeiboURL() string { if user.IsAnonymous() { return "" } if got, ok := user.getStringField("weibo-url"); ok { return got } value := "" tag := user.Doc().Find("a.zm-profile-header-user-weibo") if tag.Size() > 0 { value, _ = tag.First().Attr("href") } user.setField("weibo-url", value) return value } // GetFollowersNum 返回用户的粉丝数量 func (user *User) GetFollowersNum() int { return user.getFollowersNumOrFolloweesNum("followers-num") } // GetFolloweesNum 返回用户关注的数量 func (user *User) GetFolloweesNum() int { return user.getFollowersNumOrFolloweesNum("followees-num") } // GetFollowedColumnsNum 返回用户关注的专栏数量 func (user *User) GetFollowedColumnsNum() int { return user.getFollowedColumnsOrTopicsNum("followed-columns-num") } // GetFollowedTopicsNum 返回用户关注的话题数量 func (user *User) GetFollowedTopicsNum() int { return user.getFollowedColumnsOrTopicsNum("followed-topics-num") } // GetAgreeNum 返回用户的点赞数 func (user *User) GetAgreeNum() int { return user.getAgreeOrThanksNum("agree-num") } // GetThanksNum 返回用户的感谢数 func (user *User) GetThanksNum() int { return user.getAgreeOrThanksNum("thanks-num") } // GetAsksNum 返回用户的提问数 func (user *User) GetAsksNum() int { return user.getProfileNum("asks-num") } // GetAnswersNum 返回用户的回答数 func (user *User) GetAnswersNum() int { return user.getProfileNum("answers-num") } // GetPostsNum 返回用户的专栏文章数量 func (user *User) GetPostsNum() int { return user.getProfileNum("posts-num") } // GetCollectionsNum 返回用户的收藏夹数量 func (user *User) GetCollectionsNum() int { return user.getProfileNum("collections-num") } // GetLogsNum 返回用户公共编辑数量 func (user *User) GetLogsNum() int { return user.getProfileNum("logs-num") } // GetFolloweesN 返回前 n 个用户关注的人,如果 n < 0,返回所有关注的人 func (user *User) GetFolloweesN(n int) []*User { users, err := user.getFolloweesOrFollowers("followees", n) if err != nil { logger.Error("获取 %s 关注的人失败:%s", user.String(), err.Error()) return nil } return users } // GetFollowees 
// returns all the people the user follows.
func (user *User) GetFollowees() []*User {
	return user.GetFolloweesN(-1)
}

// GetFollowersN returns the first n followers; a negative n means all of
// them. On failure the error is logged and nil is returned.
func (user *User) GetFollowersN(n int) []*User {
	users, err := user.getFolloweesOrFollowers("followers", n)
	if err != nil {
		logger.Error("获取 %s 的粉丝失败:%s", user.String(), err.Error())
		return nil
	}
	return users
}

// GetFollowers returns the list of the user's followers.
func (user *User) GetFollowers() []*User {
	return user.GetFollowersN(-1)
}

// GetAsksN returns the first n questions the user asked; a negative n (or
// one larger than the total) means all of them. It returns nil for an
// anonymous user, when there is nothing to return, or when a page fails to load.
func (user *User) GetAsksN(n int) []*Question {
	if user.IsAnonymous() {
		return nil
	}

	total := user.GetAsksNum()
	if n < 0 || n > total {
		n = total
	}
	if n == 0 {
		return nil
	}

	page := 1
	questions := make([]*Question, 0, n)
	// Pages are 1-based; the bound is the number of pages needed for n items plus one.
	for page < ((n-1)/pageSize + 2) {
		link := urlJoin(user.Link, fmt.Sprintf("/asks?page=%d", page))
		doc, err := newDocumentFromURL(link)
		if err != nil {
			return nil
		}

		doc.Find("div#zh-profile-ask-list").Children().Each(func(index int, sel *goquery.Selection) {
			a := sel.Find("a.question_link")
			title := strip(a.Text())
			href, _ := a.Attr("href")
			questionLink := makeZhihuLink(href)
			thisQuestion := NewQuestion(questionLink, title)

			// Answer count: taken from the node at index 4 of div.meta.
			answersNum := reMatchInt(strip(sel.Find("div.meta").Contents().Eq(4).Text()))
			thisQuestion.setAnswersNum(answersNum)

			// Follower count: taken from the node at index 6 of div.meta.
			followersNum := reMatchInt(strip(sel.Find("div.meta").Contents().Eq(6).Text()))
			thisQuestion.setFollowersNum(followersNum)

			// View count.
			visitTimes, _ := strconv.Atoi(strip(sel.Find("div.zm-profile-vote-num").Text()))
			thisQuestion.setVisitTimes(visitTimes)

			questions = append(questions, thisQuestion)
		})

		// Stop as soon as enough questions were collected.
		if n > 0 && len(questions) >= n {
			return questions[:n]
		}

		page++
	}

	return questions
}

// GetAsks returns all questions the user asked.
func (user *User) GetAsks() []*Question {
	return user.GetAsksN(-1)
}

// GetAnswersN returns the first n answers the user wrote; a negative n (or
// one larger than the total) means all of them. It returns nil for an
// anonymous user or when there is nothing to return.
func (user *User) GetAnswersN(n int) []*Answer {
	if user.IsAnonymous() {
		return nil
	}

	total := user.GetAnswersNum()
	if n < 0 || n > total {
		n = total
	}
	if n == 0 {
		return nil
	}

	page := 1
	answers := make([]*Answer, 0, n)
for page < ((n-1)/pageSize + 2) {
		link := urlJoin(user.Link, fmt.Sprintf("/answers?page=%d", page))
		doc, err := newDocumentFromURL(link)
		if err != nil {
			return nil
		}

		doc.Find("div#zh-profile-answer-list").Children().Each(func(index int, sel *goquery.Selection) {
			a := sel.Find("a.question_link")
			qTitle := strip(a.Text())
			answerHref, _ := a.Attr("href")

			// The question link is the part of the answer link before
			// "/answer". Skip entries without it rather than slicing with a
			// negative index, which would panic.
			cut := strings.Index(answerHref, "/answer")
			if cut < 0 {
				return
			}
			qLink := makeZhihuLink(answerHref[:cut])
			question := NewQuestion(qLink, qTitle)
			thisAnswer := NewAnswer(makeZhihuLink(answerHref), question, user)

			// A missing or non-numeric vote count deliberately yields 0.
			voteText, _ := sel.Find("a.zm-item-vote-count").Attr("data-votecount")
			vote, _ := strconv.Atoi(voteText)
			thisAnswer.setUpvote(vote)

			answers = append(answers, thisAnswer)
		})

		// Stop as soon as enough answers were collected.
		if n > 0 && len(answers) >= n {
			return answers[:n]
		}

		page++
	}

	return answers
}

// GetAnswers returns all answers the user wrote.
func (user *User) GetAnswers() []*Answer {
	return user.GetAnswersN(-1)
}

// GetCollectionsN returns the user's first n collections; a negative n (or
// one larger than the total) means all of them. It returns nil for an
// anonymous user, when there is nothing to return, or when a page fails to load.
func (user *User) GetCollectionsN(n int) []*Collection {
	if user.IsAnonymous() {
		return nil
	}

	total := user.GetCollectionsNum()
	if n < 0 || n > total {
		n = total
	}
	if n == 0 {
		return nil
	}

	page := 1
	collections := make([]*Collection, 0, n)
	// Pages are 1-based; the bound is the number of pages needed for n items plus one.
	for page < ((n-1)/pageSize + 2) {
		link := urlJoin(user.Link, fmt.Sprintf("/collections?page=%d", page))
		doc, err := newDocumentFromURL(link)
		if err != nil {
			return nil
		}

		doc.Find("div.zm-profile-section-item").Each(func(index int, sel *goquery.Selection) {
			a := sel.Find("a.zm-profile-fav-item-title")
			cName := strip(a.Text())
			href, _ := a.Attr("href")
			cLink := makeZhihuLink(href)
			thisCollection := NewCollection(cLink, cName, user)
			collections = append(collections, thisCollection)
		})

		// Stop as soon as enough collections were collected.
		if n > 0 && len(collections) >= n {
			return collections[:n]
		}

		page++
	}

	return collections
}

// GetCollections returns all of the user's collections.
func (user *User) GetCollections() []*Collection {
	return user.GetCollectionsN(-1)
}

// GetFollowedTopicsN returns the first n topics the user follows; a negative
// n (or one larger than the total) means all of them.
func (user *User) GetFollowedTopicsN(n int) []*Topic {
	if user.IsAnonymous() {
return nil } total := user.GetFollowedTopicsNum() if n < 0 || n > total { n = total } if n == 0 { return nil } var ( link = urlJoin(user.Link, "/topics") gotDataNum = pageSize offset = 0 topics = make([]*Topic, 0, n) ) form := url.Values{} form.Set("_xsrf", user.GetXSRF()) form.Set("start", "0") for gotDataNum == pageSize { form.Set("offset", strconv.Itoa(offset)) doc, dataNum, err := newDocByNormalAjax(link, form) if err != nil { return nil } doc.Find("div.zm-profile-section-item").Each(func(index int, sel *goquery.Selection) { tName := strip(sel.Find("strong").Text()) tHref, _ := sel.Find("a.zm-list-avatar-link").Attr("href") thisTopic := NewTopic(makeZhihuLink(tHref), tName) topics = append(topics, thisTopic) }) if n > 0 && len(topics) >= n { return topics[:n] } gotDataNum = dataNum offset += gotDataNum } return topics } // GetFollowedTopics 返回用户关注的话题 func (user *User) GetFollowedTopics() []*Topic { return user.GetFollowedTopicsN(-1) } // GetLikes 返回用户赞过的回答 func (user *User) GetLikes() []*Answer { if user.IsAnonymous() { return nil } // TODO return nil } // GetVotedAnswers 是 GetLikes 的别名 func (user *User) GetVotedAnswers() []*Answer { return user.GetLikes() } // IsAnonymous 表示该用户是否匿名用户 func (user *User) IsAnonymous() bool { return isAnonymous(user.userID) } func (user *User) String() string { if user.IsAnonymous() { return fmt.Sprintf("", user.userID) } return fmt.Sprintf("", user.userID, user.Link) } func (user *User) getProfile(cacheKey string) string { if user.IsAnonymous() { return "" } if got, ok := user.getStringField(cacheKey); ok { return got } doc := user.Doc() // 深圳 // ... // ... 
value, _ := doc.Find(fmt.Sprintf("span.%s", cacheKey)).Attr("title")
	user.setField(cacheKey, value)
	return value
}

// getFollowersNumOrFolloweesNum returns the follower or followee count,
// selected by cacheKey ("followers-num" or "followees-num"). It returns 0 for
// an anonymous user or an unknown key. The value is cached.
func (user *User) getFollowersNumOrFolloweesNum(cacheKey string) int {
	if user.IsAnonymous() {
		return 0
	}

	if got, ok := user.getIntField(cacheKey); ok {
		return got
	}

	// Position of the wanted number among the matched strong elements.
	var index int
	switch cacheKey {
	case "followees-num":
		index = 0
	case "followers-num":
		index = 1
	default:
		return 0
	}

	doc := user.Doc()

	// Both numbers are strong elements inside div.zm-profile-side-following.
	value := doc.Find("div.zm-profile-side-following a strong").Eq(index).Text()
	// A non-numeric text yields 0.
	num, _ := strconv.Atoi(value)
	user.setField(cacheKey, num)
	return num
}

// getFollowedColumnsOrTopicsNum returns the number of followed columns or
// topics, selected by cacheKey ("followed-columns-num" or
// "followed-topics-num"). It returns 0 for an anonymous user, an unknown key,
// or when the sidebar block is absent. The value is cached.
func (user *User) getFollowedColumnsOrTopicsNum(cacheKey string) int {
	if user.IsAnonymous() {
		return 0
	}

	if got, ok := user.getIntField(cacheKey); ok {
		return got
	}

	var selector string
	switch cacheKey {
	case "followed-topics-num":
		selector = "div.zm-profile-side-topics"
	case "followed-columns-num":
		selector = "div.zm-profile-side-columns"
	default:
		return 0
	}

	doc := user.Doc()
	result := 0
	sel := doc.Find(selector)
	if sel.Size() > 0 {
		// The number is looked up from the parent of the matched block.
		text := sel.Parent().Find("a.zg-link-litblue").Find("strong").Text()
		result = reMatchInt(strip(text))
	}
	user.setField(cacheKey, result)
	return result
}

// getAgreeOrThanksNum returns the number of upvotes or thanks the user
// received, selected by cacheKey ("agree-num" or "thanks-num"). It returns 0
// for an anonymous user or an unknown key. The value is cached.
func (user *User) getAgreeOrThanksNum(cacheKey string) int {
	if user.IsAnonymous() {
		return 0
	}

	var selector string
	switch cacheKey {
	case "agree-num":
		selector = "span.zm-profile-header-user-agree > strong"
	case "thanks-num":
		selector = "span.zm-profile-header-user-thanks > strong"
	default:
		return 0
	}

	if got, ok := user.getIntField(cacheKey); ok {
		return got
	}

	doc := user.Doc()

	// The original sample showed the profile header text
	// "received 68200 upvotes, 17511 thanks"; each number sits in the
	// strong element matched by the selector above.
	num, _ := strconv.Atoi(doc.Find(selector).Text())
	user.setField(cacheKey, num)
	return num
}

// getProfileNum returns one of the counters in the profile navigation bar,
// selected by cacheKey ("asks-num", "answers-num", "posts-num",
// "collections-num", "logs-num"). It returns 0 for an anonymous user or an
// unknown key. The value is cached.
func (user *User) getProfileNum(cacheKey string) int {
	if user.IsAnonymous() {
		return 0
	}

	if got, ok := user.getIntField(cacheKey); ok {
		return got
	}

	// Position of the wanted counter among the span.num elements.
	var index int
	switch cacheKey {
	case "asks-num":
		index = 0
	case "answers-num":
		index = 1
	case "posts-num":
		index = 2
	case "collections-num":
		index = 3
	case "logs-num":
		index = 4
	default:
		return 0
	}

	doc := user.Doc()

	// The counters are span.num elements inside div.profile-navbar.
	value := doc.Find("div.profile-navbar").Find("span.num").Eq(index).Text()
	// A non-numeric text yields 0.
	num, _ := strconv.Atoi(value)
	user.setField(cacheKey, num)
	return num
}

func (user *User) getFolloweesOrFollowers(eeOrEr string, limit int) ([]*User, error) {
	if user.IsAnonymous() {
		return nil, nil
	}

	if limit == 0 {
		return nil, nil
	}

	var (
		referer, ajaxURL string
		offset, totalNum int
		hashID           = user.GetDataID()
	)

	// NOTE(review): the "followees" branch reads GetFollowersNum and the
	// other branch reads GetFolloweesNum; the two totals look swapped.
	// Confirm and fix together with the rest of this function.
	if eeOrEr == "followees" {
		referer = urlJoin(user.Link, "/followees")
		ajaxURL = makeZhihuLink("/node/ProfileFolloweesListV2")
		totalNum = user.GetFollowersNum()
	} else {
		referer = urlJoin(user.Link, "/followers")
		ajaxURL = makeZhihuLink("/node/ProfileFollowersListV2")
		totalNum = user.GetFolloweesNum()
	}

	if limit < 0 || limit > totalNum {
		limit = totalNum
	}

	form := url.Values{}
	form.Set("_xsrf", user.GetXSRF())
	form.Set("method", "next")

	users := make([]*User, 0, limit)
	for {
		form.Set("params", fmt.Sprintf(`{"offset":%d,"order_by":"created","hash_id":"%s"}`, offset, hashID))
		body := strings.NewReader(form.Encode())
		resp, err := gSession.Ajax(ajaxURL, body, referer)
		if err != nil {
			return nil, err
		}

		// NOTE(review): this defer sits inside the loop, so every response
		// body stays open until the function returns.
		defer resp.Body.Close()

		result := nodeListResult{}
		err = json.NewDecoder(resp.Body).Decode(&result)
		if err != nil {
			logger.Error("json decode failed: %s", err.Error())
			return nil, err
		}

		for _, userHTML := range result.Msg {
			thisUser, err := newUserFromHTML(userHTML)
			if err != nil {
				return nil, err
			}
			users = append(users, thisUser)
			if len(users) == limit {
				break
			}
		}

		// Either the requested number has been collected, or there are not
		// enough entries but the last page has been reached.
		if len(users) == limit || len(result.Msg) <
pageSize { break } else { offset += pageSize } } return users, nil } func (user *User) setFollowersNum(value int) { user.setField("followers-num", value) } func (user *User) setAsksNum(value int) { user.setField("asks-num", value) } func (user *User) setAnswersNum(value int) { user.setField("answers-num", value) } func (user *User) setAgreeNum(value int) { user.setField("agree-num", value) } func (user *User) setBio(value string) { user.setField("bio", value) } func isAnonymous(userID string) bool { return userID == "匿名用户" || userID == "知乎用户" } func newUserFromHTML(html string) (*User, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { logger.Error("NewDocumentFromReader failed: %s", err.Error()) return nil, err } return newUserFromSelector(doc.Selection), nil } func newUserFromSelector(sel *goquery.Selection) *User { a := sel.Find("h2.zm-list-content-title").Find("a.zg-link") if a.Size() == 0 { // 匿名用户,没有用户主页入口 return ANONYMOUS } userId := strip(a.Text()) link, _ := a.Attr("href") user := NewUser(link, userId) // 获取 BIO bio := strip(sel.Find("div.zg-big-gray").Text()) user.setField("bio", bio) // 获取关注者数量 followersNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(0).Text())) user.setFollowersNum(followersNum) // 获取提问数 asksNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(1).Text())) user.setAsksNum(asksNum) // 获取回答数 answersNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(2).Text())) user.setAnswersNum(answersNum) // 获取赞同数 agreeNum := reMatchInt(strip(sel.Find("div.details").Find("a").Eq(3).Text())) user.setAgreeNum(agreeNum) return user } ================================================ FILE: util.go ================================================ package zhihu import ( "fmt" "io/ioutil" "net/http" "os" "os/exec" "path/filepath" "regexp" "runtime" "strconv" "strings" "github.com/PuerkitoBio/goquery" "github.com/fatih/color" ) const ( userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36" baseZhihuURL = "https://www.zhihu.com" pageSize = 20 ) var ( reQuestionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/question/[0-9]{8}$") reCollectionURL = regexp.MustCompile("^(http|https)://www.zhihu.com/collection/[0-9]{8,9}$") // bugfix: for private collection reTopicURL = regexp.MustCompile("^(http|https)://www.zhihu.com/topic/[0-9]{8}$") reGetNumber = regexp.MustCompile(`([0-9])+`) reAvatarReplacer = regexp.MustCompile(`_(s|xs|m|l|xl|hd).(png|jpg)`) reIsEmail = regexp.MustCompile(`^[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,4}$`) logger = Logger{Enabled: true} ) func validQuestionURL(value string) bool { return reQuestionURL.MatchString(value) } func validCollectionURL(value string) bool { return reCollectionURL.MatchString(value) } func validTopicURL(value string) bool { return reTopicURL.MatchString(value) } func reMatchInt(raw string) int { matched := reGetNumber.FindStringSubmatch(raw) if len(matched) == 0 { return 0 } rv, _ := strconv.Atoi(matched[0]) return rv } func validateAvatarSize(size string) bool { for _, x := range []string{"s", "xs", "m", "l", "xl", "hd"} { if size == x { return true } } return false } func replaceAvatarSize(origin string, size string) string { return reAvatarReplacer.ReplaceAllString(origin, fmt.Sprintf("_%s.$2", size)) } func isEmail(value string) bool { return reIsEmail.MatchString(value) } func newHTTPHeaders(isXhr bool) http.Header { headers := make(http.Header) headers.Set("Accept", "*/*") headers.Set("Connection", "keep-alive") headers.Set("Host", "www.zhihu.com") headers.Set("Origin", "http://www.zhihu.com") headers.Set("Pragma", "no-cache") headers.Set("User-Agent", userAgent) if isXhr { headers.Set("X-Requested-With", "XMLHttpRequest") } return headers } func strip(s string) string { return strings.TrimSpace(s) } func minInt(a, b int) int { if a > b { return b } return a } func getCwd() string { cwd, err := os.Getwd() if err != 
nil { panic("获取 CWD 失败:" + err.Error()) } return cwd } func save(filename string, content []byte) error { return ioutil.WriteFile(filename, content, 0666) } func saveString(filename string, content string) error { return ioutil.WriteFile(filename, []byte(content), 0666) } func openCaptchaFile(filename string) error { logger.Info("调用外部程序渲染验证码……") var args []string switch runtime.GOOS { case "linux": args = []string{"xdg-open", filename} case "darwin": args = []string{"open", filename} case "freebsd": args = []string{"open", filename} case "netbsd": args = []string{"open", filename} case "windows": var ( cmd = "url.dll,FileProtocolHandler" runDll32 = filepath.Join(os.Getenv("SYSTEMROOT"), "System32", "rundll32.exe") ) args = []string{runDll32, cmd, filename} default: fmt.Printf("无法确定操作系统,请自行打开验证码 %s 文件,并输入验证码。", filename) } logger.Info("Command: %s", strings.Join(args, " ")) err := exec.Command(args[0], args[1:]...).Run() if err != nil { return err } return nil } func readCaptchaInput() string { var captcha string fmt.Print(color.CyanString("请输入验证码:")) fmt.Scanf("%s", &captcha) return captcha } func makeZhihuLink(path string) string { return urlJoin(baseZhihuURL, path) } func urlJoin(base, path string) string { if strings.HasSuffix(base, "/") { base = strings.TrimRight(base, "/") } if strings.HasPrefix(path, "/") { path = strings.TrimLeft(path, "/") } return base + "/" + path } // newDocumentFromUrl 会请求给定的 url,并返回一个 goquery.Document 对象用于解析 func newDocumentFromURL(url string) (*goquery.Document, error) { resp, err := gSession.Get(url) if err != nil { logger.Error("请求 %s 失败:%s", url, err.Error()) return nil, err } doc, err := goquery.NewDocumentFromResponse(resp) if err != nil { logger.Error("解析页面失败:%s", err.Error()) } return doc, err } // ZhihuPage 是一个知乎页面,User, Question, Answer, Collection 的公共部分 type Page struct { // Link 是该页面的链接 Link string // doc 是 HTML document doc *goquery.Document // fields 是字段缓存,避免重复解析页面 fields map[string]interface{} } // newZhihuPage 是 private 
的构造器 func newZhihuPage(link string) *Page { return &Page{ Link: link, fields: make(map[string]interface{}), } } // Doc 用于获取当前问题页面的 HTML document,惰性求值 func (page *Page) Doc() *goquery.Document { if page.doc != nil { return page.doc } err := page.Refresh() if err != nil { return nil } return page.doc } // Refresh 会重新载入当前页面,获取最新的数据 func (page *Page) Refresh() (err error) { page.fields = make(map[string]interface{}) // 清空缓存 page.doc, err = newDocumentFromURL(page.Link) // 重载页面 return err } // GetXsrf 从当前页面内容抓取 xsrf 的值 func (page *Page) GetXSRF() string { doc := page.Doc() value, _ := doc.Find(`input[name="_xsrf"]`).Attr("value") return value } // totalPages 获取总页数 func (page *Page) totalPages() int { return getTotalPages(page.Doc()) } func (page *Page) setField(field string, value interface{}) { page.fields[field] = value } func (page *Page) getIntField(field string) (value int, exists bool) { if got, ok := page.fields[field]; ok { return got.(int), true } return 0, false } func (page *Page) getStringField(field string) (value string, exists bool) { if got, ok := page.fields[field]; ok { return got.(string), true } return "", false } func getTotalPages(doc *goquery.Document) int { pager := doc.Find("div.zm-invite-pager") if pager.Size() == 0 { return 1 } text := pager.Find("span").Eq(-2).Text() pages, _ := strconv.Atoi(text) return pages } // nodeListResult 是形如 /node/XXListV2 这样的 Ajax 请求的 JSON 返回值 type nodeListResult struct { R int `json:"r"` // 状态码,正确的情况为 0 Msg []string `json:"msg"` // 回答内容,每个元素都是一段 HTML 片段 } // normalAjaxResult 是页面内,目标 URL 和当前页面 URL 相同的 Ajax 请求返回的 JSON 数据 type normalAjaxResult struct { R int `json:"r"` Msg []interface{} `json:"msg"` // 两个元素,第一个为话题数量,第二个是 HTML 片段 } ================================================ FILE: util_test.go ================================================ package zhihu import ( "testing" ) func Test_validQuestionURL(t *testing.T) { ioMap := map[string]bool{ "https://www.zhihu.com/question/37284137": true, 
"http://www.zhihu.com/question/41114729": true, "https://www.zhihu.com/question/41114729x": false, "https://www.zhihu.com/question/4111472": false, "https://www.zhihu.com/": false, } for value, expectedResult := range ioMap { if validQuestionURL(value) != expectedResult { t.Error("validQuestionURL returns error result") } } }