Repository: airyland/we-extract Branch: master Commit: 3b168e0bfef9 Files: 13 Total size: 442.7 KB Directory structure: gitextract_dgt8bub_/ ├── .gitignore ├── .npmignore ├── README.MD ├── errors.js ├── index.js ├── links.md ├── package.json ├── parse-wechat-url.js ├── test/ │ ├── mp-link.html │ └── mp-links.js ├── test.js ├── util.js └── video.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .DS_Store node_modules links temp ================================================ FILE: .npmignore ================================================ links temp .DS_Store test ================================================ FILE: README.MD ================================================ # we-extract ## 介绍 `we-extract` 用以解析微信公众号文章的账号及文章信息，居家旅行、采集微信公众号文章必备工具。 `we-extract` 是微信公众号 RSS 订阅服务 `WeRss` 的核心解析工具，欢迎使用：

## 安装 ``` npm install we-extract // or yarn add we-extract ``` ## 使用 > Node 版本需要支持 async ``` js const extract = require('we-extract').extract const rs = await extract('微信文章 url 或者文章内容') // 选项 const rs = await extract('微信文章 url 或者文章内容', { shouldReturnContent: true, // 是否返回内容，默认返回 shouldExtractMpLinks: false, // v2.1.0 是否返回文章中出现的所有公众号文章链接，如果为 true，将返回 mp_links 数组 shouldExtractTags: false, // v2.2.0 是否解析文章中的收录标签 shouldExtractRepostMeta: false // v2.2.3 是否解析转载文章来源 }) ``` ## 返回结果说明 > 正确返回 ``` js { done: true, code: 0, data: { account_name: '微信派', account_alias: 'wx-pai', account_avatar: 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM7Xb5Qbdia5AuGTX4AeZSWYlv5TEqD1FicUDOrnEIwVak1A/132', account_description: '微信第一手官方活动信息发布，线下沙龙活动在线互动平台。独家分享微信公众平台优秀案例，以及权威专家的精彩观点。', account_id: 'gh_bc5ec2ee663f', account_biz: 'MjM5NjM4MDAxMg==', account_biz_number: 2396380012, account_qr_code: 'https://open.weixin.qq.com/qr/code?username=gh_bc5ec2ee663f', msg_has_copyright: false, // 是否原创 msg_content: '省略的文章内容', msg_author: null, // 作者 msg_sn: '9a0a54f2e7c8ac4019812aa78bd4b3e0', msg_idx: 1, msg_mid: 2655078412, msg_title: '重磅 | 微信订阅号全新改版上线！', msg_desc: '今后，头图也很重要', msg_link: 'http://mp.weixin.qq.com/s?__biz=MjM5NjM4MDAxMg==&mid=2655078412&idx=1&sn=9a0a54f2e7c8ac4019812aa78bd4b3e0&chksm=bd5fc40f8a284d19360e956074ffced37d8e2d78cb01a4ecdfaae40247823e7056b9d31ae3ef#rd', msg_source_url: null, // 音频，视频时，此处为音频、视频链接 msg_cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/OiaFLUqewuIDldpxsV3ZYJzzyH9HTFsSwOEPX82WEvBZozGiam3LbRSzpIIKGzj72nxjhLjnscWsibDPFmnpFZykg/0?wx_fmt=jpeg', msg_article_type: null, // 文章分类 msg_publish_time: '2018-06-20T10:52:35.000Z', // date 类型 msg_publish_time_str: '2018/06/20 18:52:35', msg_type: 'post', // 可能为 post text repost voice video image mp_links: [{ // 在 shouldExtractMpLinks = true 时返回 title: '', href: '' }], tags: [{ // 在 shouldExtractTags = true 时返回 id: '', url: '', name: '', count: 1 }], repost_meta: { // 在 shouldExtractRepostMeta = true 时返回 account_name: '文章来源账号名字' } } } ``` > 错误返回 ``` js 
{ done: false, code: 2002, msg: '链接已过期' } ``` ## 常见错误 `we-extract` 定义了详细的错误信息方便开发和出错处理，`1` 开头错误表示可能需要重试(或者暂时将内容保存下来 debug)，`2` 开头错误表示没有疑问的错误，可以不处理。请使用 code(数字类型) 来判断而不是 message 内容，因为 message 可能会变化。 ``` js module.exports = { '1000': '解析失败，可能文章内容不完整', '1001': '字段缺失', '1002': '请求文章内容失败', '1003': '请求文章内容为空', '1004': '访问过于频繁(URL模式)', // 可以换 ip 重新请求，注意与 2010 的区别 '1005': 'js 变量解析出错', '1006': '链接重定向', // 关闭 shouldFollowTransferLink 后会返回此错误 '2001': '参数缺失', '2002': '链接已过期', '2003': '该内容被投诉且经审核涉嫌侵权，无法查看', '2004': '公众号迁移但文章未同步', '2005': '该内容已被发布者删除', '2006': '此内容因违规无法查看', '2007': '涉嫌违反相关法律法规和政策发送失败', '2008': '微信文章系统出错', '2009': '链接不正确', '2010': '访问过于频繁(HTML模式)', // 解析参数为直接的文章内容，此时该篇内容已经无效，可以丢弃 '2011': '由用户投诉并经平台审核，涉嫌过度营销、骚扰用户', '2012': '此帐号已被屏蔽', '2013': '此帐号已自主注销', '2014': '不实信息', '2015': '此帐号处于帐号迁移流程中', '2016': '冒名侵权' } ``` ## 经验 + 一篇微信文章由 biz+mid+idx 唯一确定，mid 在单个公众号内唯一。 + 文章所属账号信息以文章解析结果为准，采集搜狗时不要相信账号名字，因为搜狗显示的可能是改名或者迁移前的账号信息。 + 如果在搜狗微信搜不到账号，极有可能是因为公众号改了名字，试试以前的名字应该能搜索到。 + 微信链接的 search 拼接符可能为 `&amp;`，需要替换为 `&`，否则解析链接参数时会有问题。 + 一个 ip 获取微信文章内容有限制，需要限制速率或者轮换 ip。 ## 链接类型图片：https://mp.weixin.qq.com/s/5tpbsFR1k_3744P0Egdnxg ================================================ FILE: errors.js ================================================ module.exports = { '1000': '解析失败，可能文章内容不完整', '1001': '字段缺失', '1002': '请求文章内容失败', '1003': '请求文章内容为空', '1004': '访问过于频繁(url模式)', // 可以换 ip 重新请求，注意与 2010 的区别 '1005': 'js 变量解析出错', '1006': '链接重定向', // 开启不 follow transfer link 后会有这个错误 '2001': '参数缺失', '2002': '链接已过期', '2003': '该内容被投诉且经审核涉嫌侵权，无法查看', '2004': '公众号迁移但文章未同步', '2005': '该内容已被发布者删除', '2006': '此内容因违规无法查看', '2007': '涉嫌违反相关法律法规和政策发送失败', '2008': '微信文章系统出错', '2009': '链接不正确', '2010': '访问过于频繁(HTML模式)', // 解析参数为直接的文章内容，此时该篇内容已经无效，可以丢弃 '2011': '由用户投诉并经平台审核，涉嫌过度营销、骚扰用户', '2012': '帐号已被屏蔽, 内容无法查看', '2013': '此帐号已自主注销', '2014': '不实信息', '2015': '此帐号处于帐号迁移流程中', '2016': '冒名侵权' } ================================================ FILE: index.js ================================================ const qs = require('qs') const dayjs = require('dayjs') const request = require('request-promise') const cheerio = 
require('cheerio') const parseUrl = require('./parse-wechat-url') const errors = require('./errors') const unescape = require('lodash.unescape') const { getParameterByName, normalizeUrl } = require('./util') const video = require('./video') const defaultConfig = { shouldReturnRawMeta: false, shouldReturnContent: true, shouldFollowTransferLink: true, shouldExtractMpLinks: false, shouldExtractTags: false, shouldExtractRepostMeta: false } const basic = {} basic.accountId = '' basic.accountAvatar = '' basic.accountBiz = null basic.accountBizNumber = null basic.accountName = null const getError = function(code) { return { done: false, code: code, msg: errors[code] } } const extract = async function(html, options = {}) { const { shouldReturnRawMeta, shouldReturnContent, shouldFollowTransferLink, shouldExtractMpLinks, shouldExtractTags, shouldExtractRepostMeta } = Object.assign({}, defaultConfig, options) let paramType = 'HTML' // 参数为 URL 还是 HTML let url = null let rawUrl = null if (options.url) { url = normalizeUrl(options.url) } let type = 'post' let hasCopyright = false let shareContentTpl if (!html) { return getError(2001) } // 参数错误 // 支持地址 if (/^http/.test(html)) { html = normalizeUrl(html) if (!/http(s?):\/\/mp.weixin.qq.com/.test(html) && !/http(s?):\/\/weixin.sogou.com/.test(html)) { return getError(2009) } paramType = 'URL' rawUrl = html if (!url) { url = html } let host = 'mp.weixin.qq.com' if (/http(s?):\/\/weixin.sogou.com/.test(html)) { host = 'weixin.sogou.com' } try { html = await request({ uri: html, method: 'GET', headers: { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': host } }) // unknown purpose // if (html.includes('location.replace')) { // const rs = 
html.match(/([\s\S]*?)<\/script>/gi) // if (rs && rs[0]) { // const code = rs[0].split('\n').filter(one => { // return !one.includes('location.replace') && !one.includes('script>') // }).join('\n') + '\n return url;' // try { // const fn = new Function(code) // return await extract(fn(), options) // } catch (e) { // return getError(1005) // } // } // } } catch (e) { return getError(1002) } } else { html = html.replace(/\\n/g, '') } if (!html) { return getError(1003) } if (html.includes('访问过于频繁') && !html.includes('js_content')) { return paramType === 'URL' ? getError(1004) : getError(2010) } else if (html.includes('链接已过期') && !html.includes('js_content')) { return getError(2002) } else if (html.includes('被投诉且经审核涉嫌侵权，无法查看')) { return getError(2003) } else if (html.includes('该公众号已迁移')) { const match = html.match(/var\stransferTargetLink\s=\s'(.*?)';/) if (match && match[1]) { if (shouldFollowTransferLink) { return await extract(match[1]) } else { return { ...getError(1006), url: match[1] } } } else { return getError(2004) } } else if (html.includes('该内容已被发布者删除')) { return getError(2005) } else if (html.includes('此内容因违规无法查看')) { return getError(2006) } else if (html.includes('此内容发送失败无法查看')) { return getError(2007) } else if (html.includes('由用户投诉并经平台审核，涉嫌过度营销、骚扰用户')) { return getError(2011) } else if (html.includes('此帐号已被屏蔽') && !html.includes('id="js_content"')) { return getError(2012) } else if (html.includes('此帐号已自主注销') && !html.includes('id="js_content"')) { return getError(2013) } else if (!html.includes('id="js_content"') && html.includes('此帐号处于帐号迁移流程中')) { return getError(2015) } else if (html.includes('page_rumor') && !html.includes('id="js_content"')) { return getError(2014) } else if (html.includes('投诉类型') && html.includes('冒名侵权')) { return getError(2016) } else if (!html.includes('id="js_content"') && html.includes('参数错误') && html.includes('appmsg/error.html')) { return getError(2009) } else if (!html.includes('id="js_content"') && 
!html.includes('id=\\"js_content\\"')) { // http://mp.weixin.qq.com/s?__biz=MjM5ODIyMTE0MA==&mid=2650971473&idx=1&sn=f529f2a74fac89ed2a8ca8f7a44d93b3&chksm=bd38396a8a4fb07ce4ebab564de2ef01c2d50d60a225328c987cbf66e6167d067bc45f1527d3#rd // 图片类型但是没有 js_content 容器 if (html.includes('cover_url')) { type = 'image' } else { return getError(1000) } } html = html.replace('>微信号', ' id="append-account-alias">微信号') .replace('>功能介绍', ' id="append-account-desc">功能介绍') .replace(/\n\s+

两家大行行长空缺

轻金融轻金融

年末银行业又一起重量级高管变动

来源：轻金融

近日，多家媒体报道，工商银行行长谷澍拟接替周慕冰，担任农业银行党委书记，并将担任董事长一职，成为2020年末又一次银行高管重量级人事变动。53岁的谷澍，也将成为五大行中最年轻的董事长。

据21世纪经济报道，目前，邮储银行是六大行中领导团队人数最多、最齐的银行；中行、建行、交行均是“一正三副”或“一正四副”；而建行和工行一样出现了行长的空缺。

轻金融据Wind资讯统计，整理了六大行目前高管与其他重要人员情况，以及2020年高管团队变动情况。详情如下：

一、工行

1、工行高管团队及其他重要人员

2、2020年工行高管及重要人员变动

二、建行

1、建行高管团队及其他重要人员

2、2020年建行高管及重要人员变动

三、农行

1、农行高管团队及其他重要人员

2、2020年农行高管及重要人员变动

四、中行

1、中行高管团队及其他重要人员

2、2020年中行高管及重要人员变动

五、交行

1、交行高管团队及其他重要人员

2、2020年交行高管及重要人员变动

六、邮储银行

邮储银行高管团队及其他重要人员：一正六副

【轻金融好文】

1、谁是银行业金融科技之王？

2、2019银行业十佳文章【轻金融】

3、300万银行人迎来一位“新同事”

4、2020银行金融科技最新布局！

5、30万亿工行金融科技革命！

6、某城商行消费贷规模，比农行、招行还大

7、银行云大爆发！三类玩家角逐千亿市场

8、各大银行零售业务含金量比拼

9、工行、建行的“较量” | 轻金融

10、互联网银行：ROE最高的民营银行“物种”

11、中小银行正遭遇一场金融科技生死战

12、中小银行信用卡业务报告！（史上最全）

================================================ FILE: test/mp-links.js ================================================ const extract = require('..').extract const assert = require('assert') const fs = require('fs') const path = require('path') const url = 'https://mp.weixin.qq.com/s?__biz=MzIwNjU2ODk1MQ==&mid=2247497639&idx=1&sn=bba35c164cbae04da8d78808151d35aa&chksm=971d1b3fa06a922971367bca63cce1f8a754b582c4ce7c2d07dfb28e5dd2b38d7695ff72b56b&scene=132#wechat_redirect' const content = fs.readFileSync(path.join(__dirname, './mp-link.html'), 'utf-8') ;(async () => { const res = await extract(content, { shouldReturnContent: false, shouldExtractMpLinks: true }) console.log(JSON.stringify(res, null, 2)) })() ================================================ FILE: test.js ================================================ const extract = require('.').extract const assert = require('assert') const fs = require('fs') ;(async function () { // const url = fs.readFileSync('./links/post.html', 'utf-8') // const postRs = await extract(url, { // shouldReturnContent: false // }) // // console.log(postRs) // // const expected = { // account_name: '微信派', // account_alias: 'wx-pai', // account_id: 'gh_bc5ec2ee663f', // account_biz: 'MjM5NjM4MDAxMg==', // account_biz_number: 2396380012, // account_qr_code: 'https://open.weixin.qq.com/qr/code?username=gh_bc5ec2ee663f', // msg_has_copyright: false, // msg_content: null, // msg_author: null, // msg_sn: '9a0a54f2e7c8ac4019812aa78bd4b3e0', // msg_idx: 1, // msg_mid: 2655078412, // msg_title: '重磅 | 微信订阅号全新改版上线！', // msg_desc: '今后，头图也很重要', // msg_link: 'http://mp.weixin.qq.com/s?__biz=MjM5NjM4MDAxMg==&mid=2655078412&idx=1&sn=9a0a54f2e7c8ac4019812aa78bd4b3e0&chksm=bd5fc40f8a284d19360e956074ffced37d8e2d78cb01a4ecdfaae40247823e7056b9d31ae3ef#rd', // msg_source_url: null, // msg_cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/OiaFLUqewuIDldpxsV3ZYJzzyH9HTFsSwOEPX82WEvBZozGiam3LbRSzpIIKGzj72nxjhLjnscWsibDPFmnpFZykg/0?wx_fmt=jpeg', // msg_article_type: 
null, // msg_publish_time_str: '2018/06/20 18:52:35', // msg_type: 'post' // } // for (let i in expected) { // assert(postRs.data[i] === expected[i]) // } // // const link001 = fs.readFileSync('./links/quota_limit.html', 'utf-8') // const res001 = await extract(link001) // console.log(res001) // assert(res001.code === 2010) // // const imageUrl = fs.readFileSync('./links/image.html', 'utf-8') // const imageRs = await extract(imageUrl) // // console.log(imageRs) // // const videoUrl = fs.readFileSync('./links/video.html', 'utf-8') // const videoRs = await extract(videoUrl) // // console.log(videoRs) // // const documentWriteUrl = fs.readFileSync('./links/document.write.html', 'utf-8') // const documentWriteRs = await extract(documentWriteUrl) // // console.log(documentWriteRs) // // const _20181021WriteUrl = fs.readFileSync('./links/20181021.issue.html', 'utf-8') // const _20181021WriteRs = await extract(_20181021WriteUrl) // // console.log(_20181021WriteRs) // const issue3WriteUrl = fs.readFileSync('./links/issue3.html', 'utf-8') // const issue3WriteRs = await extract(issue3WriteUrl) // console.log(issue3WriteRs) // const sogou = await extract('https://mp.weixin.qq.com/s?src=3×tamp=1559431337&ver=1&signature=K*8sgrrv9y5KoQr22U2gh3Tut0DIldkcZ67t4Oc3BzcyNEQMtX3l459-K2JvxxeLvWbdhtjtuzSWorY-zsW-Nm2Rloy30WAJi82JmQGYI2GlWpIcFuXNh53g1jY*Dh8XRczrRrjewQgRj*N1Kg8FK0j5W-3wb*NdM3JzzhO4jWc=') // console.log(sogou) const url = fs.readFileSync('./links/20240107.html', 'utf-8') const postRs = await extract(url, { shouldReturnContent: true }) console.log(postRs) })() ================================================ FILE: util.js ================================================ const qs = require('qs') function getParameterByName(name, url = window.location.href) { name = name.replace(/[\[\]]/g, '\\$&'); var regex = new RegExp('[?&]' + name + '(=([^&#]*)|&|#|$)'), results = regex.exec(url); if (!results) return null; if (!results[2]) return ''; return 
decodeURIComponent(results[2].replace(/\+/g, ' ')); } function normalizeUrl (url = '') { const parts = url.replace(/&/g, '&').split('?') const rs = qs.parse(parts[1]) const querys = qs.stringify(rs) if (querys) { return parts[0] + '?' + querys } else { return parts[0] } } module.exports = { getParameterByName, normalizeUrl } ================================================ FILE: video.js ================================================ const dayjs = require('dayjs') module.exports = function({ post, basic, script, getError, html, $, shouldReturnRawMeta }) { const lines = script.split('\n') const lines2 = lines.filter(line => !!line.trim()) let code = lines2.filter((line, index) => { return /d\./.test(line) || (lines2[index - 1] && lines2[index - 1].includes('d.') && !line.includes('}')) }) code = `var d = {}; \nfunction getXmlValue (path) { return false }\n` + code.join('\n').replace('var d = _g.cgiData;', 'var d = {}') + '\n return d;' let data = {} code = `var _g = {};` + code const fn = new Function(code) data = fn() basic.accountName = data.nick_name basic.accountAvatar = data.hd_head_img basic.accountId = data.user_name // biz if (!basic.accountBiz && data.biz) { basic.accountBiz = data.biz basic.accountBizNumber = Buffer.from(basic.accountBiz, 'base64').toString() * 1 } // 标题 post.msg_title = data.title post.msg_desc = null post.msg_cover = null post.msg_link = data.msg_link || null post.msg_article_type = null // sn, idx, mid post.msg_sn = data.sn || null post.msg_idx = data.idx ? data.idx * 1 : null post.msg_mid = data.mid ? 
data.mid * 1 : null // 视频链接赋值于 source_url const vidMatch = html.match(/vid\s*:\s*'(.*?)'/) if (vidMatch && vidMatch[1]) { data.vid = vidMatch[1] // 旧版 vid 已经不适用 // post.msg_source_url = 'http://v.qq.com/x/page/' + vid + '.html' } if (!post.msg_cover) { // 旧版废弃 // post.msg_cover = `https://vpic.video.qq.com/60643382/${vid}.png` post.msg_cover = $("meta[property='og:image']").attr("content") } // 视频只有标题 + 内容，内容直接从 meta 里取 const description = $("meta[name='description']").attr("content") post.msg_content = description // 发布时间 if (data.create_time) { post.msg_publish_time = new Date(data.create_time * 1000) post.msg_publish_time_str = dayjs(post.msg_publish_time).format('YYYY/MM/DD HH:mm:ss') } // 如果没有，使用 ct_str if (!data.create_time && data.ct_str) { post.msg_publish_time = new Date(data.ct_str) post.msg_publish_time_str = dayjs(post.msg_publish_time).format('YYYY/MM/DD HH:mm:ss') } if (shouldReturnRawMeta) { post.raw_data = data } }