Repository: airyland/we-extract Branch: master Commit: 3b168e0bfef9 Files: 13 Total size: 442.7 KB Directory structure: gitextract_dgt8bub_/ ├── .gitignore ├── .npmignore ├── README.MD ├── errors.js ├── index.js ├── links.md ├── package.json ├── parse-wechat-url.js ├── test/ │ ├── mp-link.html │ └── mp-links.js ├── test.js ├── util.js └── video.js ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .DS_Store node_modules links temp ================================================ FILE: .npmignore ================================================ links temp .DS_Store test ================================================ FILE: README.MD ================================================ # we-extract ## 介绍 `we-extract` 用以解析微信公众号文章的账号及文章信息,居家旅行、采集微信公众号文章必备工具。 `we-extract` 是微信公众号 RSS 订阅服务 `WeRss` 的核心解析工具,欢迎使用:


## 安装 ``` npm install we-extract // or yarn add we-extract ``` ## 使用 > Node 版本需要支持 async ``` js const extract = require('we-extract').extract const rs = await extract('微信文章 url 或者 文章内容') // 选项 const rs = await extract('微信文章 url 或者 文章内容', { shouldReturnContent: true, // 是否返回内容,默认返回 shouldExtractMpLinks: false, // v2.1.0 是否返回文章中出现的所有公众号文章链接,如果为 true,将返回 mp_links 数组 shouldExtractTags: false, // v2.2.0 是否解析文章中的收录标签 shouldExtractRepostMeta: false // v2.2.3 是否解析转载文章来源 }) ``` ## 返回结果说明 > 正确返回 ``` js { done: true, code: 0, data: { account_name: '微信派', account_alias: 'wx-pai', account_avatar: 'http://wx.qlogo.cn/mmhead/Q3auHgzwzM7Xb5Qbdia5AuGTX4AeZSWYlv5TEqD1FicUDOrnEIwVak1A/132', account_description: '微信第一手官方活动信息发布,线下沙龙活动在线互动平台。独家分享微信公众平台优秀案例,以及权威专家的精彩观点。', account_id: 'gh_bc5ec2ee663f', account_biz: 'MjM5NjM4MDAxMg==', account_biz_number: 2396380012, account_qr_code: 'https://open.weixin.qq.com/qr/code?username=gh_bc5ec2ee663f', msg_has_copyright: false, // 是否原创 msg_content: '省略的文章内容', msg_author: null, // 作者 msg_sn: '9a0a54f2e7c8ac4019812aa78bd4b3e0', msg_idx: 1, msg_mid: 2655078412, msg_title: '重磅 | 微信订阅号全新改版上线!', msg_desc: '今后,头图也很重要', msg_link: 'http://mp.weixin.qq.com/s?__biz=MjM5NjM4MDAxMg==&mid=2655078412&idx=1&sn=9a0a54f2e7c8ac4019812aa78bd4b3e0&chksm=bd5fc40f8a284d19360e956074ffced37d8e2d78cb01a4ecdfaae40247823e7056b9d31ae3ef#rd', msg_source_url: null, // 音频,视频时,此处为音频、视频链接 msg_cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/OiaFLUqewuIDldpxsV3ZYJzzyH9HTFsSwOEPX82WEvBZozGiam3LbRSzpIIKGzj72nxjhLjnscWsibDPFmnpFZykg/0?wx_fmt=jpeg', msg_article_type: null, // 文章分类 msg_publish_time: '2018-06-20T10:52:35.000Z', // date 类型 msg_publish_time_str: '2018/06/20 18:52:35', msg_type: 'post', // 可能为 post text repost voice video image mp_links: [{ // 在 shouldExtractMpLinks = true 时返回 title: '', href: '' }], tags: [{ // 在 shouldExtractTags = true 时返回 id: '', url: '', name: '', count: 1 }], repost_meta: { // 在 shouldExtractRepostMeta = true 时返回 account_name: '文章来源账号名字' } } } ``` > 错误返回 ``` 
js { done: false, code: 2002, msg: '链接已过期' } ``` ## 常见错误 `we-extract` 定义了详细的错误信息方便开发和出错处理,`1` 开头错误表示可能需要重试(或者暂时将内容保存下来 debug),`2` 开头错误表示确定无疑的错误,无需重试,可以不处理。 请使用 code(数字类型) 来判断而不是 message 内容,因为 message 可能会变化。 ``` js module.exports = { '1000': '解析失败,可能文章内容不完整', '1001': '字段缺失', '1002': '请求文章内容失败', '1003': '请求文章内容为空', '1004': '访问过于频繁(URL模式)', // 可以换 ip 重新请求,注意与 2010 的区别 '1005': 'js 变量解析出错', '1006': '链接重定向', // 开启不 follow transfer link 后会有这个错误 '2001': '参数缺失', '2002': '链接已过期', '2003': '该内容被投诉且经审核涉嫌侵权,无法查看', '2004': '公众号迁移但文章未同步', '2005': '该内容已被发布者删除', '2006': '此内容因违规无法查看', '2007': '涉嫌违反相关法律法规和政策发送失败', '2008': '微信文章系统出错', '2009': '链接不正确', '2010': '访问过于频繁(HTML模式)', // 解析参数为直接的文章内容,此时该篇内容已经无效,可以丢弃 '2011': '由用户投诉并经平台审核,涉嫌过度营销、骚扰用户', '2012': '此帐号已被屏蔽', '2013': '此帐号已自主注销', '2014': '不实信息', '2015': '此帐号处于帐号迁移流程中', '2016': '冒名侵权' } ``` ## 经验 + 一篇微信文章由 biz+mid+idx 唯一标识,mid 在单个公众号内唯一。 + 文章所属账号信息以文章解析结果为准,采集搜狗时不要相信账号名字,因为搜狗显示的可能是改名或者迁移前的账号信息。 + 如果在搜狗微信搜不到账号,极有可能是因为公众号改了名字,试试以前的名字应该能搜索到。 + 微信链接的 search 拼接符可能为 `&amp;`,需要先替换为 `&`,否则解析链接参数时会有问题。 + 单个 ip 获取微信文章内容有频率限制,需要限制速率或者轮换 ip。 ## 链接类型 图片:https://mp.weixin.qq.com/s/5tpbsFR1k_3744P0Egdnxg ================================================ FILE: errors.js ================================================ module.exports = { '1000': '解析失败,可能文章内容不完整', '1001': '字段缺失', '1002': '请求文章内容失败', '1003': '请求文章内容为空', '1004': '访问过于频繁(url模式)', // 可以换 ip 重新请求,注意与 2010 的区别 '1005': 'js 变量解析出错', '1006': '链接重定向', // 开启不 follow transfer link 后会有这个错误 '2001': '参数缺失', '2002': '链接已过期', '2003': '该内容被投诉且经审核涉嫌侵权,无法查看', '2004': '公众号迁移但文章未同步', '2005': '该内容已被发布者删除', '2006': '此内容因违规无法查看', '2007': '涉嫌违反相关法律法规和政策发送失败', '2008': '微信文章系统出错', '2009': '链接不正确', '2010': '访问过于频繁(HTML模式)', // 解析参数为直接的文章内容,此时该篇内容已经无效,可以丢弃 '2011': '由用户投诉并经平台审核,涉嫌过度营销、骚扰用户', '2012': '帐号已被屏蔽, 内容无法查看', '2013': '此帐号已自主注销', '2014': '不实信息', '2015': '此帐号处于帐号迁移流程中', '2016': '冒名侵权' } ================================================ FILE: index.js ================================================ const qs = require('qs') const dayjs = require('dayjs') const request = require('request-promise') const 
cheerio = require('cheerio') const parseUrl = require('./parse-wechat-url') const errors = require('./errors') const unescape = require('lodash.unescape') const { getParameterByName, normalizeUrl } = require('./util') const video = require('./video') const defaultConfig = { shouldReturnRawMeta: false, shouldReturnContent: true, shouldFollowTransferLink: true, shouldExtractMpLinks: false, shouldExtractTags: false, shouldExtractRepostMeta: false } const basic = {} basic.accountId = '' basic.accountAvatar = '' basic.accountBiz = null basic.accountBizNumber = null basic.accountName = null const getError = function(code) { return { done: false, code: code, msg: errors[code] } } const extract = async function(html, options = {}) { const { shouldReturnRawMeta, shouldReturnContent, shouldFollowTransferLink, shouldExtractMpLinks, shouldExtractTags, shouldExtractRepostMeta } = Object.assign({}, defaultConfig, options) let paramType = 'HTML' // 参数为 URL 还是 HTML let url = null let rawUrl = null if (options.url) { url = normalizeUrl(options.url) } let type = 'post' let hasCopyright = false let shareContentTpl if (!html) { return getError(2001) } // 参数错误 // 支持地址 if (/^http/.test(html)) { html = normalizeUrl(html) if (!/http(s?):\/\/mp.weixin.qq.com/.test(html) && !/http(s?):\/\/weixin.sogou.com/.test(html)) { return getError(2009) } paramType = 'URL' rawUrl = html if (!url) { url = html } let host = 'mp.weixin.qq.com' if (/http(s?):\/\/weixin.sogou.com/.test(html)) { host = 'weixin.sogou.com' } try { html = await request({ uri: html, method: 'GET', headers: { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': host } }) // unknown purpose // if (html.includes('location.replace')) { // 
const rs = html.match(/([\s\S]*?)<\/script>/gi) // if (rs && rs[0]) { // const code = rs[0].split('\n').filter(one => { // return !one.includes('location.replace') && !one.includes('script>') // }).join('\n') + '\n return url;' // try { // const fn = new Function(code) // return await extract(fn(), options) // } catch (e) { // return getError(1005) // } // } // } } catch (e) { return getError(1002) } } else { html = html.replace(/\\n/g, '') } if (!html) { return getError(1003) } if (html.includes('访问过于频繁') && !html.includes('js_content')) { return paramType === 'URL' ? getError(1004) : getError(2010) } else if (html.includes('链接已过期') && !html.includes('js_content')) { return getError(2002) } else if (html.includes('被投诉且经审核涉嫌侵权,无法查看')) { return getError(2003) } else if (html.includes('该公众号已迁移')) { const match = html.match(/var\stransferTargetLink\s=\s'(.*?)';/) if (match && match[1]) { if (shouldFollowTransferLink) { return await extract(match[1]) } else { return { ...getError(1006), url: match[1] } } } else { return getError(2004) } } else if (html.includes('该内容已被发布者删除')) { return getError(2005) } else if (html.includes('此内容因违规无法查看')) { return getError(2006) } else if (html.includes('此内容发送失败无法查看')) { return getError(2007) } else if (html.includes('由用户投诉并经平台审核,涉嫌过度营销、骚扰用户')) { return getError(2011) } else if (html.includes('此帐号已被屏蔽') && !html.includes('id="js_content"')) { return getError(2012) } else if (html.includes('此帐号已自主注销') && !html.includes('id="js_content"')) { return getError(2013) } else if (!html.includes('id="js_content"') && html.includes('此帐号处于帐号迁移流程中')) { return getError(2015) } else if (html.includes('page_rumor') && !html.includes('id="js_content"')) { return getError(2014) } else if (html.includes('投诉类型') && html.includes('冒名侵权')) { return getError(2016) } else if (!html.includes('id="js_content"') && html.includes('参数错误') && html.includes('appmsg/error.html')) { return getError(2009) } else if (!html.includes('id="js_content"') && 
!html.includes('id=\\"js_content\\"')) { // http://mp.weixin.qq.com/s?__biz=MjM5ODIyMTE0MA==&mid=2650971473&idx=1&sn=f529f2a74fac89ed2a8ca8f7a44d93b3&chksm=bd38396a8a4fb07ce4ebab564de2ef01c2d50d60a225328c987cbf66e6167d067bc45f1527d3#rd // 图片类型但是没有 js_content 容器 if (html.includes('cover_url')) { type = 'image' } else { return getError(1000) } } html = html.replace('>微信号', ' id="append-account-alias">微信号') .replace('>功能介绍', ' id="append-account-desc">功能介绍') .replace(/\n\s+

两家大行行长空缺

轻金融 轻金融
    ================================================ FILE: test/mp-links.js ================================================ const extract = require('..').extract const assert = require('assert') const fs = require('fs') const path = require('path') const url = 'https://mp.weixin.qq.com/s?__biz=MzIwNjU2ODk1MQ==&mid=2247497639&idx=1&sn=bba35c164cbae04da8d78808151d35aa&chksm=971d1b3fa06a922971367bca63cce1f8a754b582c4ce7c2d07dfb28e5dd2b38d7695ff72b56b&scene=132#wechat_redirect' const content = fs.readFileSync(path.join(__dirname, './mp-link.html'), 'utf-8') ;(async () => { const res = await extract(content, { shouldReturnContent: false, shouldExtractMpLinks: true }) console.log(JSON.stringify(res, null, 2)) })() ================================================ FILE: test.js ================================================ const extract = require('.').extract const assert = require('assert') const fs = require('fs') ;(async function () { // const url = fs.readFileSync('./links/post.html', 'utf-8') // const postRs = await extract(url, { // shouldReturnContent: false // }) // // console.log(postRs) // // const expected = { // account_name: '微信派', // account_alias: 'wx-pai', // account_id: 'gh_bc5ec2ee663f', // account_biz: 'MjM5NjM4MDAxMg==', // account_biz_number: 2396380012, // account_qr_code: 'https://open.weixin.qq.com/qr/code?username=gh_bc5ec2ee663f', // msg_has_copyright: false, // msg_content: null, // msg_author: null, // msg_sn: '9a0a54f2e7c8ac4019812aa78bd4b3e0', // msg_idx: 1, // msg_mid: 2655078412, // msg_title: '重磅 | 微信订阅号全新改版上线!', // msg_desc: '今后,头图也很重要', // msg_link: 'http://mp.weixin.qq.com/s?__biz=MjM5NjM4MDAxMg==&mid=2655078412&idx=1&sn=9a0a54f2e7c8ac4019812aa78bd4b3e0&chksm=bd5fc40f8a284d19360e956074ffced37d8e2d78cb01a4ecdfaae40247823e7056b9d31ae3ef#rd', // msg_source_url: null, // msg_cover: 'http://mmbiz.qpic.cn/mmbiz_jpg/OiaFLUqewuIDldpxsV3ZYJzzyH9HTFsSwOEPX82WEvBZozGiam3LbRSzpIIKGzj72nxjhLjnscWsibDPFmnpFZykg/0?wx_fmt=jpeg', // msg_article_type: 
null, // msg_publish_time_str: '2018/06/20 18:52:35', // msg_type: 'post' // } // for (let i in expected) { // assert(postRs.data[i] === expected[i]) // } // // const link001 = fs.readFileSync('./links/quota_limit.html', 'utf-8') // const res001 = await extract(link001) // console.log(res001) // assert(res001.code === 2010) // // const imageUrl = fs.readFileSync('./links/image.html', 'utf-8') // const imageRs = await extract(imageUrl) // // console.log(imageRs) // // const videoUrl = fs.readFileSync('./links/video.html', 'utf-8') // const videoRs = await extract(videoUrl) // // console.log(videoRs) // // const documentWriteUrl = fs.readFileSync('./links/document.write.html', 'utf-8') // const documentWriteRs = await extract(documentWriteUrl) // // console.log(documentWriteRs) // // const _20181021WriteUrl = fs.readFileSync('./links/20181021.issue.html', 'utf-8') // const _20181021WriteRs = await extract(_20181021WriteUrl) // // console.log(_20181021WriteRs) // const issue3WriteUrl = fs.readFileSync('./links/issue3.html', 'utf-8') // const issue3WriteRs = await extract(issue3WriteUrl) // console.log(issue3WriteRs) // const sogou = await extract('https://mp.weixin.qq.com/s?src=3×tamp=1559431337&ver=1&signature=K*8sgrrv9y5KoQr22U2gh3Tut0DIldkcZ67t4Oc3BzcyNEQMtX3l459-K2JvxxeLvWbdhtjtuzSWorY-zsW-Nm2Rloy30WAJi82JmQGYI2GlWpIcFuXNh53g1jY*Dh8XRczrRrjewQgRj*N1Kg8FK0j5W-3wb*NdM3JzzhO4jWc=') // console.log(sogou) const url = fs.readFileSync('./links/20240107.html', 'utf-8') const postRs = await extract(url, { shouldReturnContent: true }) console.log(postRs) })() ================================================ FILE: util.js ================================================ const qs = require('qs') function getParameterByName(name, url = window.location.href) { name = name.replace(/[\[\]]/g, '\\$&'); var regex = new RegExp('[?&]' + name + '(=([^&#]*)|&|#|$)'), results = regex.exec(url); if (!results) return null; if (!results[2]) return ''; return 
decodeURIComponent(results[2].replace(/\+/g, ' ')); } function normalizeUrl (url = '') { const parts = url.replace(/&/g, '&').split('?') const rs = qs.parse(parts[1]) const querys = qs.stringify(rs) if (querys) { return parts[0] + '?' + querys } else { return parts[0] } } module.exports = { getParameterByName, normalizeUrl } ================================================ FILE: video.js ================================================ const dayjs = require('dayjs') module.exports = function({ post, basic, script, getError, html, $, shouldReturnRawMeta }) { const lines = script.split('\n') const lines2 = lines.filter(line => !!line.trim()) let code = lines2.filter((line, index) => { return /d\./.test(line) || (lines2[index - 1] && lines2[index - 1].includes('d.') && !line.includes('}')) }) code = `var d = {}; \nfunction getXmlValue (path) { return false }\n` + code.join('\n').replace('var d = _g.cgiData;', 'var d = {}') + '\n return d;' let data = {} code = `var _g = {};` + code const fn = new Function(code) data = fn() basic.accountName = data.nick_name basic.accountAvatar = data.hd_head_img basic.accountId = data.user_name // biz if (!basic.accountBiz && data.biz) { basic.accountBiz = data.biz basic.accountBizNumber = Buffer.from(basic.accountBiz, 'base64').toString() * 1 } // 标题 post.msg_title = data.title post.msg_desc = null post.msg_cover = null post.msg_link = data.msg_link || null post.msg_article_type = null // sn, idx, mid post.msg_sn = data.sn || null post.msg_idx = data.idx ? data.idx * 1 : null post.msg_mid = data.mid ? 
data.mid * 1 : null // 视频链接赋值于 source_url const vidMatch = html.match(/vid\s*:\s*'(.*?)'/) if (vidMatch && vidMatch[1]) { data.vid = vidMatch[1] // 旧版 vid 已经不适用 // post.msg_source_url = 'http://v.qq.com/x/page/' + vid + '.html' } if (!post.msg_cover) { // 旧版废弃 // post.msg_cover = `https://vpic.video.qq.com/60643382/${vid}.png` post.msg_cover = $("meta[property='og:image']").attr("content") } // 视频只有标题 + 内容,内容直接从 meta 里取 const description = $("meta[name='description']").attr("content") post.msg_content = description // 发布时间 if (data.create_time) { post.msg_publish_time = new Date(data.create_time * 1000) post.msg_publish_time_str = dayjs(post.msg_publish_time).format('YYYY/MM/DD HH:mm:ss') } // 如果没有,使用 ct_str if (!data.create_time && data.ct_str) { post.msg_publish_time = new Date(data.ct_str) post.msg_publish_time_str = dayjs(post.msg_publish_time).format('YYYY/MM/DD HH:mm:ss') } if (shouldReturnRawMeta) { post.raw_data = data } }