[
  {
    "path": "README.md",
    "content": "## 公众号文章抓取工具\n使用公众号文章编辑链接的方案, 突破搜狗方案10条的限制~~~ ;-)\n\n### 2018.12\n- 新增公众号内, 百度网盘链接和密码的抓取. (指定method为baidu_pan_links)\n- 新增全部html页面抓取方法 -method whole_page\n- 添加todo.list 与 mask 变量\n```\ntodo.list 文件记录了公众号下所有文章的链接数据, 因为高频次调用文章搜索/翻页接口会导致被ban.\n所以目前的方案是使用mask记录所有索引处理记录, 保证了不会翻页相同位置, 提高了获取新增链接的几率.\n```\n\n### 2019.01\n- 添加-pl参数, 用来限制每次公众号翻页数目, 每次翻页过多会被ban.建议10以内.\n\t- N = 0: 不进行翻页, 只将之前的url重新处理(todo.list) \n\t- N < 0: 不限制翻页(默认), 翻到底或者出错时停止.\n\t- N > 0: 翻页N次.\n\n\n\n\n### 准备\n- 首先你需要有一个 [微信公众号, 注册很简单](https://mp.weixin.qq.com)\n- python 3.6\n- [下载ChromeDriver](http://chromedriver.chromium.org/home) 在第一次登录时, 需要使用其手动登录. \n- 安装依赖\n\n```\npip install -r requirements.txt\n``` \n\n### 结构\n```\nwxhub/\n├── README.md\n├── arti.cache.list\t\t(使用后生成)\t\n├── chromedriver\t\t\t(默认macOS版本, windows可另行下载 重命名即可)\n├── cookies.json\t\t\t(使用后生成)\n├── gongzhonghao.py\t\t(使用后生成)\n├── output\t\t\t\t(使用后生成)\n├── requirements.txt\t\n├── url.cache.list\t\t(使用后生成)\n└── wxhub.py\n\n```\n\n### 使用\n```\n(py3) isyuu:wxhub isyuu$ python wxhub.py -h\nusage: wxhub.py [-h] -biz BIZ [-chrome CHROME] [-arti ARTI] [-method METHOD]\n                [-sleep SLEEP] [-pipe PIPE] [-pl PAGE_LIMIT]\n\n公众号文章全搞定\n\noptional arguments:\n  -h, --help      show this help message and exit\n  -biz BIZ        必填:公众号名字\n  -chrome CHROME  可选:web chrome 路径, 默认使用脚本同级目录下的chromedriver\n  -arti ARTI      可选:文章名字, 默认处理全部文章\n  -method METHOD  可选, 处理方法: all_images, baidu_pan_links, whole_page\n  -sleep SLEEP    翻页休眠时间, 默认为1即 1秒每页.\n  -pipe PIPE      在method指定为pipe时, 该参数指定pipe处理流程. 例如:\"pipe_example,\n                  pipe_example1, pipe_example2, pipe_example3\"\n  -pl PAGE_LIMIT  指定最大翻页次数, 每次同一个公众号, 翻页太多次会被ban, 0:不翻页 只处理todo.list, 默认<0:无限制\n                  >0:翻页次数\n\n```\n\n现有缓存功能, 目前缓存在如下文件中.\n\n- 用户cookies\n- 已经爬取的文章链接.  --> arti.cache.list\n- 已经下载的链接. 
\t\t--> url.cache.list\n\n需要全部重新下载时, 删除对应文件即可.\n\n### 已知问题\n- 在某些情况下, cookies里的session过期后, 会导致\"获取网页失败!\"的错误.(此时删除cookies.json文件即可)\n- 提示\"搜索过于频繁\"问题, 这可能是由于微信对搜索接口存在反爬机制; 目前解决的方案是:删除cookies.json, 换账号登录, 或者等几个小时即可.(未来准备尝试先缓存所有链接再逐条爬取的方式...)\n\n\n\n\n"
  },
  {
    "path": "pipe_example.py",
    "content": "# -*- coding: utf-8 -*-\n'''\n扩展处理脚本案例\n'''\n\ndef crawl(arti_url, arti_dir):\n    '''\n    必要实现函数, 在处理文章链接时调用.\n    arti_url: 公众号文章链接.\n    arti_dir: 用于存储当前公众号的目录.\n    返回值: []存放所有处理完成的url链接.\n    '''\n    pass"
  },
  {
    "path": "requirements.txt",
    "content": "selenium==3.14.0\nrequests==2.18.4\nbeautifulsoup4==4.6.3\npyquery==1.4.0\n"
  },
  {
    "path": "wxhub.py",
    "content": "# -*- coding: utf-8 -*-\nfrom selenium import webdriver\nimport time\nfrom bs4 import BeautifulSoup\nimport requests\nimport re\nimport shutil\nimport os\nimport json\nimport argparse\nimport traceback\nimport random\nimport math\nimport codecs\n\nclass Input:\n    fake_name = \"\"#\"影想\"\n    out_dir = \"output\"\n    '''\n    all_images\n    baidu_pan_links\n    whole_page\n    pipe\n    '''\n    crawl_method = \"all_images\"\n    url_cache = {}\n    arti_cache = {}\n    page_sleep = 1\n    page_limit = -1\n    args = {}\n    custom_pipe = []\n\nclass Session:\n    token = ''\n    cookies = []\n    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}\n\nclass Urls:\n    index = 'https://mp.weixin.qq.com'\n    editor = 'https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit&action=edit&type=10&isMul=1&isNew=1&share=1&lang=zh_CN&token={token}'\n    query_biz = 'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&token={token}&lang=zh_CN&f=json&ajax=1&random={random}&query={query}&begin={begin}&count={count}'\n    query_arti = 'https://mp.weixin.qq.com/cgi-bin/appmsg?token={token}&lang=zh_CN&f=json&%E2%80%A65&action=list_ex&begin={begin}&count={count}&query={query}&fakeid={fakeid}&type=9'\n\nclass BaseResp:\n    def __init__(self, sjson):\n        self.data = json.loads(sjson)\n        self.base_resp = self.data['base_resp']\n        \n    @property\n    def ret(self):\n        return self.base_resp['ret']\n\n    @property\n    def err_msg(self):\n        return self.base_resp['err_msg']\n\n    @property\n    def is_ok(self):\n        return self.base_resp['ret'] == 0\n\n\nclass FakesResp(BaseResp):\n    \n    def __init__(self, sjson):\n        super(FakesResp, self).__init__(sjson)\n        self.list = self.data['list']\n        self.total = self.data['total']\n\n    @property\n    def count(self):\n        return len(self.list)\n    
\n\nclass ArtisResp(BaseResp):\n\n    def __init__(self, sjson):\n        super(ArtisResp, self).__init__(sjson) \n        self.list = self.data['app_msg_list'] if self.is_ok else []\n        self.total = self.data['app_msg_cnt'] if self.is_ok else 0\n\n    @property\n    def count(self):\n        return len(self.list)\n\n\ndef execute_times(driver, times):\n    for i in range(times + 1):\n        driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n        time.sleep(3)\n\ndef login(driver):\n    pass\n\ndef read_url_set():\n    ret = {}\n    fn = os.path.join('output', '__urls.json')\n    if os.path.isdir('output') and os.path.isfile(fn):\n        with open(fn, 'rt') as f:\n            ret = json.load(f)\n    return ret\n\ndef write_url_set(urls):\n    fn = os.path.join('output', '__urls.json')\n    if not os.path.isdir('output'):\n        shutil.os.makedirs('output', exist_ok=True)\n    with open(fn, 'wb') as f:\n        f.write(json.dumps(urls).encode('utf-8'))\n\ndef set_cookies(driver, cookies):\n    Session.cookies = {}\n    for item in cookies:\n        driver.add_cookie(item)\n        Session.cookies[item['name']]=item['value']\n\n\ndef download(url, sname):    \n    for i in range(0, 3):\n        result = requests.get(url, headers=Session.headers, stream=True)\n        if result.status_code == 200:\n            with open(sname, 'wb') as f:\n                for chunk in result.iter_content(1024):\n                    f.write(chunk)\n            return True\n        else:\n            continue\n    print(f\"Error download:{url}\")\n    return False\n    \ndef pipe_fakes(fake_name):\n    begin = 0\n    count = 5\n    while(True):\n        rep = requests.get(Urls.query_biz.format(random=random.random(), token=Session.token, query=fake_name, begin=begin, count=count), cookies=Session.cookies, headers=Session.headers)\n        fakes = FakesResp(rep.text)\n        if not fakes.is_ok:\n            break\n        i = 0\n        for it in 
fakes.list:\n            print(f\"{i}) {it['nickname']}\")\n            i = i + 1\n        \n        while(True):\n            ic = input(\"输入数字, 选择序号;或者输入n翻页:\") \n            try:\n                if ic == 'n' or int(ic) >= 0 and int(ic) < len(fakes.list):\n                    break\n            except ValueError:\n                print(\"输入错误, 请重新输入!\")\n            continue\n\n        if ic == 'n' or ic == 'N':\n            begin = begin + fakes.count\n            continue\n        return fakes.list[int(ic)]\n\n\n\ndef pipe_articles(fakeid, query=''):\n    TIME_SLEEP = Input.page_sleep\n\n    todo = load_todo_list(Input.fake_name)\n    if not todo:\n        todo['data'] = {}\n    data = todo['data']\n    mask = list(todo['__mask'] if '__mask' in todo else '')\n    last_total = todo['__total_cnt'] if '__total_cnt' in todo else 0\n\n    begin = 0\n    pagesize = 5\n    total = 0\n    total_page = 0\n    last_total_page = math.ceil(last_total / pagesize)\n    page_limit = Input.page_limit\n\n    rep = requests.get(Urls.query_arti.format(token=Session.token, fakeid=fakeid, begin=begin, count=pagesize, query=query), cookies=Session.cookies, headers=Session.headers)\n    artis = ArtisResp(rep.text)\n    if not artis.ret and page_limit:\n        total = artis.total\n        total_page = math.ceil(total / pagesize)\n\n        if total_page > last_total_page:\n            mask = (total_page - last_total_page) * ['0'] + mask\n        \n        if artis.list[0]['link'] in data:\n            mask[0] = '0' #has new arti. 
reset first page.\n        print(f\"正在获取全部链接, 共发现 {artis.total} 条文章, 需要翻页 {total_page} 次, 请稍后 ...\")\n        # 当前页为0时必检查下一页..\n        for i in range(0, len(mask)):\n            if not page_limit:\n                break\n\n            if mask[i] == '1':\n                continue\n            print(f\"正在处理第{i}页...\")\n            time.sleep(TIME_SLEEP)\n            rep = requests.get(Urls.query_arti.format(token=Session.token, fakeid=fakeid, begin=i * pagesize, count=pagesize, query=query), cookies=Session.cookies, headers=Session.headers)\n            artis = ArtisResp(rep.text)\n            if artis.ret :\n                break\n\n            flag = True\n            for it in artis.list:\n                link = it['link']\n                if link in data:\n                    continue\n                flag = False\n                data[link] = it\n            mask[i] = '1'\n            # force check next page.\n            if not flag and i < len(mask) - 1:\n                mask[i + 1] = '0' \n\n            #count check limit\n            page_limit -= 1\n    else:\n        print(f\"调用搜索, 报错:{artis.ret} {artis.err_msg}\")\n        \n    curr_searched = sum(map(lambda x: 1 if x == '1' else 0, mask))\n    # if not total:\n    #     raise Exception('搜索不到文章, 或者接口被反爬, 请删除cookies.json文件 等几分钟再试, 或换个账号试试.')\n    print(f\"本次搜索到:{total_page} 页文章, 已处理:{curr_searched}页, 共在 todo.list 中包含 {len(data)} 条文章链接 ...\")\n    todo['__total_cnt'] = total\n    todo['__mask'] = ''.join(mask)\n    save_todo_list(Input.fake_name, todo)\n\n    cnt = 0\n    for url, arti_info in data.items():\n        if url in Input.arti_cache:\n            continue   \n        print(f\"{arti_info['title']} --> {url}\")\n        if pipe_crawl_articles(arti_info):\n            cnt += 1\n            append_arti_cache(url)\n\n    print(f\" 本次共处理了 {cnt} 条文章链接!\")\n\ndef verfy_arti_content(html):\n    if not html:\n        return False, \"从服务器获取失败\"\n    pat = re.compile(r'<div class=\"page_msg')\n    if not 
pat.search(html):\n        return True, \"\"\n    pat = re.compile(r'<div class=\"global_error_msg.*?\">(.*?)</div', re.MULTILINE| re.DOTALL)\n    ms = pat.findall(html)\n    if ms:\n        return False, ms[0].strip()\n    return False, \"服务器返回未知错误\"\n\ndef crawl_all_images(url, sdir, url_cache, html=None):\n    pat = re.compile(r'src=\"(https://.*?)\"')\n    pat2 = re.compile(r'wx_fmt=(.*)')\n    urls = []\n    try:\n        if not html:\n            rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)\n            html = rep.text\n        mats = pat.findall(html, pos=0)\n        idx = 0\n        for m in mats:\n            if m in url_cache:\n                continue\n                \n            pps = pat2.findall(m)\n            if pps:\n                postfix = pps[0]\n            else:\n                postfix = 'jpg'\n\n            download(m, os.path.join(sdir, f\"{idx}.{postfix}\"))\n            urls.append(m)\n            idx += 1\n        append_url_cache(urls)\n        return True\n    except:\n        print(f\"failed crawl images from url:{url}\")\n        sg = traceback.format_exc()\n        print(sg)\n        return False\n\ndef crawl_baidu_pan_link(url, sdir, url_cache):\n    pat = re.compile(r'链接\\s*[:|：]\\s*(https://pan\\.baidu\\.com/.*?)提取码\\s*[:|：]\\s*(....)')\n    try:\n        urls = []\n        rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)\n        html = rep.text\n        mats = pat.findall(html, pos=0)\n        if not mats:\n            return False\n        with open(\"baidu.pan.links.txt\", \"a\") as myfile:\n            for uus in mats:\n                uu = uus[0]\n                if not uu or uu in Input.url_cache:\n                    continue\n                pwd = uus[1]\n                myfile.write(f\"{uu} => {pwd}\\n\") \n                Input.url_cache[uu] = True\n                urls.append(uu)\n        append_url_cache(urls)\n        return True\n    except:\n        
print(f\"failed crawl linkss from url:{url}\")\n        sg = traceback.format_exc()\n        print(sg)\n        return False\n\ndef crawl_whole_page(url, sdir, url_cache):\n    try:\n        rep = requests.get(url, cookies=Session.cookies, headers=Session.headers)\n        if rep.status_code != 200:\n            return False\n        html = rep.text\n        valid, msg = verfy_arti_content(html)\n        if not valid:\n            raise Exception(f\"保存网页失败: {msg}\")\n        \n        os.makedirs(sdir, exist_ok=True)\n        with codecs.open(os.path.join(sdir, 'index.html'), \"w\", 'utf-8') as f:\n            f.write(html)\n            f.flush()\n        return crawl_all_images(url, sdir, Input.url_cache, html=html)\n    except:\n        print(f\"failed crawl page from url:{url}\")\n        sg = traceback.format_exc()\n        print(sg)\n        return False\n\ndef crawl_by_custom_pipe(url, sdir, url_cache):\n    if not Input.custom_pipe:\n        sps = (Input.args.pipe if Input.args.pipe else '').split(',')\n        for sp in sps:\n            Input.custom_pipe.append(__import__(sp.strip()))\n    \n    for p in Input.custom_pipe:\n        urls = p.crawl(url, sdir)\n        for url in urls:\n            url_cache[url] = True\n        return not not urls\n    \n    return False\n            \n    \ndef pipe_crawl_articles(arti_info):\n    title_4_dir = arti_info['title'].replace(':', '').replace(' ', '').replace(':', '').replace('/', '').replace('|', '').replace('<', '').replace('>', '').replace('?', '').replace('\"', '')\n    sdir = os.path.join(Input.out_dir, Input.fake_name, title_4_dir)\n    if not os.path.exists(sdir):\n        os.makedirs(sdir, exist_ok=True)\n    if Input.crawl_method == 'all_images':\n        return crawl_all_images(arti_info['link'], sdir, Input.url_cache)\n    elif Input.crawl_method == 'baidu_pan_links':\n        return crawl_baidu_pan_link(arti_info['link'], sdir, Input.url_cache)\n    elif Input.crawl_method == 'whole_page':\n        
return crawl_whole_page(arti_info['link'], sdir, Input.url_cache)\n    elif Input.crawl_method == 'pipe': \n        return crawl_by_custom_pipe(arti_info['link'], sdir, Input.url_cache)\n\ndef pipe():\n    '''query fakes '''\n    fake_info = pipe_fakes(Input.fake_name)\n    if not fake_info:\n        raise Exception(f\"Can not query fakes with input:{Input.fake_name}\")\n    \n    '''query arti'''\n    fakeid = fake_info['fakeid']\n    pipe_articles(fakeid)\n    input(\"pipe contiune:\")\n\n\ndef process_input():\n    Input.artis_cache = {}\n    ac = os.path.join('arti.cache.list')\n    if os.path.isfile(ac):\n        with open(ac, 'rt') as fi:\n            line = fi.readline()\n            while line:\n                Input.arti_cache[line.strip()] = True\n                line = fi.readline()\n        \n    uc = os.path.join('url.cache.list')\n    if os.path.isfile(uc):\n        with open(uc, 'rt') as fi:\n            line = fi.readline()\n            while line:\n                Input.url_cache[line.strip()] = True\n                line = fi.readline()\n\n\ndef append_arti_cache(arti_link):\n    arti_link = arti_link.strip()\n    if not arti_link:\n        return\n    ac = os.path.join('arti.cache.list')\n    with open(ac, \"a\") as myfile:\n        myfile.write(f\"{arti_link}\\n\")\n        Input.arti_cache[arti_link] = True\n\ndef append_url_cache(urls):\n    ac = os.path.join('url.cache.list')\n    with open(ac, \"a\") as myfile:\n        for url in urls:\n            url = url.strip()\n            if not url:\n                continue\n            myfile.write(f\"{url}\\n\") \n            Input.url_cache[url] = True\n\ndef load_todo_list(key):\n    fn = os.path.join('output', key, \"todo.list\")\n    if os.path.isfile(fn):\n        with open(fn, 'rb') as fi:\n            return json.load(fi)\n    return {}\n\ndef save_todo_list(key, dic):\n    if not dic:\n        return\n    fn = os.path.join('output', key, \"todo.list\")\n    
os.makedirs(os.path.dirname(fn), exist_ok=True)\n    open(fn, 'wb').write(json.dumps(dic).encode('utf-8'))\n\ndef main(chrome):\n    #会过期, 重新登录后需要重新取得\n    if not chrome:\n        if os.path.isfile('chromedriver'):\n            chrome = 'chromedriver'\n        else:\n            chrome = input('输入webchrome:').strip()\n    driver = webdriver.Chrome(executable_path=chrome)\n    cookies = json.load(open('cookies.json', 'rb')) if os.path.isfile('cookies.json') else []\n    driver.get(Urls.index)\n    if not cookies:\n        input(\"请先手动登录, 完成后按回车继续:\")\n        cookies = driver.get_cookies()\n        open('cookies.json', 'wb').write(json.dumps(cookies).encode('utf-8'))\n\n    set_cookies(driver, cookies)\n    driver.get(Urls.index)\n    url = driver.current_url\n    if 'token' not in url:\n        raise Exception(f\"获取网页失败!\")\n    Session.token = re.findall(r'token=(\\w+)', url)[0]\n    process_input()\n    pipe()\n\n\n# def test():\n#     Input.fake_name = '大J小D'\n#     Input.crawl_method = 'baidu_pan_links'\n#     main(None)\n\nif __name__ == '__main__':\n    # test()\n   \n    description = u\"公众号文章全搞定\"\n    parser = argparse.ArgumentParser(description=description)\n    parser.add_argument('-biz', dest='biz', type=str, help='必填:公众号名字', required=True)\n    parser.add_argument('-chrome', dest='chrome', type=str, help='可选:web chrome 路径, 默认使用脚本同级目录下的chromedriver')\n    parser.add_argument('-arti', dest='arti', type=str, help='可选:文章名字, 默认处理全部文章')\n    parser.add_argument('-method', dest='method', type=str, help='可选, 处理方法:  all_images, baidu_pan_links, whole_page')\n    parser.add_argument('-sleep', dest='sleep', type=str, help='翻页休眠时间, 默认为1即 1秒每页.')\n    parser.add_argument('-pipe', dest='pipe', type=str, help='在method指定为pipe时, 该参数指定pipe处理流程. 
例如:\"pipe_example, pipe_example1, pipe_example2, pipe_example3\"')\n    parser.add_argument('-pl', dest='page_limit', type=str, help='指定最大翻页次数, 每次同一个公众号, 翻页太多次会被ban, 0:不翻页 只处理todo.list, 默认<0:无限制 >0:翻页次数')\n\n    Input.args = parser.parse_args()\n    Input.fake_name = Input.args.biz\n    Input.crawl_method = Input.args.method if Input.args.method else 'all_images'\n    Input.page_sleep = int(Input.args.sleep) if Input.args.sleep else 1\n    Input.page_limit = int(Input.args.page_limit) if Input.args.page_limit else -1\n    main(Input.args.chrome)"
  }
]