[
  {
    "path": ".gitignore",
    "content": ".idea/*\n*/*\nvenv"
  },
  {
    "path": "LICENCE",
    "content": "MIT License\n\nCopyright (c) 2017\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE."
  },
  {
    "path": "README.md",
    "content": "# BaiduImageSpider\n百度图片爬虫，基于python3\n\n个人学习开发用\n\n单线程爬取百度图片。\n\n# 爬虫工具 Required\n\n**需要安装python版本 >= 3.6**\n\n# 使用方法\n```\n$ python crawling.py -h\nusage: crawling.py [-h] -w WORD -tp TOTAL_PAGE -sp START_PAGE\n                   [-pp [{10,20,30,40,50,60,70,80,90,100}]] [-d DELAY]\n\noptional arguments:\n  -h, --help            show this help message and exit\n  -w WORD, --word WORD  抓取关键词\n  -tp TOTAL_PAGE, --total_page TOTAL_PAGE\n                        需要抓取的总页数\n  -sp START_PAGE, --start_page START_PAGE\n                        起始页数\n  -pp [{10,20,30,40,50,60,70,80,90,100}], --per_page [{10,20,30,40,50,60,70,80,90,100}]\n                        每页大小\n  -d DELAY, --delay DELAY\n                        抓取延时（间隔）\n```\n\n开始爬取图片\n```\npython crawling.py --word \"美女\" --total_page 10 --start_page 1 --per_page 30\n```\n\n\n另外也可以在`crawling.py`最后一行修改编辑查找关键字\n图片默认保存在项目路径\n运行爬虫：\n``` python\npython crawling.py\n```\n\n# 博客\n\n[爬虫总结](http://www.jwlchina.cn/2016/02/06/python%E7%99%BE%E5%BA%A6%E5%9B%BE%E7%89%87%E7%88%AC%E8%99%AB/)\n\n效果图：\n![效果图](http://blog-image.jwlchina.cn/kong36088/kong36088.github.io/master/uploads/python%E5%9B%BE%E7%89%87%E7%88%AC%E8%99%AB%E6%88%AA%E5%9B%BE.png)\n\n# 捐赠\n\n您的支持是对我的最大鼓励！\n谢谢你请我吃糖\n![wechatpay](http://blog-image.jwlchina.cn/kong36088/kong36088.github.io/master/uploads/site/wechat-pay.png)\n![alipay](http://blog-image.jwlchina.cn/kong36088/kong36088.github.io/master/uploads/site/zhifubao.jpg)\n\n"
  },
  {
    "path": "crawling.py",
    "content": "#!/usr/bin/env python\n# -*- coding:utf-8 -*-\nimport argparse\nimport os\nimport re\nimport sys\nimport urllib\nimport json\nimport socket\nimport urllib.request\nimport urllib.parse\nimport urllib.error\n# 设置超时\nimport time\n\ntimeout = 5\nsocket.setdefaulttimeout(timeout)\n\n\nclass Crawler:\n    # 睡眠时长\n    __time_sleep = 0.1\n    __amount = 0\n    __start_amount = 0\n    __counter = 0\n    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0', 'Cookie': ''}\n    __per_page = 30\n\n    # 获取图片url内容等\n    # t 下载图片时间间隔\n    def __init__(self, t=0.1):\n        self.time_sleep = t\n\n    # 获取后缀名\n    @staticmethod\n    def get_suffix(name):\n        m = re.search(r'\\.[^\\.]*$', name)\n        if m.group(0) and len(m.group(0)) <= 5:\n            return m.group(0)\n        else:\n            return '.jpeg'\n\n    @staticmethod\n    def handle_baidu_cookie(original_cookie, cookies):\n        \"\"\"\n        :param string original_cookie:\n        :param list cookies:\n        :return string:\n        \"\"\"\n        if not cookies:\n            return original_cookie\n        result = original_cookie\n        for cookie in cookies:\n            result += cookie.split(';')[0] + ';'\n        result.rstrip(';')\n        return result\n\n    # 保存图片\n    def save_image(self, rsp_data, word):\n        if not os.path.exists(\"./\" + word):\n            os.mkdir(\"./\" + word)\n        # 判断名字是否重复，获取图片长度\n        self.__counter = len(os.listdir('./' + word)) + 1\n        for image_info in rsp_data['data']:\n            try:\n                if 'replaceUrl' not in image_info or len(image_info['replaceUrl']) < 1:\n                    continue\n                obj_url = image_info['replaceUrl'][0]['ObjUrl']\n                thumb_url = image_info['thumbURL']\n                url = 'https://image.baidu.com/search/down?tn=download&ipn=dwnl&word=download&ie=utf8&fr=result&url=%s&thumburl=%s' % 
(urllib.parse.quote(obj_url), urllib.parse.quote(thumb_url))\n                time.sleep(self.time_sleep)\n                suffix = self.get_suffix(obj_url)\n                # 指定UA和referrer，减少403\n                opener = urllib.request.build_opener()\n                opener.addheaders = [\n                    ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'),\n                ]\n                urllib.request.install_opener(opener)\n                # 保存图片\n                filepath = './%s/%s' % (word, str(self.__counter) + str(suffix))\n                urllib.request.urlretrieve(url, filepath)\n                if os.path.getsize(filepath) < 5:\n                    print(\"下载到了空文件，跳过!\")\n                    os.unlink(filepath)\n                    continue\n            except urllib.error.HTTPError as urllib_err:\n                print(urllib_err)\n                continue\n            except Exception as err:\n                time.sleep(1)\n                print(err)\n                print(\"产生未知错误，放弃保存\")\n                continue\n            else:\n                print(\"小黄图+1,已有\" + str(self.__counter) + \"张小黄图\")\n                self.__counter += 1\n        return\n\n    # 开始获取\n    def get_images(self, word):\n        search = urllib.parse.quote(word)\n        # pn int 图片数\n        pn = self.__start_amount\n        while pn < self.__amount:\n            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%s&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%s&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=%s&rn=%d&gsm=1e&1594447993172=' % (search, search, str(pn), self.__per_page)\n            # 设置header防403\n            try:\n                time.sleep(self.time_sleep)\n                req = urllib.request.Request(url=url, headers=self.headers)\n               
 page = urllib.request.urlopen(req)\n                self.headers['Cookie'] = self.handle_baidu_cookie(self.headers['Cookie'], page.info().get_all('Set-Cookie'))\n                rsp = page.read()\n                page.close()\n            except UnicodeDecodeError as e:\n                print(e)\n                print('-----UnicodeDecodeErrorurl:', url)\n            except urllib.error.URLError as e:\n                print(e)\n                print(\"-----urlErrorurl:\", url)\n            except socket.timeout as e:\n                print(e)\n                print(\"-----socket timout:\", url)\n            else:\n                # 解析json\n                rsp_data = json.loads(rsp, strict=False)\n                if 'data' not in rsp_data:\n                    print(\"触发了反爬机制，自动重试！\")\n                else:\n                    self.save_image(rsp_data, word)\n                    # 读取下一页\n                    print(\"下载下一页\")\n                    pn += self.__per_page\n        print(\"下载任务结束\")\n        return\n\n    def start(self, word, total_page=1, start_page=1, per_page=30):\n        \"\"\"\n        爬虫入口\n        :param word: 抓取的关键词\n        :param total_page: 需要抓取数据页数 总抓取图片数量为 页数 x per_page\n        :param start_page:起始页码\n        :param per_page: 每页数量\n        :return:\n        \"\"\"\n        self.__per_page = per_page\n        self.__start_amount = (start_page - 1) * self.__per_page\n        self.__amount = total_page * self.__per_page + self.__start_amount\n        self.get_images(word)\n\n\nif __name__ == '__main__':\n    if len(sys.argv) > 1:\n        parser = argparse.ArgumentParser()\n        parser.add_argument(\"-w\", \"--word\", type=str, help=\"抓取关键词\", required=True)\n        parser.add_argument(\"-tp\", \"--total_page\", type=int, help=\"需要抓取的总页数\", required=True)\n        parser.add_argument(\"-sp\", \"--start_page\", type=int, help=\"起始页数\", required=True)\n        parser.add_argument(\"-pp\", \"--per_page\", type=int, help=\"每页大小\", 
choices=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100], default=30, nargs='?')\n        parser.add_argument(\"-d\", \"--delay\", type=float, help=\"抓取延时（间隔）\", default=0.05)\n        args = parser.parse_args()\n\n        crawler = Crawler(args.delay)\n        crawler.start(args.word, args.total_page, args.start_page, args.per_page)  # 按命令行参数指定的关键词、总页数、起始页码和每页数量开始抓取\n    else:\n        # 如果不指定参数，那么程序会按照下面进行执行\n        crawler = Crawler(0.05)  # 抓取延迟为 0.05\n\n        crawler.start('美女', 10, 2, 30)  # 抓取关键词为 “美女”，总数为 10 页，开始页码为 2，每页 30 张（即总共 10*30=300 张）\n        # crawler.start('二次元 美女', 10, 1)  # 抓取关键词为 “二次元 美女”\n        # crawler.start('帅哥', 5)  # 抓取关键词为 “帅哥”\n"
  }
]