Repository: Kr1s77/awesome-python-login-model Branch: master Commit: b458a09bf554 Files: 99 Total size: 14.4 MB Directory structure: gitextract_aykf45t2/ ├── .gitattributes ├── .gitignore ├── 126email/ │ └── 126email.py ├── 163email/ │ └── 163email.py ├── 163youdao/ │ └── 163youdao.py ├── Github/ │ └── login.py ├── LICENSE ├── NeteaseCloudMusicDownload/ │ └── api.py ├── README-Test.md ├── README-en-us.md ├── README.md ├── baidu/ │ ├── baidu.py │ ├── requirements.txt │ └── util.py ├── baidu_translate/ │ ├── Baidufanyi.py │ └── translate.js ├── bilibili/ │ └── bilibili.py ├── csdn/ │ ├── README │ └── selenium_csdn.py ├── douban/ │ ├── douban.py │ └── douban_spider.py ├── facebook/ │ └── facebook.py ├── guoke/ │ ├── guoke.py │ └── guoke_spider.py ├── jd_login/ │ ├── Method_First/ │ │ ├── Try_selenium.py │ │ ├── ban.txt │ │ ├── choice.txt │ │ └── config.py │ ├── Method_Second/ │ │ ├── Config.py │ │ ├── Truekeyword.txt │ │ └── main.py │ ├── README.md │ └── login_by_selenium.py ├── lagou/ │ └── Lagou.py ├── liepin/ │ ├── README.md │ ├── liepinSpd/ │ │ ├── liepinSpd/ │ │ │ ├── __init__.py │ │ │ ├── dbhelper.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders/ │ │ │ ├── __init__.py │ │ │ └── lpspider.py │ │ ├── run_liepin1.py │ │ └── scrapy.cfg │ ├── liepinSpd2/ │ │ ├── liepinSpd2/ │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders/ │ │ │ ├── __init__.py │ │ │ └── liepinJob.py │ │ ├── run_liepin2.py │ │ └── scrapy.cfg │ ├── liepinSpd_500/ │ │ ├── liepinSpd/ │ │ │ ├── __init__.py │ │ │ ├── dbhelper.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders/ │ │ │ ├── __init__.py │ │ │ └── lpspider.py │ │ ├── run_liepin1.py │ │ └── scrapy.cfg │ ├── liepinSpecialCom/ │ │ ├── liepinSpecialCom/ │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── 
settings.py │ │ │ └── spiders/ │ │ │ ├── __init__.py │ │ │ └── lpspecialcom.py │ │ ├── run_liepinspecialcom.py │ │ └── scrapy.cfg │ ├── liepinSpecialComJob/ │ │ ├── liepinSpecialComJob/ │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders/ │ │ │ ├── __init__.py │ │ │ └── lpspecialcomjob.py │ │ ├── run_liepinspecialjob.py │ │ └── scrapy.cfg │ └── liepin_login.py ├── qqmusic/ │ ├── qqmusic_spider.py │ └── sign.js ├── qqzone/ │ └── qq_zone.py ├── qsbk/ │ └── qiushibaike.py ├── sina/ │ ├── sina.py │ └── spider/ │ ├── Ajax_weibo.py │ └── selenium_test.py ├── taobao/ │ ├── mac_chromedriver/ │ │ └── chromedriver │ ├── taobao_via_username_password.py │ └── taobao_via_weibo.py ├── tieba/ │ └── tieba_spider.py ├── tuchong/ │ └── tuchong.py ├── webWeixin/ │ └── webWeixin.py ├── xiamiMusic/ │ ├── README │ └── api.py └── zhaopingou/ └── zhaopingou_login.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ *.js linguist-language=python *.css linguist-language=python *.html linguist-language=python ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
def login():
    """Log in to mail.126.com with Chrome and return the first session cookie.

    Prompts for the account name and password on the console, drives a
    Chrome window through the login form, and returns the first entry of
    ``driver.get_cookies()`` after the form has been submitted.

    The element lookups use ``find_element(By.…)`` because the
    ``find_element_by_*`` helpers are not available in Selenium 4.3+.

    Returns:
        The first item of the list returned by ``driver.get_cookies()``.
    """
    # Imported here because the module itself only imports ``webdriver``.
    from selenium.webdriver.common.by import By

    acount_num = input('请输入账号:')
    passwd_str = getpass('请输入密码:')

    driver = webdriver.Chrome()
    url = 'http://mail.126.com/'
    driver.get(url)
    # Fixed pause before looking for the login frame.
    time.sleep(30)

    # The 126 login form is nested inside an iframe, so switch into that
    # iframe before looking up the input fields.
    elem = driver.find_element(By.CSS_SELECTOR, "iframe[id^='x-URS-iframe']")
    driver.switch_to.frame(elem)

    acount = driver.find_element(By.NAME, 'email')
    acount.clear()
    acount.send_keys(acount_num)

    passwd = driver.find_element(By.NAME, 'password')
    passwd.clear()
    passwd.send_keys(passwd_str)

    time.sleep(3)
    click_button = driver.find_element(By.ID, 'dologin')
    click_button.click()
    # Fixed pause after submitting, before reading the cookies.
    time.sleep(5)

    cur_cookies = driver.get_cookies()[0]
    return cur_cookies
def login():
    """Log in to mail.163.com with Chrome and return the session cookies.

    Prompts for the account name and password on the console, waits for the
    "password login" button to become clickable, fills the login form that
    lives inside an iframe, submits it and returns ``driver.get_cookies()``.

    The element lookups use ``find_element(By.…)`` because the
    ``find_element_by_*`` helpers are not available in Selenium 4.3+.

    Returns:
        The value of ``driver.get_cookies()`` read 10 seconds after the
        login button was clicked.
    """
    acount_num = input('请输入账号:')
    passwd_str = getpass('请输入密码:')

    driver = webdriver.Chrome()
    url = 'http://mail.163.com/'
    driver.get(url)

    # Wait (up to 10 s) until the button that switches to password login
    # is clickable, then click it.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.element_to_be_clickable((By.ID, 'lbNormal')))
    driver.find_element(By.ID, 'lbNormal').click()

    # The iframe id is matched by prefix with a CSS attribute selector.
    # The 163 login form is nested inside this iframe, so switch into it
    # before looking up the input fields.
    elem = driver.find_element(By.CSS_SELECTOR, "iframe[id^='x-URS-iframe']")
    driver.switch_to.frame(elem)

    account_el = driver.find_element(By.XPATH, '//input[@name="email"]')
    account_el.clear()
    account_el.send_keys(acount_num)

    password_el = driver.find_element(By.XPATH, '//input[@name="password"]')
    password_el.clear()
    password_el.send_keys(passwd_str)

    login_el = driver.find_element(By.XPATH, '//a[@id="dologin"]')
    login_el.click()
    # Fixed pause after submitting, before reading the cookies.
    time.sleep(10)

    cur_cookies = driver.get_cookies()
    return cur_cookies
class GithubLogin(object):
    """Log in to github.com through a ``requests`` session.

    The flow is: GET the login page and extract the ``authenticity_token``
    value from its HTML, then POST the credentials together with that token
    to the session URL.
    """

    # Pattern for the authenticity_token value in the login page HTML.
    _TOKEN_PATTERN = re.compile(r'name="authenticity_token" value="(.*?)"')
    # Pattern for the "user-login" content value in the response HTML.
    _USER_PATTERN = re.compile(r'"user-login" content="(.*?)"')

    def __init__(self, email, password):
        """Store the credentials and prepare the HTTP session.

        Args:
            email: Account name sent as the ``login`` form field.
            password: Password sent as the ``password`` form field.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Referer': 'https://github.com/',
            'Host': 'github.com'
        }
        self.session = requests.Session()
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        self.email = email
        self.password = password

    @staticmethod
    def _first_group(pattern, text):
        """Return group 1 of the first match of *pattern* in *text*, or None."""
        match = pattern.search(text)
        return match.group(1) if match else None

    def login_GitHub(self):
        """Post the login form and print the outcome.

        Returns:
            The user name found in the response, or ``None`` when the token
            could not be fetched, the response status is not 200, or no
            non-empty user name is present in the response.
        """
        token = self.get_token()
        if token is None:
            # Without a token the POST cannot succeed; stop here instead of
            # sending ``None`` as the token.
            print('Login Fail')
            return None

        post_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': token,
            'login': self.email,
            'password': self.password
        }
        resp = self.session.post(
            self.post_url, data=post_data, headers=self.headers)

        print('StatusCode:', resp.status_code)
        if resp.status_code != 200:
            print('Login Fail')
            return None

        # A missing or empty user name means the response is not a
        # signed-in page, so report failure instead of raising
        # AttributeError on a ``None`` match.
        user_name = self._first_group(self._USER_PATTERN, resp.text)
        if not user_name:
            print('Login Fail')
            return None

        print('UserName:', user_name)
        return user_name

    def get_token(self):
        """Fetch the login page and return its authenticity token.

        Returns:
            The token string, or ``None`` when the page request does not
            return status 200 or the token is not found in the page.
        """
        response = self.session.get(self.login_url, headers=self.headers)
        if response.status_code != 200:
            print('Get token fail')
            return None

        token = self._first_group(self._TOKEN_PATTERN, response.text)
        if token is None:
            print('Get Token Fail')
        return token
CriseLYJ. https://github.com/CriseLYJ/awesome-python-login-model Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
class decrypt_music(object):
    """Build the ``params`` / ``encSecKey`` form fields for a weapi request.

    The request body ``d`` is AES-CBC encrypted twice (first with the fixed
    key ``g``, then with a random 16-character key) and the random key is
    RSA-encrypted with the public exponent ``e`` and modulus ``f``.
    """

    def __init__(self, d):
        """Store the plaintext request body *d* and pick a random key."""
        self.d = d
        self.e = '010001'
        self.f = (
            "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5a"
            "a76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46be"
            "e255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
        )
        self.g = '0CoJUm6Qyw8W8jud'
        self.random_text = self.get_random_str()

    def get_random_str(self):
        """Return 16 random characters drawn from ASCII letters and digits."""
        alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        return ''.join(random.choice(alphabet) for _ in range(16))

    def aes_encrypt(self, text, key):
        """AES-CBC encrypt *text* with *key* and return base64 bytes.

        The text is padded up to a multiple of 16 bytes; the pad length is
        computed on the encoded byte length and each pad character is
        ``chr(pad)``.
        """
        iv = b'0102030405060708'
        pad = 16 - len(text.encode()) % 16
        text = text + pad * chr(pad)
        # Key and text are encoded to bytes before being handed to the
        # cipher, which rejects str ("Object type cannot be passed to C
        # code"); see
        # https://github.com/Kr1s77/awesome-python-login-model/issues/100#issuecomment-673897848
        encryptor = AES.new(key.encode(), AES.MODE_CBC, iv)
        return base64.b64encode(encryptor.encrypt(text.encode()))

    def rsa_encrypt(self, value, text, modulus):
        """RSA-encrypt the reversed *text* and return 256 hex characters.

        Args:
            value: Public exponent as a hex string.
            text: Plaintext; it is reversed before being converted to an int.
            modulus: Modulus as a hex string.

        Returns:
            The result as lowercase hex, left-padded with zeros to 256 chars.
        """
        text = text[::-1]
        base = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)
        # Three-argument pow performs modular exponentiation without
        # materialising the full power first.
        rs = pow(base, int(value, 16), int(modulus, 16))
        return format(rs, 'x').zfill(256)

    def get_data(self):
        """Return the encrypted form fields as a dict."""
        params = self.aes_encrypt(self.d, self.g)
        params = self.aes_encrypt(params.decode('utf-8'), self.random_text)
        enc_sec_key = self.rsa_encrypt(self.e, self.random_text, self.f)
        return {
            'params': params,
            'encSecKey': enc_sec_key
        }


class Spider(object):
    """Interactive console downloader: search a song, pick one, save it."""

    def __init__(self):
        """Prepare the request headers sent with every HTTP call."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Cookie': '_iuqxldmzr_=32; _ntes_nnid=8d4ef0883a3bcc9d3a2889b0bf36766a,1533782432391; _ntes_nuid=8d4ef0883a3bcc9d3a2889b0bf36766a; __utmc=94650624; WM_TID=GzmBlbRkRGQXeQiYuDVCfoEatU6VSsKC; playerid=19729878; __utma=94650624.1180067615.1533782433.1533816989.1533822858.9; __utmz=94650624.1533822858.9.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; WM_NI=S5gViyNVs14K%2BZoVerGK69gLlmtnH5NqzyHcCUY%2BiWm2ZaHATeI1gfsEnK%2BQ1jyP%2FROzbzDV0AyJHR4YQfBetXSRipyrYCFn%2BNdA%2FA8Mv80riS3cuMVJi%2BAFgCpXTiHBNHE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6ee84b674afedfbd3cd7d98b8e1d0f554f888a4abc76990b184badc4f89e7af8ece2af0fea7c3b92a91eba9b7ec738e8abdd2b741e986a1b7e87a8595fadae648b0b3bc8fcb3f8eafb69acb69818b97ccec5dafee9682cb4b98bb87d2e66eb19ba2acaa5bf3b6b7b1ae5a8da6ae9bc75ef49fb7abcb5af8879f87c16fb8889db3ec7cbbae97a4c566e992aca2ae4bfc93bad9b37aab8dfd84f8479696a7ccc44ea59dc0b9d7638c9e82a9c837e2a3; JSESSIONID-WYYY=sHwCKYJYxz6ODfURChA471BMF%5CSVf3%5CTc8Qcy9h9Whj6CfMxw4YWTMV7CIx5g6rqW8OBv04YGHwwq%2B%5CD1N61qknTP%2Fym%2BHJZ1ylSH1EabbQASc9ywIT8YvOr%2FpMgvmm1cbr2%2Bd6ssMYXuTlpOIrKqp%5C%2FM611EhmfAfU47%5CSQWAs%2BYzgY%3A1533828139236'
        }

    @staticmethod
    def _search_payload(name):
        """Return the JSON request body for searching *name*.

        The body is produced with ``json.dumps`` so that quotes or
        backslashes in the keyword are escaped instead of breaking the JSON.
        For plain keywords the output is identical to the old
        ``%``-formatted template.
        """
        import json  # local import: the module does not import json

        payload = {
            "hlpretag": "",
            "hlposttag": "",
            "s": name,
            "type": "1",
            "offset": "0",
            "total": "true",
            "limit": "30",
            "csrf_token": "",
        }
        return json.dumps(payload, separators=(',', ':'), ensure_ascii=False)

    def __get_songs(self, name):
        """Search for *name* and return the ``result`` part of the response."""
        wyy = decrypt_music(self._search_payload(name))
        data = wyy.get_data()
        url = 'https://music.163.com/weapi/cloudsearch/get/web?csrf_token='
        response = requests.post(url, data=data, headers=self.headers).json()
        return response['result']

    def __get_mp3(self, id):
        """Return the download URL for song *id*, or None if there is none."""
        d = '{"ids":"[%s]","br":320000,"csrf_token":""}' % id
        wyy = decrypt_music(d)
        data = wyy.get_data()
        url = 'https://music.163.com/weapi/song/enhance/player/url?csrf_token='
        response = requests.post(url, data=data, headers=self.headers).json()
        print(response)
        # An empty or missing ``data`` list is reported as "no URL" so the
        # caller prints its failure message instead of crashing.
        entries = response.get('data') or []
        return entries[0].get('url') if entries else None

    def __download_mp3(self, url, filename):
        """Download *url* and save it as ``<filename>.mp3`` in the cwd."""
        # Path separators in a song name would be taken as directories.
        safe_name = filename.replace('/', '_').replace(os.sep, '_')
        target = os.path.join(os.path.abspath('.'), safe_name + '.mp3')
        response = requests.get(url, headers=self.headers).content
        with open(target, 'wb') as f:
            f.write(response)
        # The full file path (including ".mp3") goes inside the message;
        # previously ".mp3" was appended after the whole sentence.
        print('下载完毕,可以在%s 路径下查看' % target)

    def __print_info(self, songs):
        """Print the numbered song list and return ``(name, id)`` tuples."""
        songs_list = []
        for num, song in enumerate(songs):
            print(num, '歌曲名字:', song['name'], '作者:', song['ar'][0]['name'])
            songs_list.append((song['name'], song['id']))
        return songs_list

    def run(self):
        """Run the interactive search / choose / download loop."""
        while True:
            name = input('请输入你需要下载的歌曲:')
            result = self.__get_songs(name)
            if result['songCount'] == 0:
                print('没有搜到此歌曲,请换个关键字')
                continue

            songs = self.__print_info(result['songs'])
            num = input('请输入需要下载的歌曲,输入左边对应数字即可')
            # Reject non-numeric or out-of-range choices instead of raising
            # ValueError / IndexError.
            try:
                index = int(num)
            except ValueError:
                index = -1
            if not 0 <= index < len(songs):
                print('输入无效,请输入列表左边对应的数字')
                continue

            filename, song_id = songs[index]
            url = self.__get_mp3(song_id)
            if not url:
                print('歌曲需要收费,下载失败')
            else:
                self.__download_mp3(url, filename)

            flag = input('如需继续可以按任意键进行搜歌,否则按0结束程序')
            if flag == '0':
                break
        print('程序结束!')
解决办法:安装一个nodejs的V8引擎就可以了 ``` ![](./images/zhihu.jpg) ### 糗事百科 ![](./images/qiushibaike.gif) ![](./images/qiushibaike.jpg) ### 百度翻译 - 输入英语自动翻译 ![](./images/baidu_translate.gif) ================================================ FILE: README-en-us.md ================================================

🐍Website_login_mode


Master


"Did you know all your doors were locked?" - Riddick (The Chronicles of Riddick)


Created by CriseLYJ

**** # 🌟Website_login_mode I collected some major website login methods, and some website crawling programs, some are registered through selenium, some are directly simulated login by capturing packets, some are using scrapy, I hope to help Xiaobai, this project is used for research and sharing The simulated landing mode of the big website, and the crawler program, I will continue to update. . . ## Simulate login to some common websites and crawl corresponding information ## About The basic login is based on direct login or using selenium+webdriver. Some websites are very difficult to log in directly. For example, qq space, bilibili, etc. if you use selenium, it is relatively easy. Although it is selenium when logging in, for efficiency, we can maintain the cookie obtained after login, and then call requests or scrapy for data collection, so the speed of data collection can be guaranteed. ## Completed - [x] [Facebook](https://www.facebook.com/) - [x] [无需身份验证即可抓取Twitter前端API](https://twitter.com/) - [x] [微博网页版](http://weibo.com) - [x] [知乎](http://zhihu.com) - [x] [QQZone](https://qzone.qq.com/) - [x] [CSDN](https://www.csdn.net/) - [x] [淘宝](www.taobao.com) - [x] [Baidu](www.baidu.com) - [x] [果壳](https://www.guokr.com/) - [x] [JingDong 模拟登录和自动申请京东试用](https://www.jd.com/) - [x] [163mail](https://mail.163.com/) - [x] [拉钩](https://www.lagou.com/) - [x] [Bilibili](https://www.bilibili.com/) - [x] [豆瓣](https://www.douban.com/) - [x] [Baidu2](www.baidu.com) - [x] [猎聘网](https://www.liepin.com/) - [x] [微信网页版登录并获取好友列表](https://wx.qq.com/) - [x] [Github](https://github.com/) - [x] [爬取图虫相应的图片](https://tuchong.com/) ## show ### Bilibili automatic login test is normal, the success rate is 98% ![](./images/bilibili.jpg) ### web Weichat ![Alt text](./images/weixin.jpg) ### 图虫spider ![](./images/Jietu20190306-232224.jpg) ![](./images/Jietu20190306-232303.jpg) ### TaoBaoweb - taobao.py为模拟登录 - 剩下的文件为爬虫 ### Github ![](./images/github.jpg) ``` 1. 
Crawl Taobao's sub-categories, rank the product information by sales, and save it to MongoDB by category. 2. Data analysis with pandas 3. Display the distribution of goods by province, sales ranking, map distribution, etc. with matplotlib ``` ### Use guoke.spider with caution: it downloads very fast! It can download a whole batch in 10 seconds. I will not show screenshots; they have been deleted because there were too many 😝 ### Sina - sina.py: simulated login - spider: folder containing the crawlers ``` 1. Enter the ID of the blogger to crawl and get the ajax request 2. Parse the JSON data, crawl all of the blogger's posts, and save them to MySQL ``` ## tips of pull request - Everyone is welcome to submit pull requests 💗 ## Problems - About verification codes: none of the methods used in this project handle verification codes. Recognizing complex verification codes is still fairly difficult at present. In my experience, the best approach when writing crawlers is to avoid verification codes wherever possible. - Code invalidation: if the code stops working because a website changed its policy or layout, please open an issue. If you have already solved it, you can submit a PR, thank you! ## Another - If there is a website that is difficult to log in to, for example one where even selenium+webdriver cannot log in, please feel free to open an issue. - If the repo is helpful to you, please encourage me with a star. ## something to add 1. After working on the project for a while, I found that the code style, ease of use, scalability, and readability all have certain problems, so the most important next step is to refactor the code so that it is easier for everyone to build small features of their own. 2. If you feel that the login of a website is very representative, please feel free to ask in the issue 3. If the login to the site is very interesting, I will add it in a later update. 4. 
The login mechanism of a website may change frequently, so when a simulated login no longer works, please report it in an issue. - If the project gets a lot of attention, I will continue to maintain this repository, add more content, and refactor the code. ## Acknowledgments - Thanks to all! ## Written at the end - I need your support. - And I think you can give me a 🌟``star``! ================================================ FILE: README.md ================================================

🎉Life is fantastic🥳!~


Master


"Did you know all your doors were locked?" - Riddick (The Chronicles of Riddick)


Branch Stars Forks License Awesome


Created by @kris

**** ## 传送门 - [x] [4G 代理](https://github.com/Kr1s77/FgSurfing) - [x] [异常处理回调,直接 hook 所有函数,和类](https://github.com/Kr1s77/abnormalities) 给个 🌟 再走吧... ## 💕Website login model 一些爬虫示例程序,以及模拟登陆程序,模拟登陆基于 selenium,有些模拟登录基于 js 逆向,持续更新,有问题可以直接提交 Issues,欢迎提交 PR, 测试通过可以直接 merge,文中所有程序都是使用 ``python3`` 编写 :-) ## About 模拟登陆基本采用的是直接登录或者使用selenium+webdriver的方式,有的网站直接登录难度很大,比如qq空间,bilibili等如果采用selenium就相对轻松一些。 虽然在登录的时候采用的是selenium,为了效率,我们可以在登录过后得到的cookie维护起来,然后调用requests或者scrapy等进行数据采集,这样数据采集的速度可以得到保证。 ## WebDriver [Chrome](https://chromedriver.chromium.org/) [FireFox](https://github.com/mozilla/geckodriver/releases/) ## Completed - [x] [虾米音乐](https://www.xiami.com/) - [x] [Facebook](https://www.facebook.com/) - [x] [微博网页版](http://weibo.com) - [x] [知乎](http://zhihu.com) - [x] [QQZone](https://qzone.qq.com/) - [x] [CSDN](https://www.csdn.net/) - [x] [淘宝-接口修复完成-可用](https://login.taobao.com/member/login.jhtml) - [x] [CSDN--已重构](https://www.csdn.net/) - [x] [Baidu](www.baidu.com) - [x] [果壳](https://www.guokr.com/) - [x] [JingDong 模拟登录和自动申请京东试用](https://www.jd.com/) - [x] [163mail](https://mail.163.com/) - [x] [拉钩](https://www.lagou.com/) - [x] [Bilibili](https://www.bilibili.com/) - [x] [豆瓣](https://www.douban.com/) - [x] [豆瓣spider](https://www.douban.com/) - [x] [Baidu](www.baidu.com) - [x] [猎聘网](https://www.liepin.com/) - [x] [微信网页版登录并获取好友列表](https://wx.qq.com/) - [x] [Github](https://github.com/) - [x] [爬取图虫相应的图片](https://tuchong.com/) - [x] [网易云音乐](https://music.163.com/) - [x] [糗事百科--改为协程版](https://www.qiushibaike.com/) - [x] [百度贴吧spider](https://tieba.baidu.com/) - [x] [百度翻译](https://fanyi.baidu.com/) ## catalogue - [x] [虾米音乐](https://github.com/Kr1s77/awesome-python-login-model/tree/master/xiamiMusic) - [x] [Facebook模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/facebook) - [x] [微博网页版模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/sina) - [x] [QQZone模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/qqzone) - [x] 
[CSDN模拟登录--已恢复](https://github.com/Kr1s77/awesome-python-login-model/blob/master/csdn) - [x] [淘宝爬虫--重构中](https://github.com/Kr1s77/awesome-python-login-model/tree/master/taobao) - [x] [Baidu模拟登录一](https://github.com/Kr1s77/awesome-python-login-model/tree/master/baidu) - [x] [果壳爬虫程序](https://github.com/Kr1s77/awesome-python-login-model/tree/master/guoke) - [x] [JingDong 模拟登录和自动申请京东试用](https://github.com/Kr1s77/awesome-python-login-model/tree/master/jd_login) - [x] [163mail--已恢复](https://github.com/Kr1s77/awesome-python-login-model/blob/master/163email/163email.py) - [x] [拉钩模拟登录--已失效](https://github.com/Kr1s77/awesome-python-login-model/blob/master/lagou/Lagou.py) - [x] [Bilibili模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/bilibili/bilibili.py) - [x] [豆瓣](https://github.com/Kr1s77/awesome-python-login-model/blob/master/douban/douban.py) - [x] [Baidu2模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/baidu2/baidu.py) - [x] [猎聘网模拟登录](https://github.com/Kr1s77/awesome-python-login-model/tree/master/liepin) - [x] [微信网页版登录并获取好友列表](https://github.com/Kr1s77/awesome-python-login-model/blob/master/webWeixin/webWeixin.py) - [x] [Github模拟登录两种解决方案都可行](https://github.com/Kr1s77/awesome-python-login-model/tree/master/Github) - [x] [爬取图虫想要的图片](https://github.com/Kr1s77/awesome-python-login-model/blob/master/tuchong/tuchong.py) - [x] [网易云音乐downloader](https://github.com/Kr1s77/awesome-python-login-model/blob/master/NeteaseCloudMusicDownload/wangyiyun_spider.py) - [x] [糗事百科爬虫](https://github.com/Kr1s77/awesome-python-login-model/blob/master/qsbk/qiushibaike.py) - [x] [淘宝登陆-访问](https://login.taobao.com/member/login.jhtml) # Test > [Please touch here to view test images](./README-Test.md) ## Informations - 为感谢你们的支持,准备写一套免费爬虫的教程,保证你学会以后可以爬取市面上大部分的网站,[教程地址](https://github.com/CriseLYJ/-Python-crawler-starts-from-zero) ## tips of pull request - 欢迎大家一起来 pull request 💗 ## Problems - 
class BaiduLogin(object):
    """Log in to a Baidu account with user name and password.

    The flow is: open pan.baidu.com to initialise cookies, request a login
    token, request the RSA public key, then POST the login form with the
    RSA-encrypted password. When the server answers with a captcha error
    code, the captcha image is saved, opened, and the user is asked to type
    it in before the next attempt.
    """

    # Pattern for the query-string-like result embedded in the login
    # response (raw string so ``\+`` is a regex escape, not a str escape).
    _RESULT_PATTERN = re.compile(r'.*href \+= "(.*)"\+accounts')

    def __init__(self):
        """Create the HTTP session and the per-login identifiers."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7',
            'referer': 'https://pan.baidu.com/',
        }
        self.sess = requests.session()
        self.gid = str(uuid4()).upper()
        self.token = None
        self.key = None
        self.public_key = None

    @classmethod
    def _parse_login_result(cls, text):
        """Extract the login result from the response body *text*.

        Returns:
            A ``(result, fields)`` tuple: the raw ``key=value&...`` string
            and a dict built from it. Each pair is split on the first ``=``
            only, so values that themselves contain ``=`` are kept intact.

        Raises:
            LoginError: If *text* does not contain the expected result.
        """
        match = cls._RESULT_PATTERN.search(text)
        if match is None:
            raise LoginError('Unexpected login response!')
        result = match.group(1)
        fields = {}
        for pair in result.split("&"):
            key, _, value = pair.partition("=")
            fields[key] = value
        return result, fields

    def _init_cookies(self):
        """Initialise the session cookies by requesting pan.baidu.com."""
        self.sess.get(url='https://pan.baidu.com/', headers=self.headers)

    def _get_token(self):
        """Request the login token and store it in ``self.token``."""
        url = 'https://passport.baidu.com/v2/api/?getapi'
        payload = {
            'getapi': '',
            'tpl': 'mn',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'class': 'login',
            'gid': self.gid,
            'loginversion': 'v4',
            'logintype': 'dialogLogin',
            'traceid': '',
            'callback': 'bd__cbs__pivyke',
        }
        resp = self.sess.get(url=url, params=payload, headers=self.headers)
        # Single quotes are turned into double quotes before JSON parsing.
        js = parse_json(resp.text.replace("\'", "\""))
        self.token = js['data']['token']

    def _get_public_key(self):
        """Request the RSA key and store ``self.key`` / ``self.public_key``."""
        url = 'https://passport.baidu.com/v2/getpublickey'
        payload = {
            'token': self.token,
            'tpl': 'mn',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'gid': self.gid,
            'loginversion': 'v4',
            'traceid': '',
            'callback': 'bd__cbs__h02h0j'
        }
        resp = self.sess.get(url=url, params=payload, headers=self.headers)
        js = parse_json(resp.text.replace("\'", "\""))
        self.key, self.public_key = js.get('key'), js.get('pubkey')

    def login(self, username, password, retry=4):
        """Log in with user name and password.

        Args:
            username: Account user name.
            password: Plain-text password; it is RSA-encrypted before sending.
            retry: Maximum number of login attempts (a new attempt is made
                after each captcha prompt).

        Raises:
            LoginError: On a risk-control, bad-credentials or unknown error
                code, on an unparsable response, or when all attempts are
                used up.
        """
        # Imported here so the method does not depend on ``os`` leaking in
        # through ``from util import *``.
        import os

        self._init_cookies()
        self._get_token()
        self._get_public_key()

        url = 'https://passport.baidu.com/v2/api/?login'
        data = {
            'staticpage': 'https://www.baidu.com/cache/user/html/v3Jump.html',
            'charset': 'UTF-8',
            'token': self.token,
            'tpl': 'netdisk',
            'subpro': 'netdisk_web',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'codestring': '',
            'safeflg': '0',
            'u': 'https://www.baidu.com/',
            'isPhone': 'false',
            'detect': '1',
            'gid': self.gid,
            'quick_user': '0',
            'logintype': 'dialogLogin',
            'logLoginType': 'pc_loginDialog',
            'idc': '',
            'loginmerge': 'true',
            'splogin': 'rate',
            'username': username,
            'password': encrypt_pwd(password, self.public_key),
            'rsakey': self.key,
            'crypttype': '12',
            'ppui_logintime': 254896,
            'countrycode': '',
            'loginversion': 'v4',
            'traceid': '',
            'callback': 'parent.bd__pcbs__oxzeyj'
        }

        for _ in range(retry):
            resp = self.sess.post(url=url, headers=self.headers, data=data)
            result, d = self._parse_login_result(resp.text)
            err_no = d.get('err_no')

            if err_no == '0':
                print('Login success!')
                return
            elif err_no in ['6', '257']:
                # Captcha required: fetch the image, show it, and send the
                # typed code with the next attempt.
                code_string = d.get('codeString')
                data['codestring'] = code_string
                resp = self.sess.get(
                    url='https://passport.baidu.com/cgi-bin/genimage?{}'.format(code_string),
                    headers=self.headers
                )
                image_path = os.path.join(os.getcwd(), 'vcode-login.jpg')
                save_image(resp, image_path)
                open_image(image_path)
                verify_code = input('Please enter the verify code for login(return change):')
                data['verifycode'] = verify_code
            elif err_no == '120021':
                raise LoginError("Account is in risk, please do security verification first!")
            elif err_no in ['4', '7']:
                raise LoginError('Error username or password!')
            else:
                raise LoginError("Unknown error:" + result)

        raise LoginError('Login Fail!')


class LoginError(Exception):
    """Raised when a Baidu login attempt fails."""
    pass
爬虫流程四步骤 # 2.1 获取URl # 2.2 发送请求获取响应 # 2.3 从响应中提取数据 # 2.4 保存数据 import requests import js2py context = js2py.EvalJs() # 翻译模式 # 0:英译中 1:中译英 translating_mode = 0 class BaiDuTranslater(object): """ 百度翻译爬虫 """ def __init__(self, query): # 初始化 self.url = "https://fanyi.baidu.com/basetrans" self.query = query self.headers = { "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1", "Referer": "https://fanyi.baidu.com/", "Cookie": "BAIDUID=714BFAAF02DA927F583935C7A354949A:FG=1; BIDUPSID=714BFAAF02DA927F583935C7A354949A; PSTM=1553390486; delPer=0; PSINO=5; H_PS_PSSID=28742_1463_21125_18559_28723_28557_28697_28585_28640_28604_28626_22160; locale=zh; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1553658863,1553766321,1553769980,1553770442; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1553770442; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1553766258,1553766321,1553769980,1553770442; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1553770442" } def make_sign(self): # js逆向获取sign的值 with open("translate.js", "r", encoding="utf-8") as f: context.execute(f.read()) # 调用js中的函数生成sign sign = context.a(self.query) # 将sign加入到data中 return sign def make_data(self, sign): # 判断翻译模式,选取对应的 from 和 to 值. 
def get_content(self, data, timeout=10):
    """POST the form data to the translation endpoint and return the result.

    :param data: form fields to send (``run`` passes the dict built by
        ``make_data``)
    :param timeout: seconds to wait for the server; without a timeout
        ``requests`` may block indefinitely on an unresponsive server
    :return: ``response.json()["trans"][0]["dst"]`` from the reply
    :raises requests.exceptions.Timeout: if the server does not answer
        within ``timeout`` seconds
    """
    response = requests.post(
        url=self.url,
        headers=self.headers,
        data=data,
        timeout=timeout,
    )
    return response.json()["trans"][0]["dst"]
c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)), c[v++] = p >> 18 | 240, c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224, c[v++] = p >> 6 & 63 | 128), c[v++] = 63 & p | 128) } for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++) w += c[D], w = n(w, A); return w = n(w, b), w ^= s, 0 > w && (w = (2147483647 & w) + 2147483648), w %= 1e6, w.toString() + "." + (w ^ S) } ================================================ FILE: bilibili/bilibili.py ================================================ from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.common.action_chains import ActionChains from selenium.common.exceptions import TimeoutException from PIL import Image from io import BytesIO from time import sleep from getpass import getpass import random """ info: author:CriseLYJ github:https://github.com/CriseLYJ/ update_time:2019-3-7 """ class BiliBili(): """ 登陆B站, 处理验证码 电脑的缩放比例需要为100%, 否则验证码图片的获取会出现问题 """ def __init__(self, username, password): """ 初始化 """ options = webdriver.ChromeOptions() # 设置为开发者模式,避免被识别 options.add_experimental_option('excludeSwitches', ['enable-automation']) self.browser = webdriver.Chrome(options=options) self.url = 'https://passport.bilibili.com/login' self.browser.get(self.url) self.wait = WebDriverWait(self.browser, 5, 0.2) self.username = 
def get_position(self, button):
    """Return the on-page bounds of the captcha image.

    Hovers over the slider knob, waits for the element with class
    ``gt_box`` to be present, pauses two seconds, then reads the
    element's location and size.

    :param button: the slider knob element
    :return: ``(top, bottom, left, right)`` computed from the element's
        location and size
    """
    ActionChains(self.browser).move_to_element(button).perform()
    locator = (By.CLASS_NAME, 'gt_box')
    captcha_box = self.wait.until(EC.presence_of_element_located(locator))
    sleep(2)
    origin = captcha_box.location
    extent = captcha_box.size
    print(origin, extent)
    top = origin['y']
    left = origin['x']
    bottom = top + extent['height']
    right = left + extent['width']
    return top, bottom, left, right
def is_pixel_equal(self, img1, img2, x, y):
    """Tell whether two images have (nearly) the same pixel at one spot.

    The pixel compared is the one at ``(x - 1, y)`` in each image. The
    pixels count as equal when each of the first three channels differs
    by less than 100.

    :param img1: original captcha image
    :param img2: captcha image with the gap
    :param x: x coordinate of the pixel
    :param y: y coordinate of the pixel
    :return: ``True`` if the pixels are considered equal, else ``False``
    """
    pixel_a = img1.load()[x - 1, y]
    pixel_b = img2.load()[x - 1, y]
    threshold = 100
    # Channels are checked in order and the check stops at the first
    # channel that differs too much.
    return all(
        abs(pixel_a[channel] - pixel_b[channel]) < threshold
        for channel in range(3)
    )
def crack(self):
    """Run the whole login-and-captcha flow.

    1. Enter the account and password.
    2. Fetch the slider knob.
    3. Capture the two captcha images.
    4. Compute the slider track.
    5. Drag the slider along the track.

    The drag is attempted at most three times; the method returns as soon
    as the verification text appears.

    :return: None
    """
    self.login()
    knob = self.get_button()
    original, with_gap = self.get_geetest_image(knob)
    gap = self.get_gap(original, with_gap)
    print(gap)
    track = self.get_track(gap)
    # Retry the drag when verification fails, three attempts at most.
    result_locator = (By.CLASS_NAME, 'gt_info_type')
    for _ in range(3):
        self.move_button(knob, track)
        try:
            success = self.wait.until(
                EC.text_to_be_present_in_element(result_locator, '验证通过:'))
            print(success)
        except TimeoutException:
            print('fail')
        else:
            print('success')
            return None
async def crawl(self):
    """Launch the browser, open the login page and submit the credentials.

    The anti-detection scripts are registered with
    ``evaluateOnNewDocument`` *before* navigating. Evaluating them after
    ``goto`` (as was done previously) patches only the already-loaded
    document: the page's own scripts have run by then, and the patches are
    lost on the next navigation.
    """
    # headless is False for testing; it can be switched to a headless
    # browser in production.
    self.browser = await launch({
        'headless': False,
        'userDataDir': cache_dir,
        'defaultViewport': {'width': 1440, 'height': 1000},
        'args': ['--no-sandbox']
    })
    self.page = await self.browser.newPage()
    # Fake the browser state so automation-detection checks do not trigger.
    codes = (
        "() =>{ Object.defineProperties(navigator,{ webdriver:"
        "{ get: () => false } }) }",
        "() =>{ window.navigator.chrome = { runtime: {}, }; }",
        "() =>{ Object.defineProperty(navigator, 'languages', "
        "{ get: () => ['en-US', 'en'] }); }",
        "() =>{ Object.defineProperty(navigator, 'plugins', { "
        "get: () => [1, 2, 3, 4, 5,6], }); }"
    )
    for code in codes:
        await self.page.evaluateOnNewDocument(code)
    await self.page.goto(self.url)
    await self.send_key()
if __name__ == '__main__':
    # Local import: this script does not import getpass at file level.
    from getpass import getpass

    account = input("请输入你的账号:")
    # Read the password without echoing it to the terminal.
    password = getpass("请输入你的密码:")
    login = DouBanLogin(account, password)
    login.run()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Login to Facebook')
    parser.add_argument('email', help='Email address')
    parser.add_argument('password', help='Login password')
    args = parser.parse_args()

    session = requests.session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    })

    # login() returns a 3-tuple on success and False on failure, so the
    # result must be checked before it is unpacked; unpacking False raises
    # TypeError and the failure message would never be printed.
    result = login(session, args.email, args.password)
    if result:
        fb_dtsg, user_id, xs = result
    else:
        fb_dtsg = user_id = xs = None

    if user_id:
        print('{0}:{1}:{2}'.format(fb_dtsg, user_id, xs))
    else:
        print('Login Failed')
html).group(1) img_url = 'https://account.guokr.com/captcha/' + captcha_rand with open('captcha.jpg', 'wb') as fw: fw.write(session.get(img_url, headers=headers_login).content) username = input('请输入用户名:') password = input('请输入密码:') captcha = input('请输入验证码 : ') data = { 'csrf_token': csrf_token, 'username': username, 'password': password, 'captcha': captcha, 'captcha_rand': captcha_rand, 'permanent': 'y ', } response = session.post(url, data=data) with open('response.html', 'w', encoding='utf-8') as fw: fw.write(response.text) # print(response.cookies) # print(session.cookies) headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.61 Safari/537.36' } homepage = 'https://www.guokr.com/i/0210199872/' with open('homepage.html', 'w', encoding='utf-8') as fw: res = session.get(homepage, headers=headers) fw.write(res.text) ================================================ FILE: guoke/guoke_spider.py ================================================ # -*- coding: utf-8 -*- import requests from urllib.parse import urlencode from requests import codes import os from multiprocessing.pool import Pool from bs4 import BeautifulSoup as bsp import json import time import re """ info: author:CriseLYJ github:https://github.com/CriseLYJ/ update_time:2019-3-7 """ def get_index(offset): base_url = 'http://www.guokr.com/apis/minisite/article.json?' 
def get_url(json):
    """Yield the article URLs found in an index response.

    Entries that carry a non-``None`` ``cell_type`` are skipped.

    :param json: decoded index response (a dict), or ``None`` when
        ``get_index`` could not fetch the page
    :return: generator over the ``url`` value of each remaining entry
    """
    if not json:
        # get_index returns None on a connection error or a non-OK status;
        # there is nothing to yield in that case.
        return
    for item in json.get('result') or ():
        if item.get('cell_type') is not None:
            continue
        yield item.get('url')
def do_try(url):
    """Apply for the trial of one product.

    Opens ``url`` in the second tab, clicks the apply button and, when the
    follow-the-shop dialog shows up, confirms it. The first tab (the
    product list) is selected again before returning, whatever happens.

    :param url: address of the product's trial page
    :return: ``True`` if the application was submitted; ``False`` if the
        apply button did not appear in time or is not labelled
        '申请试用' (already applied, or the product is in error)
    """
    apply_selector = '#product-intro > div.info > div.try-info.clearfix.bigImg > div.info-detail.chosen > div > div.btn-wrap > a'
    follow_selector = 'body > div.ui-dialog > div.ui-dialog-content > div > div > div.btn > a.y'
    browser.switch_to.window(browser.window_handles[1])
    try:
        try:
            browser.get(url)
            time.sleep(2)
            button = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, apply_selector))
            )
        except TimeoutException:
            # No apply button at all: nothing was submitted, so this must
            # not be counted as a successful application.
            return False
        if button.text != '申请试用':
            return False
        button.click()
        try:
            # Wait for the follow-the-shop dialog and confirm it.
            button2 = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, follow_selector))
            )
        except TimeoutException:
            # No dialog means no follow is required; the application itself
            # has already been submitted by the click above.
            return True
        time.sleep(1)
        button2.click()
        time.sleep(2)
        return True
    finally:
        # Always go back to the product-list tab.
        browser.switch_to.window(browser.window_handles[0])
  • 标签 doc = pq(html) #因为已经申请过的商品的
  • 标签中的class除了item,还有applied,故将其删除之后申请便可跳过已申请的商品 doc('.applied').remove() items = doc('.root61 .container .w .goods-list .items .con .clearfix .item').items() #print(type(items)) #print(items) items=list(items) for item in items: #获得每个商品的标题,如果进行商品过滤则有可能有用 title = item('.p-name').text() if check_name(title) == False: continue price_text = item('.p-price').text()[6:] price = float(price_text) if price < float(SET['price_limit']): continue try_url = 'https:'+item('.link').attr('href') print('价格: ',price) print(title) #print(try_url) time.sleep(1) global total_num_of_products_cur global total_num_of_products if do_try(try_url) == True: total_num_of_products_cur +=1 print("申请成功") print('') else : print("申请失败") print('') #到达指定个数之后退出 if total_num_of_products_cur >= total_num_of_products: return def Control_try(total_page): browser.execute_script('window.open()') browser.switch_to.window(browser.window_handles[0]) for page in range(1,total_page+1): print('开始申请第'+str(page)+'页') get_try(page) global total_num_of_products global total_num_of_products_cur if total_num_of_products_cur >= total_num_of_products: return print('第'+str(page)+'页申请完成') #成功登录后将browser_for_login的cookies取出放到无头browser中即可 def login(): browser_for_login.get('https://passport.jd.com/new/login.aspx') while browser_for_login.current_url!='https://www.jd.com/': time.sleep(2) cookies = browser_for_login.get_cookies() browser_for_login.close() browser.get('https://www.jd.com') for cookie in cookies: browser.add_cookie(cookie) browser.get('https://www.jd.com') def auto_showdown(): if SET['auto_shutdown'] == True: print('\n5秒后将自动关机') time.sleep(5) os.system('shutdown -s -t 1') def deal_file(): global choice_list global ban_list if SET['choice']==True: with open('choice.txt','r') as f: choice_list = re.split('[ |.|,|!|\n]',f.read()) f.close() if SET['ban']==True: with open('ban.txt','r') as f: ban_list = re.split('[ |.|,|!|\n]',f.read()) f.close() def check_name(title): is_choice = False if len(choice_list)==0: 
is_choice = True for ch in choice_list: if ch in title: is_choice = True break if is_choice == False: return False is_ban = False for ba in ban_list: if ba in title: is_ban = True break if is_ban == True: return False return True if __name__ == '__main__': deal_file() login() #申请前SET['total_num_of_page']页 Control_try(SET['total_num_of_page']) browser.close() print('申请完成') auto_showdown() ================================================ FILE: jd_login/Method_First/ban.txt ================================================ Ȥ ˬ ================================================ FILE: jd_login/Method_First/choice.txt ================================================ ž ȳ Ǯ ================================================ FILE: jd_login/Method_First/config.py ================================================ #coding:utf-8 settings = { 'auto_shutdown':False, #是否自动关机,默认为False 'total_products':300, #要申请的商品个数上限,默认为300 'total_num_of_page':50, #申请前total_num_of_page页 'choice':False, #是否按照商品名称选择要申请的商品,如果设置为True,则应该创建choice.txt文件 #并将想要的商品名称写进去即可。默认为False 'ban':False #是否按照商品名称选择要过滤掉的商品,如果设置为True,则应该创建ban.txt文件 #并将想过滤掉的商品名称写进去即可。(不同商品名称之间用,.!空格或换行符隔开即可)默认为False } ================================================ FILE: jd_login/Method_Second/Config.py ================================================ """ Config.py 配置文件 """ settings = { #一天申请的限制个数 'maxApplyNum' : 300 , #试用类型 #家用电器737 手机数码652 电脑办公670 家居家装1620 服饰鞋包1315 生鲜美食12218 钟表奢品5025 家庭清洁15901 食品饮料1320 'cids' : ['737', '652' ,'670', '1620', '1315', '12218' ,'5025' , '15901' ,'1320' ,] , #申请商品价格下限 单位 元 'goodPrice' : 30 , #浏览器button最长等待时间 单位秒 'waitTime' : 10 , #试用结束后是否自动关机 True代表关机 'shutdown' : False , } ================================================ FILE: jd_login/Method_Second/Truekeyword.txt ================================================ / 表带 手表 手机 华为 huawei mate vivo oppo 小米 苹果 apple MacBook 电脑 笔记本 ipad/ 套 膜 钢化 全包 壳 支架 防水袋 自拍杆 三脚架 内存卡 / / 流量卡 手机卡 不限速 上网卡 日租卡 无限流量 0月租 纯流量 电信号码 移动号码 联通号码 / 苹果皮 智能机器人 手机电池 机器人盒子 儿童麦克风 / 腕带 充电器底座 专用电插排 / 
补光灯 手机声卡 有线话筒 美颜灯 / 游戏手柄 吃鸡神器 手机散热器 吃鸡按键 吃鸡辅助 手游充电线 走位神器 单反 微单 相机 / 清洁棒 除灰 手柄 苹果 MacBook / 专用键盘布 / 运动臂包 壁虎支架 / 手链 项链 耳钉 耳环 珠宝 耳坠 吊坠 平安扣 菩提 手串 佛珠 戒指 手镯 挂件 文玩 镯子 骨链 脚链 尾戒 弥勒佛 档位珠 转运珠 木料 阿梵尼 束发带 领针 胸针 银饰 金饰 / 头部按摩器 头部按摩仪 脑部按摩爪 肩颈按摩器 电动按摩捶 洁面仪 腰部按摩器 全身按摩垫 颈部按摩器 颈椎腰按摩器 / 足疗机 / 离子精华导入仪 美容仪 补水仪 蒸脸器 眼部按摩仪 点歌机 电陶炉专用清洁刮刀 颈部护颈按摩仪 洗脸仪 洁面仪 颈椎按摩器 脖子肩颈仪 按摩护颈仪 / 电视架 电视挂架 电视机支架 电视支架 外机架 室外空调支架 托架配件 指环 / 收款播放器 收钱提示音响 / 丰胸 乳房 下垂 胸部 乳腺 美胸 文胸 / 电热护颈 颈部热敷 / 冲牙器 水牙线 洗牙器 洁牙机 / 滤芯 滤网 过滤网 空调滤 / 洗衣机罩 / 吸顶喇叭 定压喇叭 时序器 / 时序器 转换插头 切换器 监控摄像头 监控电源 电源适配器 / 财务软件 用友 管理软件 服务手册 / 硒鼓 粉盒 粉仓 感光鼓 文档保护页 底片夹 扫描配件 幻灯片片夹 墨水 搓纸轮 分页器 碳粉 墨粉 / 方块机 显示器支架 电脑架 / 键盘膜 卡套挂绳 桌牌 党员牌 口取纸 分类纸 / 机顶盒 挂历 台历 / 水族胶 莫斯胶 啫喱胶 珊瑚胶 水草胶 / 拼装模型 海绵胶带 标签纸 贴纸 车 / 贴 蜡划 车痕 修复液 车漆 钥匙包 流氓灯 大灯 手机架 支架 方向盘套 车把套 密封 缝隙 封条 除积碳 保险杠 防撞 防擦 门锁 脚垫 烟灰缸 烟缸 遮挡 摆件 车饰 抛光 喷雾 镀膜 饰品 挂件 钥匙 后视镜 雨眉 洗车液 / 雨刮 / 精油 除臭 除味 香包 香囊 硅藻 除味剂 碳包 香水 添加剂 燃油宝 清洗剂 竹炭包 补胎液 活矿石 / 马桶盖 毛巾架 浴巾架 喷头 浴室置物架 铰链 合页 西餐垫 挂钩 阀 水管 接头 衣钩 拉手 暖气管 软管 / 打泡器 起泡器 口诀表 机油 汽油 循环泵 继电器 插座面板 墙壁电源 金属开孔器 接线端子 电线连接器 展架 kt板 海报架 / 全包装修 全国承建 乡墅 装饰画 挂画 福字 防水涂料 补漏 缝剂 拉手 补漆 保护条 墙角 壁纸 墙纸 补墙漆 翻新漆 背景墙 地漏 落水器 / 鞋 裤 背心 夹克 外套 羽绒服 护腰带 童装 鞋拔 耳勺 掏耳朵 爆炸盒子 手工相册 嵌甲 甲沟炎 雪地靴 男靴 女靴 睡衣 泳衣 / 上饵器 老花镜 衬衫 打火机 帽 剑 眼镜 烟嘴 外套 针织衫 运动服 毛衣 打底衫 羊绒衫 T恤 靴 蕾丝手套 棉服 羽绒服 / 暖宫贴 宫寒贴 痛经 暖宫 暖宝宝 暖贴 暖身贴 发热贴 被子固定器 修复颈椎 中药热疗 颈椎病 / 空调套 空调罩 挂机罩 / 十字绣 钻石绣 草坪 人造草皮 圆顶吊顶 充气泵 打气泵 轨道 滑道 阻尼缓冲 防雾剂 吸顶灯 / 灭火器 预付款 防水插座 灯带 投光灯 厂房灯 广告牌 射灯 室外灯 院灯 路灯 灯笼 玄关灯 过道灯 走廊灯 吊灯 / 仿古电话 点烟器 贴墙仪 火花塞 / 面膜 唇膏 香盒 / 弹弓 小拼图 牙套矫正器 / 刮痧板 刮脸板 电子烟 镜框 营养师 减肥 减脂 烟油 烟液 烟雾 长筒袜 高筒袜 半截袜子 过膝袜 甲沟 童袜 棉衣 戒烟贴 平衡球 训练梯 敏捷梯 烟嘴 / 信息素 棉袜 袜子 车包 荧光粉 发光粉末 夜光沙 领带夹 内衣 檀香 供香 挡风板 防风罩 风衣 永生花 迷彩雨披 高尔夫球 望远镜 连帽 烟盒 纸手表 / 舌帽 字母灯 溜冰鞋 太阳花 饰唯美 太阳镜 / 缓冲门 阻尼弹簧 门吸 地吸 松动剂 润滑剂 情趣灯 防撞胶粒 胶垫 防撞贴 / 海参 闸蟹 阳澄湖 章鱼 八爪鱼 鱿鱼 海螺 / 蛋挞皮 牙签机 / 灭鼠药 杀鼠剂 蟑螂药 灭蟑螂 杀蟑螂 除蟑螂 捕鼠器 灭鼠器 驱鼠器 甲醛检测 检测甲醛 防静电喷雾 边夹 发卡 bb夹 发箍 儿童头饰 发饰 / 除味盒 去味剂 除臭剂 除味剂 蔬果净 湿巾 恒源祥 / / 连衣裙 冬季衣服 毛呢大衣 / 芡实茶 祛湿茶 花茶包 薏米茶 除湿气茶 薏仁茶 姜茶 枳椇子 润甘元 茶叶 菊花茶 水果茶 绿茶 蒙顶山茶 粉墨茶 向阳汤 玫瑰花茶 养生茶 荷叶茶 普洱茶 柑普茶 昆仑菊花 云雾茶 葛根茶 菊苣根茶 枸杞茶 葛根片 胖大海 金银花 罗汉果 金花砖茶 崖黑茶 黑茶 大红袍 武夷岩茶 柑普茶 荷叶茶 润喉茶 / 餐厅用酒 原浆老酒 朗姆酒 鸡尾酒 烹饪酒 黄酒 花雕酒 
def genekeys(path="Truekeyword.txt"):
    """Load the keyword rules from the keyword file.

    Each non-empty line has the form ``first words / second words``. Both
    sides are stripped and split on single spaces; an empty side becomes
    an empty list. A line without ``/`` is treated as having an empty
    second side. Any further ``/``-separated segments are kept unchanged
    after the first two entries.

    :param path: keyword file to read (UTF-8); defaults to
        ``Truekeyword.txt`` in the current working directory
    :return: list of rules, each ``[first_words, second_words, ...]``
    """
    keys = []
    # The context manager closes the file once parsing is done.
    with open(path, 'r', encoding='UTF-8') as f:
        for raw in f:
            # Remove only the line terminator. Slicing up to find('\n')
            # cut the last character off a final line with no newline.
            line = raw.rstrip('\n')
            if line == '':
                continue
            parts = line.split('/')
            if len(parts) < 2:
                # No separator: avoid an IndexError on the second side.
                parts.append('')
            for index in (0, 1):
                words = parts[index].strip()
                parts[index] = words.split(' ') if words else []
            keys.append(parts)
    return keys
def trycid():
    """Apply for trials in every configured category.

    Loads the keyword rules, opens the trial list in the first tab, creates
    a second tab through JavaScript, and then walks the category ids from
    ``settings['cids']`` in order, threading the running count through
    ``get_try``.

    Returns:
        The number of successful applications so far (iApplyNum).
    """
    rules = genekeys()
    # JD limits how many trials may be requested per day.
    daily_cap = settings['maxApplyNum']
    applied = 0
    categories = settings['cids']

    for url in ('https://try.jd.com/',
                'https://try.jd.com/activity/getActivityList'):
        browser.get(url)

    # Open a new tab with a script, then return to the first tab.
    browser.execute_script('window.open()')
    first_tab = browser.window_handles[0]
    browser.switch_to.window(first_tab)

    for category in categories:
        applied = get_try(category, applied, daily_cap, rules)
    return applied
time.sleep(2) #取得用户名和密码的过程 #如果文件不存在 if os.path.exists("login.txt") == False: username = input("请输入京东用户名:") password = getpass.getpass("请输入京东密码(输入不会显示在屏幕上):") else: #从文件中读入用户名和密码 with open("login.txt",) as f: up = f.read() up = up.split('\n') username = up[0].encode() password = up[1].encode() #base64解码 username = base64.b64decode(username) username = username.decode() password = base64.b64decode(password) password = password.decode() #找到输入框 input_username = browser_login.find_element_by_name('loginname') #输入用户名 input_username.send_keys(username) #找到密码框 input_password = browser_login.find_element_by_name('nloginpwd') #输入密码 input_password.send_keys(password) #找到登录按钮 button_logOK = browser_login.find_elements_by_id('loginsubmit') button_logOK = button_logOK[0] time.sleep(2) #点击 button_logOK.click() #循环检测是否登陆 while 1: try: wait_login.until( EC.presence_of_element_located((By.CSS_SELECTOR, '#ttbar-login > div.dt.cw-icon > a')) ) break except TimeoutException: continue print('登陆成功!') time.sleep(2) #登录成功后 若不存在login.txt,则把用户名和密码写入文件 if os.path.exists("login.txt") == False: #base64编码 username = username.encode() username = base64.b64encode(username) password = password.encode() password = base64.b64encode(password) # 写入文件中 with open("login.txt", "w") as f: f.write(username.decode() +"\n") f.write(password.decode()) #把登陆浏览器的cookie转移到无界面浏览器上 #取得原浏览器的所有cookie cookies = browser_login.get_cookies() browser.get('https://www.jd.com') #cookies是一个以字典为元素的list for cookie in cookies: browser.add_cookie(cookie) #关闭登陆浏览器 browser_login.quit() if __name__ == '__main__': #登陆 login() #开始申请 iApplyNum为申请成功的个数 iApplyNum = trycid() #申请结束 closeSW(iApplyNum) ================================================ FILE: jd_login/README.md ================================================ #### Jd Spider. 
class Lagou_login(object):
    """Log in to passport.lagou.com with a shared ``requests`` session.

    Attributes:
        session: the ``requests`` session reused by every call.
        CaptchaImagePath: where the captcha image is saved, next to this file.
        HEADERS: base headers sent with every request.
    """

    def __init__(self):
        self.session = requests.session()
        self.CaptchaImagePath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'captcha.jpg'
        self.HEADERS = {'Referer': 'https://passport.lagou.com/login/login.html',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                                      ' (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36'
                                      ' Core/1.53.4882.400 QQBrowser/9.7.13059.400',
                        'X-Requested-With': 'XMLHttpRequest'}

    def encryptPwd(self, passwd):
        """Return the salted double-MD5 hex digest of *passwd*.

        The password is MD5-hashed, wrapped in the fixed salt ``'veenike'``
        on both sides (a value hard-coded in the site's JavaScript according
        to the original author), and hashed again.
        """
        passwd = hashlib.md5(passwd.encode('utf-8')).hexdigest()
        passwd = 'veenike' + passwd + 'veenike'
        passwd = hashlib.md5(passwd.encode('utf-8')).hexdigest()
        return passwd

    def getTokenCode(self):
        """Fetch the login page and extract the anti-forgery token and code.

        Returns:
            dict: the ``X-Anit-Forge-Token`` / ``X-Anit-Forge-Code`` headers.

        Raises:
            IndexError: if the page has no second ``<script>`` element or the
                expected ``= '...'`` assignments are missing from it.
        """
        login_page = 'https://passport.lagou.com/login/login.html'
        data = self.session.get(login_page, headers=self.HEADERS)
        soup = BeautifulSoup(data.content, "lxml", from_encoding='utf-8')
        anti_token = {'X-Anit-Forge-Token': 'None', 'X-Anit-Forge-Code': '0'}
        # The values are read from lines 1 and 2 of the second <script> block.
        anti = soup.findAll('script')[1].getText().splitlines()
        anti = [str(x) for x in anti]
        anti_token['X-Anit-Forge-Token'] = re.findall(r'= \'(.+?)\'', anti[1])[0]
        anti_token['X-Anit-Forge-Code'] = re.findall(r'= \'(.+?)\'', anti[2])[0]
        return anti_token

    def getCaptcha(self):
        """Download the captcha, show it to the user and return what they type."""
        captchaImgUrl = 'https://passport.lagou.com/vcode/create?from=register&refresh=%s' % time.time()
        # Fetch first, then write inside a context manager: the previous
        # open()/close() pair leaked the handle when the request raised.
        image = self.session.get(captchaImgUrl, headers=self.HEADERS).content
        with open(self.CaptchaImagePath, 'wb') as f:
            f.write(image)
        # Open the image with the platform's default viewer.
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['open', self.CaptchaImagePath])
        elif sys.platform.find('linux') >= 0:
            subprocess.call(['xdg-open', self.CaptchaImagePath])
        else:
            os.startfile(self.CaptchaImagePath)
        captcha = input("请输入当前地址(% s)的验证码: " % self.CaptchaImagePath)
        print('你输入的验证码是:% s' % captcha)
        return captcha

    def login(self, user, passwd, captchaData=None, token_code=None):
        """Post the credentials, asking for a captcha as often as the site demands.

        Args:
            user: phone number or e-mail address.
            passwd: password already processed by :meth:`encryptPwd`.
            captchaData: captcha text to send, or ``None`` for none.
            token_code: anti-forgery headers; fetched from the login page
                when ``None``.

        Returns:
            The raw response body (bytes) when the reply has ``state == 1``,
            otherwise ``False`` after printing the server's message.
        """
        login_url = 'https://passport.lagou.com/login/login.json'
        # A loop replaces the former self-recursion, so repeated captcha
        # challenges cannot exhaust the call stack.
        while True:
            postData = {'isValidate': 'true',
                        'password': passwd,
                        # Send the captcha only when one was supplied.
                        'request_form_verifyCode': (captchaData if captchaData is not None else ''),
                        'submit': '',
                        'username': user
                        }
            login_headers = self.HEADERS.copy()
            if token_code is None:
                token_code = self.getTokenCode()
            login_headers.update(token_code)

            # Example failure payload:
            # {"content":{"rows":[]},"message":"该帐号不存在或密码错误,请重新输入","state":400}
            response = self.session.post(login_url, data=postData, headers=login_headers)
            data = json.loads(response.content.decode('utf-8'))

            if data['state'] == 1:
                return response.content
            print(data['message'])
            if data['state'] != 10010:
                return False
            # State 10010: retry with a captcha and the token pair that the
            # server returned in this reply.
            captchaData = self.getCaptcha()
            token_code = {'X-Anit-Forge-Code': data['submitCode'],
                          'X-Anit-Forge-Token': data['submitToken']}
class DBHelper():
    """Small pymysql helper configured from the Scrapy project settings.

    Each public operation opens its own connection, runs one statement and
    closes the connection again.
    """

    def __init__(self):
        self.settings = get_project_settings()  # project settings object
        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']

    def connectMysql(self):
        """Return a connection to the server with no database selected."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               charset='utf8')
        return conn

    def connectDatabase(self):
        """Return a connection with the configured database selected."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               db=self.db,
                               charset='utf8')
        return conn

    def _execute(self, conn, sql, params=None, commit=False):
        """Run one statement on *conn*, then always release cursor and connection.

        The earlier copies of this logic skipped both ``close()`` calls when
        ``execute`` raised, leaking the connection.
        """
        try:
            cur = conn.cursor()
            try:
                cur.execute(sql, params)
                if commit:
                    conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()

    def createDatabase(self):
        """Create the configured database if it does not exist yet."""
        # The name comes from the project settings, not from user input.
        sql = "create database if not exists " + self.db
        self._execute(self.connectMysql(), sql)

    def createTable(self, sql):
        """Execute the given ``create table`` statement."""
        self._execute(self.connectDatabase(), sql)

    def insert(self, sql, *params):
        """Execute an insert with *params* bound to its placeholders, then commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def update(self, sql, *params):
        """Execute an update with *params* bound to its placeholders, then commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def delete(self, sql, *params):
        """Execute a delete with *params* bound to its placeholders, then commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)
class LiepinspdSpiderMiddleware(object):
    """Pass-through spider middleware.

    Methods that are left undefined make Scrapy behave as if the middleware
    does not modify the objects passed through it.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook ``spider_opened`` to its signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Run for each response entering the spider; returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element the spider produced, unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Run when the spider or ``process_spider_input()`` raises; does nothing."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a randomly chosen User-Agent per request."""

    def __init__(self, user_agent):
        """Store the pool of candidate User-Agent strings.

        Args:
            user_agent: an iterable of User-Agent strings.  ``None`` is
                treated as an empty pool and a single string as a pool of one.
        """
        if isinstance(user_agent, str):
            # random.choice() on a bare string would pick one character.
            user_agent = [user_agent]
        self.user_agent = list(user_agent or [])

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from the ``USER_AGENTS`` setting."""
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry.

        With an empty pool the request is left untouched; previously
        ``random.choice`` raised TypeError for an unset setting and
        IndexError for an empty list.
        """
        if not self.user_agent:
            return None
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
class LiepinspdPipeline(object):
    """Synchronous pipeline that inserts each item into ``company_base_info``."""

    # NOTE(review): these connection details were hard-coded in the original
    # constructor and are kept only so that ``LiepinspdPipeline()`` keeps
    # working.  Credentials should not live in source control; prefer the
    # MYSQL_* settings read by from_crawler().
    DEFAULT_HOST = 'rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com'
    DEFAULT_USER = 'cn_ainvest_db'
    DEFAULT_PASSWORD = 'cn_ainvest_sd3a1'
    DEFAULT_DB = 'special_data'

    INSERT_SQL = """
        insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industry,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site)
        VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """

    def __init__(self, host=None, user=None, password=None, db=None,
                 port=3306, charset='utf8'):
        """Open the connection and create the cursor used for every item.

        All arguments are optional; omitted ones fall back to the class
        defaults, so calling the constructor without arguments still works.
        """
        # Keyword arguments replace the former positional call, and the
        # charset that the original comment asked for is now actually set.
        self.conn = pymysql.connect(
            host=host or self.DEFAULT_HOST,
            user=user or self.DEFAULT_USER,
            password=password or self.DEFAULT_PASSWORD,
            database=db or self.DEFAULT_DB,
            port=port,
            charset=charset,
        )
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the project's ``MYSQL_*`` settings."""
        settings = crawler.settings
        return cls(
            host=settings.get('MYSQL_HOST'),
            user=settings.get('MYSQL_USER'),
            password=settings.get('MYSQL_PASSWD'),
            db=settings.get('MYSQL_DBNAME'),
            port=settings.getint('MYSQL_PORT', 3306),
        )

    def process_item(self, item, spider):
        """Insert *item*, commit, and hand the item on to the next pipeline.

        Returns:
            The same *item*.  The original returned nothing, so any later
            pipeline stage would have received ``None``.

        Raises:
            Whatever the database driver or the int/float conversions raise;
            the transaction is rolled back first.
        """
        try:
            self.cursor.execute(self.INSERT_SQL,
                                (item['as_of_date'], str(item['ticker']), str(item['company_name']),
                                 str(item['stage']), str(item['size']), str(item['city']),
                                 str(item['industry']), str(item['comp_clearfix']),
                                 int(item['job_count']), float(item['rate_num']),
                                 float(item['registered_capital']), item['spider_time'],
                                 item['origin_site'],))
            # Without a commit the row is not persisted.
            self.conn.commit()
        except Exception:
            # Leave the connection usable for the next item.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
# Pool of User-Agent strings; MyUserAgentMiddleware picks one at random for
# each request.  Two entries used to lack their trailing comma, so Python's
# implicit string concatenation silently glued each of them to the entry
# that followed, producing invalid User-Agent values.
USER_AGENTS = [
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
    "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'liepinSpd.middlewares.LiepinspdSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd.middlewares.LiepinspdDownloaderMiddleware': 543,
    # The key was spelled 'scrapy.downloadermiddleware...' (missing "s"),
    # which does not match the module that middlewares.py imports the
    # built-in class from, so the None entry did not refer to it.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpd.middlewares.MyUserAgentMiddleware': 400,
}
AutoThrottle extension (disabled by default) # See https://doc.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' ================================================ FILE: liepin/liepinSpd/liepinSpd/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. 
class LiepinSpdier(scrapy.Spider):
    """Crawl liepin.com company pages and yield one ``LiepinspdItem`` each.

    For every company id in ``companylist`` the spider requests
    ``https://www.liepin.com/company/<id>/`` and extracts the company name,
    summary tags (stage, size, city, industry), welfare tags, resume
    handling rate, open job count, registered capital and the mobile URL.
    The company name is then matched against a CSV of stock short names to
    attach a ticker.
    """

    name = 'liepin'
    companylist = [
        '7894126', '7941798', '5464493', '8280653', '8657147', '5696000',
        '6918711', '8801813', '7909112', '929719', '8635277', '9208490',
        '9427534', '7873563', '869131', '1983198', '8521820', '8441886',
        '9425884', '8269623', '8143143', '8144649', '8571478', '8646314',
        '9086358', '8361354', '8090600', '9652027', '9662729', '8029798',
        '8024700', '9274661', '8614537', '1852098', '845611', '7910884',
        '1947829', '6657987', '8463020', '8130349', '8323671', '723421',
        '1573297', '9582057', '1866404', '1074696', '8586065', '4811624',
        '857922', '7975388', '7931578', '6615613', '8243943', '682357',
        '8916773', '1050201', '950043', '7939262', '1730543', '9469426',
        '7883086', '8628525', '7868218', '8096323', '7862738', '7023768',
        '8862767', '9538671', '7953390', '515361', '2104592', '993518',
        '8212985', '1766564', '892388', '8646248', '9857531', '1043007',
        '8042835', '8980779', '571837', '7862722', '7935093', '8130825',
        '9111311', '8051561', '9107424', '856576', '7862125', '7947928',
        '854827', '4209085', '859352', '7931740', '7939262', '548548',
        '7916182', '8354065', '9740398', '8155722', '2331894', '884195',
        '9651734', '8534019', '7855573', '9617356', '886895', '2431058',
        '1939058', '8246296', '9145034', '8161625', '4450360', '540933',
        '4817469',
    ]
    # ``companylist`` contains a repeated id ('7939262').  Start requests are
    # not de-duplicated by Scrapy, so build the URLs from the unique ids
    # (first occurrence wins, order preserved) to avoid a duplicate item.
    start_urls = [
        f'https://www.liepin.com/company/{company}/'
        for company in dict.fromkeys(companylist)
    ]

    # Location of the short-name -> ticker table.  Raw string: the value is
    # the same as before, without relying on invalid escape sequences.
    ticker_csv_path = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Stored when no short name matches, so ``item['ticker']`` is always set.
    unmatched_ticker = '未匹配'
    # Lazily populated cache of (short_name, ticker) pairs.
    _ticker_rows = None

    def _load_ticker_rows(self):
        """Return the (short_name, ticker) pairs, reading the CSV only once.

        Rows whose short name is missing or empty are dropped: a NaN cannot
        be iterated and an empty name would match every company.
        Errors from ``pd.read_csv`` (e.g. a missing file) propagate.
        """
        if self._ticker_rows is None:
            data = pd.read_csv(self.ticker_csv_path, encoding='gbk')
            self._ticker_rows = [
                (short_name, code)
                for short_name, code in zip(data['股票简称'], data['股票代码'])
                if isinstance(short_name, str) and short_name
            ]
        return self._ticker_rows

    def _match_ticker(self, company_name):
        """Return the ticker whose short name is fully contained in the name.

        A row matches when every character of its short name occurs in
        ``company_name``.  When several rows match, the last one wins.
        Returns ``unmatched_ticker`` when nothing matches.
        """
        ticker = self.unmatched_ticker
        for short_name, code in self._load_ticker_rows():
            if all(char in company_name for char in short_name):
                ticker = code
        return ticker

    @staticmethod
    def _first_group(pattern, text, default):
        """Return group 1 of the first ``pattern`` match in ``text``, or ``default``."""
        match = re.search(pattern, text)
        return match.group(1) if match else default

    def parse(self, response):
        """Extract the basic company information from a company page.

        Yields a single ``LiepinspdItem``.  Pages without a company name
        are logged and skipped.  Other missing pieces fall back to neutral
        values (``''``, ``0`` or ``0.0``) instead of aborting the callback.
        """
        text = response.text

        name_nodes = response.xpath(
            '//div[@class="name-and-welfare"]//h1/text()').extract()
        if not name_nodes:
            self.logger.warning(
                'No company name found on %s; skipping', response.url)
            return
        company_name = name_nodes[0]

        # Summary tags come in the order stage, size, city, industry; pad so
        # a page showing fewer tags does not raise IndexError.
        summary_tags = response.xpath(
            '//div[@class="comp-summary-tag"]/a/text()').extract()
        stage, size, city, industry = (summary_tags + [''] * 4)[:4]

        # Company welfare tags, stored as the string form of the list.
        comp_clearfix = str(response.xpath(
            '//ul[@class="comp-tag-list clearfix"]//span/text()').extract())

        # Resume handling rate: a percentage converted to a fraction.
        rate_nodes = response.xpath(
            '//p[@class="rate-num"]//span/text()').extract()
        try:
            rate_num = float(rate_nodes[0].strip().rstrip('%')) / 100
        except (IndexError, ValueError):
            rate_num = 0.0

        job_count = int(self._first_group(r'. 共([0-9]+) 个', text, 0))

        # Registered capital in units of 10,000 CNY.  Anchor on the label
        # text rather than on the surrounding markup, and fall back to 0.0
        # when the figure is absent or not numeric.
        capital_text = self._first_group(
            r'注册资本[::]\s*(.*?)\s*万元人民币', text, '')
        try:
            registered_capital = float(capital_text.replace(',', ''))
        except ValueError:
            registered_capital = 0.0

        origin_site = self._first_group(r'"wapUrl":"(.*?)",', text, '')

        # Take the timestamp once so both date fields agree.
        now = datetime.now()

        item = LiepinspdItem()
        item['ticker'] = self._match_ticker(company_name)
        item['as_of_date'] = now.strftime("%Y-%m-%d %H:%M:%S")
        item['company_name'] = company_name
        item['stage'] = stage
        item['size'] = size
        item['city'] = city
        item['industry'] = industry
        item['comp_clearfix'] = comp_clearfix
        item['rate_num'] = rate_num
        item['job_count'] = job_count
        item['registered_capital'] = registered_capital
        item['spider_time'] = now.date()
        item['origin_site'] = origin_site
        yield item
class Liepinspd2SpiderMiddleware(object):
    """Pass-through spider middleware for the liepinSpd2 project.

    Every hook forwards its input unchanged; the only side effect is an
    info log line when a spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Called for each response entering the spider; accept it as is."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every request/item the spider produced, unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do not handle spider exceptions here; return ``None``."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log that ``spider`` has been opened."""
        # Lazy %-style arguments: the message is only formatted if emitted.
        spider.logger.info('Spider opened: %s', spider.name)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on each request.

    The candidate strings come from the ``USER_AGENTS`` setting.
    """

    def __init__(self, user_agent):
        """Store the pool of User-Agent strings.

        :param user_agent: a list of User-Agent strings; a single string is
            treated as a one-element pool and ``None`` as an empty pool.
        """
        if user_agent is None:
            user_agent = []
        elif isinstance(user_agent, str):
            # random.choice() on a str would pick a single character.
            user_agent = [user_agent]
        self.user_agent = list(user_agent)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USER_AGENTS`` setting."""
        # getlist() yields [] when the setting is absent, instead of None.
        return cls(user_agent=crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pick.

        Leaves the request untouched when the pool is empty.  Always
        returns ``None`` so processing of the request continues.
        """
        if not self.user_agent:
            return None
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
        spider.logger.debug('Using User-Agent: %s', agent)
        return None
class Liepinspd2Pipeline(object):
    """Synchronously insert every scraped job item into ``job_info`` (MySQL).

    A single connection and cursor are opened when the pipeline is created
    and closed in ``close_spider``.
    """

    def __init__(self):
        """Open the MySQL connection and cursor."""
        # NOTE(review): these credentials are committed in source control;
        # they should be moved to settings/environment and rotated.
        # Keyword arguments are required by PyMySQL >= 1.0 (positional
        # arguments are rejected there) and also work on older releases.
        # The charset is set explicitly because the stored text is Chinese.
        self.conn = pymysql.connect(
            host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
            user='cn_ainvest_db',
            password='cn_ainvest_sd3a1',
            database='special_data',
            charset='utf8mb4',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` as one row of ``job_info`` and commit.

        Text columns are stored as ``str(value)``; a field that was never
        set on the item is stored as NULL instead of raising ``KeyError``.
        ``as_of_date`` and ``spider_time`` are passed through unconverted.

        :returns: the item, so that later pipelines receive it.
        :raises Exception: whatever the driver raised, after a rollback.
        """
        insert_sql = """
            insert into job_info(as_of_date,ticker,company_name,job_name,job_label,salary,city,education,work_year,pub_time,job_describe,spider_time,function)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """

        def text(field):
            return str(item[field]) if field in item else None

        params = (
            item['as_of_date'],
            text('ticker'),
            text('company_name'),
            text('job_name'),
            text('job_label'),
            text('salary'),
            text('city'),
            text('education'),
            text('work_year'),
            text('pub_time'),
            text('job_describe'),
            item['spider_time'],
            text('function'),
        )
        try:
            self.cursor.execute(insert_sql, params)
            # Nothing is persisted without an explicit commit.
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared
            # connection; let Scrapy log the original error.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
You can find more settings consulting the documentation: # # https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'liepinSpd2' SPIDER_MODULES = ['liepinSpd2.spiders'] NEWSPIDER_MODULE = 'liepinSpd2.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'liepinSpd2 (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False MYSQL_HOST = 'localhost' MYSQL_DBNAME = 'day0123' MYSQL_USER = 'root' MYSQL_PASSWD = '123' DEFAULT_REQUEST_HEADERS = { 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9' } USER_AGENTS = [ "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5", "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5", "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)", "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413", "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19", "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", "Mozilla/5.0 (Linux; U; 
Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36", "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19", "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3", "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3", 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2', 
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 
(compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 
3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", ] PROXIES=['27.25.194.221:9999', '113.121.147.180:9999', '111.177.170.22:9999', '116.209.53.31:9999', '111.177.189.211:9999', '111.177.188.174:9999', '111.177.181.31:9999', '211.152.33.24:48749', '125.123.142.33:9999', '125.126.192.172:9999', '58.55.206.201:9999', '58.55.202.19:9999', '171.80.174.156:9999', '183.148.133.134:9999', '111.177.172.24:9999', 
'124.94.199.7:9999', '121.61.1.161:9999', '58.55.192.211:9999', '183.148.133.148:9999', '59.62.164.224:9999', '111.177.165.34:9999', '111.177.178.183:9999', '121.61.25.243:9999', '27.25.196.242:9999', '117.91.232.146:9999', '111.177.178.107:9999', '111.177.188.158:9999', '111.177.179.103:9999', '111.177.181.81:9999', '183.148.133.158:9999', '110.52.235.25:9999', '111.177.187.63:9999', '111.177.172.18:9999', '111.177.178.175:9999', '116.209.54.63:9999', '183.148.140.20:9999', '116.209.52.115:9999', '117.90.2.139:9999', '111.177.177.212:9999', '119.102.189.134:9999', '119.102.188.140:9999', '119.102.188.156:9999', '121.61.2.196:9999', '49.86.180.90:9999', '219.139.141.112:9999', '111.177.189.26:9999', '111.177.191.179:9999', '122.192.174.244:9999', '111.177.167.67:9999', '125.123.139.143:9999', '125.126.210.203:9999', '125.123.140.229:9999', '171.41.84.191:9999', '111.177.185.8:9999', '110.52.235.27:9999', '123.163.117.72:9999', '111.181.35.17:9999', '113.121.146.190:9999', '111.176.29.245:9999', '116.209.58.5:9999', '111.177.175.161:9999', '113.122.169.65:9999', '121.61.2.8:808', '121.61.0.140:9999', '111.176.23.161:9999', '116.209.54.236:9999', '171.41.85.124:9999', '125.126.209.156:9999', '180.119.68.211:9999', '111.177.191.214:9999', '58.50.1.139:9999', '59.62.166.108:9999', '115.151.2.63:9999', '111.177.179.41:9999', '171.41.84.200:9999', '115.151.5.40:53128', '59.62.164.163:9999', '121.61.2.128:9999', '116.209.54.117:9999', '111.177.161.26:9999', '125.123.140.246:9999', '111.181.35.55:9999', '125.123.143.70:9999', '171.41.85.163:9999', '112.85.130.88:9999', '121.61.0.165:9999', '171.80.136.10:9999', '111.177.188.81:9999', '115.151.2.101:9999', '171.41.85.201:9999', '113.121.145.6:9999', '121.61.0.98:9999', '171.41.86.14:9999', '111.177.172.77:9999', '111.177.171.222:9999', '110.52.235.11:9999', '111.176.28.141:9999', '183.148.145.122:9999', '110.52.235.206:9999', '111.177.189.246:9999'] # Configure maximum concurrent requests performed by Scrapy (default: 16) 
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'liepinSpd2.middlewares.Liepinspd2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd2.middlewares.Liepinspd2DownloaderMiddleware': 543,
    # Disable the built-in User-Agent middleware in favour of the rotating
    # one below.  The key must be the exact import path
    # ('downloadermiddlewares', plural) or the entry has no effect.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpd2.middlewares.MyUserAgentMiddleware': 400,
    # 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': None,
    # 'liepinSpd2.middlewares.ProxyMiddleware': 125,
    # 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'liepinSpd2.pipelines.Liepinspd2Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
class LiepinSpdier(scrapy.Spider):
    """Crawl the job postings of a fixed list of liepin.com companies.

    Flow: company page (``parse``) -> paginated job lists (``parse_list``)
    -> individual job pages (``parse_job``), which yield ``Liepinspd2Item``.
    """

    name = 'liepin'
    companylist = ['7894126', '7941798', '5464493', '8280653', '8657147', '5696000', '6918711', '8801813',
                   '7909112', '929719', '8635277', '9208490', '9427534', '7873563', '869131', '1983198',
                   '8521820', '8441886', '9425884', '8269623', '8143143', '8144649', '8571478', '8646314',
                   '9086358', '8361354', '8090600', '9652027', '9662729', '8029798', '8024700', '9274661',
                   '8614537', '1852098', '845611', '7910884', '1947829', '6657987', '8463020', '8130349',
                   '8323671', '723421', '1573297', '9582057', '1866404', '1074696', '8586065', '4811624',
                   '857922', '7975388', '7931578', '6615613', '8243943', '682357', '8916773', '1050201',
                   '950043', '7939262', '1730543', '9469426', '7883086', '8628525', '7868218', '8096323',
                   '7862738', '7023768', '8862767', '9538671', '7953390', '515361', '2104592', '993518',
                   '8212985', '1766564', '892388', '8646248', '9857531', '1043007', '8042835', '8980779',
                   '571837', '7862722', '7935093', '8130825', '9111311', '8051561', '9107424', '856576',
                   '7862125', '7947928', '854827', '4209085', '859352', '7931740', '7939262', '548548',
                   '7916182', '8354065', '9740398', '8155722', '2331894', '884195', '9651734', '8534019',
                   '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625',
                   '4450360', '540933', '4817469']

    # Start requests bypass Scrapy's duplicate filter, so repeated ids in
    # ``companylist`` would be crawled (and stored) twice; dict.fromkeys
    # removes repeats while keeping the original order.
    start_urls = [f'https://www.liepin.com/company/{company}/'
                  for company in dict.fromkeys(companylist)]

    # CSV with the columns '股票简称' (short stock name) and '股票代码'
    # (ticker).  Raw string: same value as before, without relying on
    # invalid escape sequences such as ``\w`` or ``\y``.
    ticker_csv_path = r'G:\workspace\y2019m01\/first_lagou\company300.csv'

    # Lazily loaded DataFrame shared by all responses (see _load_ticker_table).
    _ticker_table = None

    @classmethod
    def _load_ticker_table(cls):
        """Return the ticker DataFrame, reading the CSV only on first use."""
        if cls._ticker_table is None:
            cls._ticker_table = pd.read_csv(cls.ticker_csv_path, encoding='gbk')
        return cls._ticker_table

    def parse(self, response):
        """Read the page count of a company page and request every job-list page."""
        text = response.text
        # Total number of job-list pages, embedded in an inline script.
        page_match = re.search(r'var totalPage = ([0-9]+);', text)
        id_match = re.search(r'"pcUrl":"https://www.liepin.com/company/([0-9]+)/', text)
        if page_match is None or id_match is None:
            # Layout change, anti-bot page or a company without postings:
            # skip this company instead of aborting with AttributeError.
            self.logger.warning('Cannot find totalPage/company id on %s', response.url)
            return
        totalPage = int(page_match.group(1))
        compId = id_match.group(1)
        for i in range(1, totalPage + 1):
            print(f'第{i}页')
            url = f'https://www.liepin.com/company/{compId}/pn{i}'
            yield scrapy.Request(url, callback=self.parse_list)

    def parse_list(self, response):
        """Follow every job link found on one job-list page."""
        urls = response.xpath('//div[@class="job-info"]/a/@href').extract()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_job)

    def parse_job(self, response):
        """Extract one job posting and yield it as a ``Liepinspd2Item``."""
        item = Liepinspd2Item()
        text = response.text
        as_of_date = datetime.now()

        company_name = response.xpath('//div[@class="title-info"]//a/@title')[0].extract()
        job_name = response.xpath('//div[@class="title-info"]/h1/@title')[0].extract()
        # Salary / city / experience / education
        job_label = response.xpath('//li[@data-title=""]/span/text()').extract()
        salary = response.xpath('//p[@class="job-item-title"]/text()')[0].extract().strip(' \r\n')
        city = response.xpath('//p[@class="basic-infor"]//a/text()')[0].extract()
        qualifications = response.xpath('//div[@class="job-qualifications"]/span/text()')
        education = qualifications[0].extract()
        work_year = qualifications[1].extract()
        pub_time = response.xpath('//p[@class="basic-infor"]/time/@title')[0].extract()
        job_describe = ' '.join(
            response.xpath('//div[@class="content content-word"]/text()').extract())

        # The previous pattern had no capture group, so ``.group(1)`` raised
        # for every page.  Capture the text that follows the label instead,
        # skipping any markup in between.
        # NOTE(review): the exact markup after the label is an assumption;
        # confirm against a live job page.
        dept_match = re.search(r'所属部门:\s*(?:<[^>]*>\s*)*([^<\r\n]*)', text)
        function = dept_match.group(1).strip() if dept_match else ''

        data = self._load_ticker_table()
        try:
            for i in range(len(data)):
                short_name = data.loc[i, '股票简称']
                # A row matches when every character of the short stock name
                # occurs in the company name; the last matching row wins.
                if all(ch in company_name for ch in short_name):
                    item['ticker'] = data.loc[i, '股票代码']
        except Exception:
            # Best effort: a malformed row (e.g. NaN name) must not lose the
            # job record, but Ctrl-C/SystemExit are no longer swallowed.
            self.logger.exception('ticker匹配错误')

        item['as_of_date'] = as_of_date
        item['company_name'] = company_name
        item['job_name'] = job_name
        item['job_label'] = job_label
        item['salary'] = salary
        item['city'] = city
        item['education'] = education
        item['work_year'] = work_year
        item['pub_time'] = datetime.strptime(pub_time, u"%Y年%m月%d日").date()
        item['job_describe'] = job_describe
        item['function'] = function
        item['spider_time'] = as_of_date.date()
        yield item
class DBHelper():
    """Small convenience wrapper around pymysql driven by the Scrapy settings.

    Every operation opens its own connection and always closes it again,
    even when the statement fails.
    """

    def __init__(self):
        # Connection parameters come from the project's settings.py.
        self.settings = get_project_settings()
        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']

    def connectMysql(self):
        """Return a connection to the MySQL server without selecting a database."""
        return pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               # ``password`` replaces the deprecated ``passwd`` keyword.
                               password=self.passwd,
                               charset='utf8')

    def connectDatabase(self):
        """Return a connection with the configured database selected."""
        return pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               # ``password``/``database`` replace the deprecated
                               # ``passwd``/``db`` keywords.
                               password=self.passwd,
                               database=self.db,
                               charset='utf8')

    def _execute(self, conn, sql, params=None, commit=False):
        """Run one statement on *conn*, then always close cursor and connection."""
        try:
            cur = conn.cursor()
            try:
                if params is None:
                    cur.execute(sql)
                else:
                    cur.execute(sql, params)
                if commit:
                    conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()

    def createDatabase(self):
        """Create the configured database if it does not exist yet."""
        # The database name comes from trusted project settings; identifiers
        # cannot be passed as bound parameters.
        sql = "create database if not exists " + self.db
        self._execute(self.connectMysql(), sql)

    def createTable(self, sql):
        """Execute a ``CREATE TABLE`` statement given as *sql*."""
        self._execute(self.connectDatabase(), sql)

    def insert(self, sql, *params):
        """Execute an ``INSERT`` with positional *params* and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def update(self, sql, *params):
        """Execute an ``UPDATE`` with positional *params* and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def delete(self, sql, *params):
        """Execute a ``DELETE`` with positional *params* and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)
class LiepinspdSpiderMiddleware(object):
    """Pass-through spider middleware generated by the Scrapy project template.

    Every hook forwards its input unchanged; hooks that are not defined are
    treated by Scrapy as "do not modify the passed objects".
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory used by Scrapy: build the middleware and subscribe it to
        # the spider_opened signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Invoked for each response on its way into the spider.
        # Returning None lets processing continue.
        return None

    def process_spider_output(self, response, result, spider):
        # Invoked with whatever the spider produced for a response;
        # re-emit every Request / dict / Item untouched.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # Invoked when the spider (or another middleware's
        # process_spider_input) raised; None means "not handled here".
        return None

    def process_start_requests(self, start_requests, spider):
        # Same idea as process_spider_output, but for the start requests,
        # which have no associated response.  Only requests may be emitted.
        yield from start_requests

    def spider_opened(self, spider):
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on every request.

    The pool of candidate strings is read from the ``USER_AGENTS`` setting.
    """

    def __init__(self, user_agent):
        # A single string would otherwise be sampled character by character
        # by random.choice(); treat it as a one-element pool.
        if isinstance(user_agent, str):
            user_agent = [user_agent]
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        # With USER_AGENTS unset (None) or empty, random.choice() would raise
        # on every request; leave the request's headers untouched instead.
        if not self.user_agent:
            return None
        request.headers['User-Agent'] = random.choice(self.user_agent)
        return None
class LiepinspdPipeline(object):
    """Store scraped company records in MySQL using one synchronous connection.

    The connection is opened when the pipeline is created and closed in
    ``close_spider``.  Connection parameters default to the values that were
    previously hard-coded; when Scrapy builds the pipeline through
    ``from_crawler`` the ``MYSQL_*`` project settings take precedence.
    """

    def __init__(self,
                 host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
                 user='cn_ainvest_db',
                 password='cn_ainvest_sd3a1',
                 database='special_data',
                 port=3306):
        # NOTE(review): credentials are committed in plain text; prefer
        # supplying them through settings / environment.
        # Keyword arguments are required by PyMySQL >= 1.0; charset='utf8'
        # is needed because the stored text is Chinese.
        self.conn = pymysql.connect(host=host,
                                    port=port,
                                    user=user,
                                    password=password,
                                    database=database,
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the ``MYSQL_*`` settings, if present."""
        settings = crawler.settings
        overrides = {}
        for keyword, setting in (('host', 'MYSQL_HOST'),
                                 ('user', 'MYSQL_USER'),
                                 ('password', 'MYSQL_PASSWD'),
                                 ('database', 'MYSQL_DBNAME')):
            value = settings.get(setting)
            if value:
                overrides[keyword] = value
        overrides['port'] = settings.getint('MYSQL_PORT', 3306)
        return cls(**overrides)

    def process_item(self, item, spider):
        """Insert *item* into ``company_base_info`` and return it.

        Returning the item is required by Scrapy so that later pipelines and
        feed exporters still receive it.  On a database error the transaction
        is rolled back and the exception is re-raised.
        """
        insert_sql = """
            insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industry,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site)
            VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        params = (
            item['as_of_date'],
            str(item['ticker']),
            str(item['company_name']),
            str(item['stage']),
            str(item['size']),
            str(item['city']),
            str(item['industry']),
            str(item['comp_clearfix']),
            int(item['job_count']),
            float(item['rate_num']),
            float(item['registered_capital']),
            item['spider_time'],
            item['origin_site'],
        )
        try:
            self.cursor.execute(insert_sql, params)
            # Without a commit nothing is persisted.
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared connection.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
'884195', '9651734', '8534019', '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625', '4450360', '540933', '4817469'] DEFAULT_REQUEST_HEADERS = { 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.9' } BOT_NAME = 'liepinSpd' MYSQL_HOST = 'rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com' MYSQL_DBNAME = 'special_data' MYSQL_USER = 'cn_ainvest_db' MYSQL_PASSWD = 'cn_ainvest_sd3a1' MYSQL_PORT = 3306 SPIDER_MODULES = ['liepinSpd.spiders'] NEWSPIDER_MODULE = 'liepinSpd.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'liepinSpd (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False USER_AGENTS = [ "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5", "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5", "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)", "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413", "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19", "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30", "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; 
GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)" "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0", "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36", "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19", "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3", "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3", 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60', 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2', 'Mozilla/5.0 (Windows NT 
6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)', "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows 
NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; 
Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) 
Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3", "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1", "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid 
Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile 
# Base delay, in seconds, between requests to the same website.
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
DOWNLOAD_DELAY = 3

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd.middlewares.LiepinspdDownloaderMiddleware': 543,
    # Disable Scrapy's stock User-Agent middleware. The key must be the real
    # import path (package ``downloadermiddlewares``, plural, as imported in
    # this project's middlewares.py); an entry under any other path does not
    # switch the built-in middleware off.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Project middleware that sets a random User-Agent on every request.
    'liepinSpd.middlewares.MyUserAgentMiddleware': 400,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'liepinSpd.pipelines.LiepinspdPipeline': 300,
}
class LiepinSpdier(scrapy.Spider):
    """Spider that collects basic company information from liepin.com.

    One start URL is built per stock short name found in the company CSV
    file, and :meth:`parse` turns every response into one ``LiepinspdItem``.
    """

    name = 'liepin'

    # Raw strings keep the Windows backslashes literal instead of relying on
    # unrecognised escape sequences (the resulting path text is unchanged).
    data = pd.read_csv(r'G:\workspace\y2019m02\company500.csv', encoding='utf-8')
    companylist = data['股票简称']
    start_urls = [
        f'https://www.liepin.com/zhaopin/?key={company}' for company in companylist
    ]

    # CSV with the columns '股票简称' (stock short name) and '股票代码'
    # (stock code) used to attach a ticker to the scraped company.
    TICKER_CSV = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Stored in item['ticker'] when no row matches, so that the field is
    # always present on the yielded item.
    TICKER_NOT_MATCHED = '未匹配'
    _ticker_table = None

    @classmethod
    def _load_ticker_table(cls):
        """Read the ticker lookup CSV once and reuse it for every response."""
        if cls._ticker_table is None:
            cls._ticker_table = pd.read_csv(cls.TICKER_CSV, encoding='gbk')
        return cls._ticker_table

    def _match_ticker(self, company_name):
        """Return the stock code matching *company_name*.

        A row matches when every character of its stock short name occurs in
        the company name. When several rows match, the last one wins. If
        nothing matches, ``TICKER_NOT_MATCHED`` is returned. An error while
        scanning the rows is logged and the value found so far is kept.
        """
        table = self._load_ticker_table()
        ticker = self.TICKER_NOT_MATCHED
        try:
            for short_name, code in zip(table['股票简称'], table['股票代码']):
                if all(char in company_name for char in short_name):
                    ticker = code
        except Exception:
            # Best effort: a malformed row must not lose the whole item.
            self.logger.exception('ticker匹配错误')
        return ticker

    def parse(self, response):
        """Extract the main company information from one response.

        Args:
            response: the Scrapy response for one of ``start_urls``.

        Yields:
            LiepinspdItem: a single item describing the company.

        Raises:
            IndexError: if an expected node is missing from the page.
            AttributeError: if the job-count or ``wapUrl`` pattern is absent.
        """
        text = response.text

        company_name = response.xpath('//div[@class="name-and-welfare"]//h1/text()')[0].extract()

        # The summary tags are read positionally: stage, size, city, industry.
        comp_sum_tag = response.xpath('//div[@class="comp-summary-tag"]/a/text()').extract()
        stage = comp_sum_tag[0]
        size = comp_sum_tag[1]
        city = comp_sum_tag[2]
        industry = comp_sum_tag[3]

        # Company tags, stored as the string form of the extracted list.
        comp_clearfix = str(response.xpath('//ul[@class="comp-tag-list clearfix"]//span/text()').extract())

        # Resume handling rate: the page shows a percentage, stored as a fraction.
        rate_num = int(response.xpath('//p[@class="rate-num"]//span/text()')[0].extract()) / 100

        job_count = int(re.search(r'. 共([0-9]+) 个', text).group(1))

        # Registered capital in units of 10,000 CNY; 0.0 when the page has none.
        # NOTE(review): in this copy of the file the pattern was wrapped in
        # list-bullet text that looks like rendered markup, so only the stable
        # label/unit part is matched here - confirm against the live page.
        capital_match = re.search(r'注册资本:(.*?)万元人民币', text)
        registered_capital = float(capital_match.group(1)) if capital_match else 0.0

        origin_site = re.search(r'"wapUrl":"(.*?)",', text).group(1)

        now = datetime.now()
        item = LiepinspdItem()
        item['ticker'] = self._match_ticker(company_name)
        item['as_of_date'] = now.strftime("%Y-%m-%d %H:%M:%S")
        item['company_name'] = company_name
        item['stage'] = stage
        item['size'] = size
        item['city'] = city
        item['industry'] = industry
        item['comp_clearfix'] = comp_clearfix
        item['rate_num'] = rate_num
        item['job_count'] = job_count
        item['registered_capital'] = registered_capital
        item['spider_time'] = now.date()
        item['origin_site'] = origin_site
        yield item
class LiepinspecialcomSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; hooks that are not needed could
    be removed, in which case Scrapy behaves as if the middleware did not
    modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return

    def process_spider_output(self, response, result, spider):
        """Yield everything the spider produced for *response*, unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do not handle spider exceptions (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log that *spider* has been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a random User-Agent on every request.

    The candidate strings are taken from the ``USER_AGENTS`` setting.
    """

    def __init__(self, user_agent):
        """Store the pool of candidate User-Agent strings.

        Args:
            user_agent: a sequence of User-Agent strings. ``None`` (setting
                not defined) is treated as an empty pool, and a single string
                as a pool of one, so that it is not sampled per character.
        """
        if user_agent is None:
            pool = []
        elif isinstance(user_agent, str):
            pool = [user_agent]
        else:
            pool = list(user_agent)
        self.user_agent = pool

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USER_AGENTS`` setting."""
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random candidate.

        Returns None in every case; with an empty pool the request is left
        untouched instead of failing inside ``random.choice``.
        """
        if not self.user_agent:
            return None
        request.headers['User-Agent'] = random.choice(self.user_agent)
class LiepinspecialcomPipeline(object):
    """Item pipeline that inserts each company record into MySQL synchronously."""

    def __init__(self):
        """Open the database connection and create the cursor used for inserts."""
        # NOTE(review): host, user and password are hard-coded in source; they
        # should come from the project settings or the environment. They are
        # kept as-is here so the pipeline keeps writing to the same database.
        # Keyword arguments are used because positional connection arguments
        # are not accepted by current PyMySQL releases.
        self.conn = pymysql.connect(
            host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
            user='cn_ainvest_db',
            password='cn_ainvest_sd3a1',
            database='special_data',
            # The stored fields contain Chinese text, so set the charset explicitly.
            charset='utf8mb4',
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert *item* into ``company_base_info`` and pass it on.

        Args:
            item: mapping with the keys ``as_of_date``, ``ticker``,
                ``company_name``, ``size``, ``city`` and ``industry``.
            spider: the spider that produced the item (unused).

        Returns:
            The same *item*, so that later pipelines keep receiving it.

        Raises:
            Exception: whatever the driver raises; the transaction is rolled
                back before the error is re-raised.
        """
        insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,`size`,city,industry) VALUES(%s,%s,%s,%s,%s,%s)"
        params = (
            item['as_of_date'],
            str(item['ticker']),
            str(item['company_name']),
            str(item['size']),
            str(item['city']),
            str(item['industry']),
        )
        try:
            self.cursor.execute(insert_sql, params)
            # Without a commit nothing is persisted.
            self.conn.commit()
        except Exception:
            # Leave the connection usable for the next item.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
db=settings['MYSQL_DBNAME'], # user=settings['MYSQL_USER'], # password=settings['MYSQL_PASSWORD'], # cursorclass=pymysql.cursors.DictCursor # 指定cursor类型 # ) # # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接 # dbpool = adbapi.ConnectionPool('pymysql', **adbparams) # # 返回实例化参数 # return cls(dbpool) # # def process_item(self, item, spider): # """ # 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象 # """ # query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据 # # 添加异常处理 # query.addCallback(self.handle_error) # 处理异常 # # def do_insert(self, cursor, item): # # 对数据库进行插入操作,并不需要commit,twisted会自动commit # insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,`size`,city,industry) VALUES(%s,%s,%s,%s,%s,%s)" # cursor.execute(insert_sql, ( # item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['size']), str(item['city']), # str(item['industry']))) # # def handle_error(self, failure): # if failure: # # 打印错误信息 # print(failure) ================================================ FILE: liepin/liepinSpecialCom/liepinSpecialCom/settings.py ================================================ # -*- coding: utf-8 -*- # Scrapy settings for liepinSpecialCom project # # For simplicity, this file contains only settings considered important or # commonly used. 
BOT_NAME = 'liepinSpecialCom'

# MySQL connection parameters.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'day0123'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123'

SPIDER_MODULES = ['liepinSpecialCom.spiders']
NEWSPIDER_MODULE = 'liepinSpecialCom.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpecialCom (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}

# Pool of User-Agent strings; MyUserAgentMiddleware picks one at random for
# each request. Every entry must be followed by a comma: two adjacent string
# literals are silently concatenated into a single bogus entry.
USER_AGENTS = [
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
    "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'liepinSpecialCom.middlewares.LiepinspecialcomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpecialCom.middlewares.LiepinspecialcomDownloaderMiddleware': 543,
    # Disable Scrapy's stock User-Agent middleware. The key must be the real
    # import path (package ``downloadermiddlewares``, plural, as imported in
    # this project's middlewares.py).
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpecialCom.middlewares.MyUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'liepinSpecialCom.pipelines.LiepinspecialcomPipeline': 300,
}
class LiepinSpdier(scrapy.Spider):
    """Spider that scrapes company details from a fixed list of liepin.com job pages.

    Each response yields one item with the company name, size, address,
    industry and, when it can be matched, the stock code of the company.
    """

    name = 'liepin'
    start_urls = [
        'https://www.liepin.com/job/1917579081.shtml?d_sfrom=search_comp&d_ckId=d112af69ab58e7da8305520f55b31904&d_curPage=0&d_pageSize=15&d_headId=d112af69ab58e7da8305520f55b31904&d_posi=0',
        'https://www.liepin.com/job/1917549017.shtml?d_sfrom=search_comp&d_ckId=1bef90aa98c2e8da734552c320527ac0&d_curPage=0&d_pageSize=15&d_headId=1bef90aa98c2e8da734552c320527ac0&d_posi=0',
        'https://www.liepin.com/job/1917543155.shtml',
        'https://www.liepin.com/job/1917491571.shtml?d_sfrom=search_comp&d_ckId=5874ecde43eb4bd20e75fecb2709bf85&d_curPage=0&d_pageSize=15&d_headId=5874ecde43eb4bd20e75fecb2709bf85&d_posi=0',
        'https://www.liepin.com/job/1917505785.shtml?d_sfrom=search_comp&d_ckId=fe82f0f79cda01b1dd4c140ced26087c&d_curPage=0&d_pageSize=15&d_headId=fe82f0f79cda01b1dd4c140ced26087c&d_posi=0',
        'https://www.liepin.com/job/1916439263.shtml?d_sfrom=search_comp&d_ckId=d3f4428da37a0cd17a6235cb4a027f1e&d_curPage=0&d_pageSize=15&d_headId=d3f4428da37a0cd17a6235cb4a027f1e&d_posi=0',
        'https://www.liepin.com/job/1911157736.shtml?d_sfrom=search_comp&d_ckId=2cf44398e8273003087d5148e113ef8f&d_curPage=0&d_pageSize=15&d_headId=2cf44398e8273003087d5148e113ef8f&d_posi=0',
        'https://www.liepin.com/job/1917470663.shtml?d_sfrom=search_comp&d_ckId=9087e4fc55d61d200606fb906999f728&d_curPage=0&d_pageSize=15&d_headId=9087e4fc55d61d200606fb906999f728&d_posi=0',
        'https://www.liepin.com/job/1917533673.shtml?d_sfrom=search_comp&d_ckId=98408645fba7219d4d7f17f2714c96f0&d_curPage=0&d_pageSize=15&d_headId=98408645fba7219d4d7f17f2714c96f0&d_posi=0',
        'https://www.liepin.com/job/1917306593.shtml?d_sfrom=search_comp&d_ckId=85f632646e2b1ad7c06f436e25fd674d&d_curPage=0&d_pageSize=15&d_headId=85f632646e2b1ad7c06f436e25fd674d&d_posi=0',
        'https://www.liepin.com/job/199929552.shtml',
    ]

    # CSV with the columns '股票简称' (stock short name) and '股票代码'
    # (stock code). A raw string keeps the Windows backslashes literal
    # instead of relying on unrecognised escape sequences.
    TICKER_CSV = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Stored in item['ticker'] when nothing matches or matching fails; the
    # pipeline reads item['ticker'], so the field must always be set.
    TICKER_NOT_MATCHED = 'None'
    _ticker_table = None

    @classmethod
    def _load_ticker_table(cls):
        """Read the ticker lookup CSV once and reuse it for every response."""
        if cls._ticker_table is None:
            cls._ticker_table = pd.read_csv(cls.TICKER_CSV, encoding='gbk')
        return cls._ticker_table

    def _match_ticker(self, company_name):
        """Return the stock code matching *company_name*.

        A row matches when at most one character of its stock short name is
        missing from the company name. When several rows match, the last one
        wins. ``TICKER_NOT_MATCHED`` is returned when nothing matches or when
        an error occurs while scanning the rows.
        """
        table = self._load_ticker_table()
        ticker = self.TICKER_NOT_MATCHED
        try:
            for short_name, code in zip(table['股票简称'], table['股票代码']):
                hits = sum(1 for char in short_name if char in company_name)
                if hits >= len(short_name) - 1:
                    ticker = code
                    self.logger.debug('%s %s %s', hits, ticker, company_name)
        except Exception:
            ticker = self.TICKER_NOT_MATCHED
            self.logger.exception('ticker匹配错误')
        return ticker

    def parse(self, response):
        """Extract the main company information from one job page.

        Args:
            response: the Scrapy response for one of ``start_urls``.

        Yields:
            LiepinspdItem: a single item describing the company.

        Raises:
            IndexError: if the company-name node is missing from the page.
            AttributeError: if one of the size/address/industry patterns
                does not occur in the page text.
        """
        text = response.text

        company_name = response.xpath('//div[@class="about-position"]//a/text()')[0].extract()
        size = re.search(r'公司规模:(.*?)人', text).group(1)
        city = re.search(r'公司地址:(.*?)<', text).group(1)
        industry = re.search(r'行业.*?>(.*?)<', text).group(1)

        # NOTE(review): LiepinspdItem comes from the liepinSpd package although
        # this spider lives in liepinSpecialCom, whose items module defines
        # LiepinspecialcomItem with exactly the fields set below - confirm
        # which class is intended.
        item = LiepinspdItem()
        item['ticker'] = self._match_ticker(company_name)
        item['as_of_date'] = datetime.now()
        item['company_name'] = company_name
        item['size'] = size
        item['city'] = city
        item['industry'] = industry
        yield item
class LiepinspecialcomjobItem(scrapy.Item):
    """One job posting scraped from a company's liepin job list.

    Every key the spider assigns must be declared here: Scrapy raises
    ``KeyError`` when an undeclared field is set on an Item.
    """

    # Crawl timestamp.
    as_of_date = scrapy.Field()
    # Stock code matched for the company, or the string 'None'.
    ticker = scrapy.Field()
    company_name = scrapy.Field()
    job_name = scrapy.Field()
    salary = scrapy.Field()
    city = scrapy.Field()
    work_year = scrapy.Field()
    pub_time = scrapy.Field()
    education = scrapy.Field()
    # Filled from the 'dept' key of the job JSON by the spider; previously
    # undeclared, which made ``item['function'] = ...`` fail.
    function = scrapy.Field()
    # URL of the job posting.
    origin_site = scrapy.Field()
class LiepinspecialcomjobDownloaderMiddleware(object):
    """Pass-through downloader middleware from the Scrapy project template.

    Every hook leaves the request/response untouched; the class only logs
    when a spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Return None so the request continues through the chain."""
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Return None so other middlewares keep handling the exception."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class ProxyMiddleware(object):
    """Downloader middleware that sends requests through a random proxy.

    Proxies are read from a text file, one proxy URL per line; blank lines
    are ignored.  A response whose status is not 200 causes the request to
    be re-issued through another proxy, at most ``max_proxy_retries`` times.
    """

    # NOTE(review): machine-specific default; the raw string has the same
    # value as the original literal but no invalid escape sequences.
    proxy_file = r'G:\workspace\common\proxies.txt'
    # Seconds to wait before re-reading a proxy file that has no entries.
    poll_interval = 1
    # Upper bound on proxy-switch retries for a single request.
    max_proxy_retries = 3

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to the outgoing request."""
        proxy = self.get_random_proxy()
        print("this is request ip:" + proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        """Retry non-200 responses through a different proxy.

        Returns the response when its status is 200 or when the retry
        budget is used up; otherwise returns the request for rescheduling.
        """
        if response.status == 200:
            return response

        retries = request.meta.get('proxy_retry_times', 0)
        if retries >= self.max_proxy_retries:
            # Give up and let the spider see the failing response rather
            # than looping forever on a permanently failing URL.
            return response

        proxy = self.get_random_proxy()
        print("this is response ip:" + proxy)
        request.meta['proxy'] = proxy
        request.meta['proxy_retry_times'] = retries + 1
        # The request was already seen by the scheduler; without this flag
        # the duplicate filter would drop the retry.
        request.dont_filter = True
        return request

    def get_random_proxy(self, path=None):
        """Return a random non-empty line from the proxy file.

        :param path: file to read; defaults to ``self.proxy_file``.
        Blocks, polling every ``poll_interval`` seconds, while the file
        contains no usable entry.
        """
        path = self.proxy_file if path is None else path
        while True:
            with open(path, 'r') as f:
                proxies = [line.strip() for line in f if line.strip()]
            if proxies:
                return random.choice(proxies)
            time.sleep(self.poll_interval)
class LiepinspecialcomjobPipeline(object):
    """Synchronously insert scraped job items into the ``job_info`` table."""

    def __init__(self,
                 host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
                 user='cn_ainvest_db',
                 password='cn_ainvest_sd3a1',
                 database='special_data',
                 charset='utf8mb4'):
        """Open the MySQL connection and a cursor.

        The defaults reproduce the values that were hard-coded before, so
        Scrapy can still build the pipeline with no arguments.
        NOTE(review): credentials should not live in source control; move
        them to settings or the environment.
        """
        # Keyword arguments: recent PyMySQL releases no longer accept
        # positional connection parameters.  An explicit charset is needed
        # because the stored job data contains Chinese text.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset=charset)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item, commit, and return it.

        Returning the item lets any later pipeline stage receive it.  On a
        database error the transaction is rolled back and the error is
        re-raised.
        """
        insert_sql = 'insert into job_info(as_of_date,ticker,company_name,job_name,salary,city,education,work_year,pub_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        text_fields = ('ticker', 'company_name', 'job_name', 'salary', 'city',
                       'education', 'work_year', 'pub_time', 'origin_site')
        # as_of_date is passed through as-is; every other column is stored
        # as text.
        params = (item['as_of_date'],) + tuple(str(item[name]) for name in text_fields)
        try:
            self.cursor.execute(insert_sql, params)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
# used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'liepinSpecialComJob'

SPIDER_MODULES = ['liepinSpecialComJob.spiders']
NEWSPIDER_MODULE = 'liepinSpecialComJob.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpecialComJob (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# MySQL connection parameters.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'day0123'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123'

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}

# Pool of User-Agent strings; MyUserAgentMiddleware picks one at random for
# every request.
USER_AGENTS = [
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
    "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'liepinSpecialComJob.middlewares.LiepinspecialcomjobSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# The keys must name classes of THIS project: the previous 'liepinSpd2.*'
# paths belong to a sibling project.  The built-in middleware lives in
# 'scrapy.downloadermiddlewares' (plural), so the old key disabled nothing.
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpecialComJob.middlewares.LiepinspecialcomjobDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpecialComJob.middlewares.MyUserAgentMiddleware': 400,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None,
    # 'liepinSpecialComJob.middlewares.ProxyMiddleware': 750,
    # 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'liepinSpecialComJob.pipelines.LiepinspecialcomjobPipeline': 300,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
class LiepinSpdier(scrapy.Spider):
    """Crawl the job lists of selected companies on vip.liepin.com.

    Flow: ``parse`` reads the company name and id from a company page and
    looks up its stock ticker; ``parse_list`` requests the paginated
    ``sojob.json`` job list; ``parse_job`` turns every job in a JSON page
    into a ``LiepinspecialcomjobItem``.
    """

    name = 'liepin'
    start_urls = [
        'https://vip.liepin.com/883905/1405577359643.shtml',
        'https://vip.liepin.com/8161070/joblist.shtml',
        # 'http://maker.haier.net/custompage/socialchannel/index.html?platformcode=lp',
        'https://vip.liepin.com/7855333/joblist.shtml',
        'https://vip.liepin.com/8090130/1409730340536.shtml',
        'https://vip.liepin.com/8399212/joblist.shtml',
        'https://vip.liepin.com/1198424/joblist2.shtml',
        'https://vip.liepin.com/8787971/joblist.shtml',
        'https://vip.liepin.com/8796178/joblist2.shtml',
        'https://vip.liepin.com/8091337/1426475303042.shtml',
        'https://vip.liepin.com/7904788/job.shtml',
    ]

    # NOTE(review): machine-specific path; the raw string has the same value
    # as the original literal but no invalid escape sequences.
    ticker_csv = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Number of list pages (curPage 0 .. max_pages-1) requested per company.
    max_pages = 95

    list_headers = {
        'referer': 'https://www.liepin.com/ajaxproxy.html'
    }
    # NOTE(review): cookies captured from a browser session; presumably they
    # expire and need refreshing.
    list_cookies = {
        '__uuid': '1550017147980.22',
        '_uuid': 'E4361B46FFA8441973EC46E6488BD983',
        'is_lp_user': 'true',
        'need_bind_tel': 'false',
        'new_user': 'false',
        'c_flag': 'f57e19ed294147b87179e4e6132477f5',
        'imClientId': '45e417dd37f82ac674cdcbb355984626',
        'imId': '45e417dd37f82ac6a36687782a0c1c67',
        'imClientId_0': '45e417dd37f82ac674cdcbb355984626',
        'imId_0': '45e417dd37f82ac6a36687782a0c1c67',
        'gr_user_id': '374534ce-aa54-4880-88ca-7a7bb7adf340',
        'bad1b2d9162fab1f80dde1897f7a2972_gr_last_sent_cs1': '463d81f04fd219c61a667e00ad0d9493',
        'grwng_uid': 'f3fda8f8-0c2e-4f29-8507-f42f7a9671ec',
        'fe_work_exp_add': 'true',
        'ADHOC_MEMBERSHIP_CLIENT_ID1.0': 'fa804ff0-2a02-3f31-8dcb-8e13b527dfcb',
        'bad1b2d9162fab1f80dde1897f7a2972_gr_cs1': '463d81f04fd219c61a667e00ad0d9493',
        '__tlog': '1550383052778.97%7C00000000%7C00000000%7C00000000%7C00000000',
        '_mscid': '00000000',
        'Hm_lvt_a2647413544f5a04f00da7eee0d5e200': '1550233873,1550279247,1550281552,1550383053',
        'abtest': '0',
        '_fecdn_': '0',
        '__session_seq': '2',
        '__uv_seq': '2',
        'Hm_lpvt_a2647413544f5a04f00da7eee0d5e200': '1550383074'
    }

    def _match_ticker(self, company_name):
        """Return the stock code whose short name is contained in the name.

        A row matches when every character of its '股票简称' value occurs in
        ``company_name``; the last matching row wins.  Returns the string
        'None' when nothing matches or the CSV cannot be read.
        """
        ticker = 'None'
        try:
            data = pd.read_csv(self.ticker_csv, encoding='gbk')
            for short_name, code in zip(data['股票简称'], data['股票代码']):
                if all(ch in company_name for ch in short_name):
                    ticker = code
                    print(len(short_name), ticker, company_name)
        except Exception:
            ticker = 'None'
            print('ticker匹配错误')
        return ticker

    def parse(self, response):
        """Read company name/id from a company page and request its jobs."""
        text = response.text
        # NOTE(review): patterns reproduced as visible in the source; confirm
        # no HTML tag text was lost from them.
        name_match = re.search(r'(.*?) - 猎聘网招聘官网', text)
        id_match = re.search(r'CONFIG={"companyId":"([0-9]+)"}', text)
        if not (name_match and id_match):
            self.logger.warning('company name/id not found on %s', response.url)
            return
        company_name = name_match.group(1)

        next_meta = response.meta
        # Always set, so parse_list never hits a missing 'ticker' key.
        next_meta['ticker'] = self._match_ticker(company_name)
        next_meta['company_name'] = company_name
        next_meta['companyId'] = id_match.group(1)

        url = 'https://www.liepin.com/ajaxproxy.html'
        yield scrapy.Request(url, callback=self.parse_list, meta=next_meta,
                             dont_filter=True)

    def parse_list(self, response):
        """Request every page of the company's JSON job list."""
        next_meta = response.meta
        companyId = next_meta['companyId'].strip()
        print(next_meta['ticker'])
        print(next_meta['company_name'])
        for n in range(self.max_pages):
            # 13-digit timestamp sent as the cache-busting `_` parameter.
            t = get_13_time()
            url = f'https://www.liepin.com/company/sojob.json?pageSize=15&curPage={n}&ecompIds={companyId}&dq=&publishTime=&keywords=&_={t}'
            yield scrapy.Request(url, callback=self.parse_job, meta=next_meta,
                                 headers=dict(self.list_headers),
                                 cookies=dict(self.list_cookies))

    def parse_job(self, response):
        """Yield one item per job found in a JSON list page."""
        meta = response.meta
        print('****************************************')
        try:
            json_data = json.loads(response.text)
        except ValueError:
            self.logger.warning('non-JSON job list at %s', response.url)
            return
        # Pages past the last one may carry no 'list'.
        job_infos = (json_data.get('list') if isinstance(json_data, dict) else None) or []
        as_of_date = datetime.now()
        # The ticker comes from pandas and may not be a str.
        ticker = str(meta['ticker']).strip()
        company_name = meta['company_name'].strip()
        for job_info in job_infos:
            # A fresh item per job: a shared instance would be overwritten
            # by the next loop iteration after it was yielded.
            item = LiepinspecialcomjobItem()
            item['ticker'] = ticker
            item['company_name'] = company_name
            item['job_name'] = job_info['title']
            item['salary'] = job_info['salary']
            item['city'] = job_info['city']
            item['education'] = job_info['eduLevel']
            item['work_year'] = job_info['workYear']
            item['pub_time'] = job_info['time']
            item['as_of_date'] = as_of_date
            # Only set when the Item class declares the field; assigning an
            # undeclared field raises KeyError.
            if 'function' in item.fields:
                item['function'] = job_info['dept']
            item['origin_site'] = job_info['url']
            yield item
class Leipin(object):
    """Log in to passport.liepin.com with a username, password and captcha."""

    def __init__(self, username, password, authcode_path='E:\\python\\authcode.jpg'):
        """Store credentials and prepare the HTTP session.

        :param username: liepin account name.
        :param password: clear-text password (hashed with MD5 before sending).
        :param authcode_path: where the captcha image is saved for viewing.
        """
        self.username = username
        self.password = password
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': 'passport.liepin.com',
            'User-Agent': "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            'X-Requested-With': 'XMLHttpRequest',
            'Upgrade-Insecure-Requests': '1',
        }
        # Proxy used for testing.
        self.proxies = {
            'HTTP': 'http://120.198.231.88:80',
            # 'HTTP':'http://222.174.71.46:9999',
        }
        self.session = requests.session()
        self.accountUrl = 'https://passport.liepin.com/h/account'
        self.loginUrl = 'https://passport.liepin.com/h/login.json'
        # Captcha image path; adjust to a folder that exists locally.
        self.Dir = authcode_path

    def _md5(self):
        """Return the hex MD5 digest of this instance's password."""
        # Uses self.password: the old code read a module-level `password`,
        # which only exists when the file is run as a script.
        md5 = hashlib.md5()
        md5.update(str.encode(self.password))
        return md5.hexdigest()

    def _getAuthcode(self):
        """Download the captcha image and ask the user to type its text.

        :raises RuntimeError: when the captcha image cannot be downloaded.
        """
        r = self.session.get(self.accountUrl, headers=self.headers, timeout=10,
                             proxies=self.proxies)
        page = BeautifulSoup(r.text, 'html.parser')
        selector = page.find_all('div', class_="ui-tab-toggle hide")[0]
        imageUrl = selector.select('div > img')[0]['src']
        authcodeUrl = 'https://passport.liepin.com{}'.format(imageUrl)
        response = self.session.get(authcodeUrl)
        if response.status_code != 200:
            raise RuntimeError(
                'captcha download failed with HTTP %s' % response.status_code)
        with open(self.Dir, 'wb') as f:
            f.write(response.content)
        return input('plz input authcode:')

    def login(self):
        """Post the login form and return ``(status_code, response_text)``."""
        payload = {
            'user_login': self.username,
            'isMd5': 1,
            'user_pwd': self._md5(),
            # Depends on the account type; may need changing.
            'user_kind': 2,
            'verifycode': self._getAuthcode(),
            'url': '',
        }
        # Work on a copy so self.headers stays intact and login() can be
        # called more than once.
        headers = {key: value for key, value in self.headers.items()
                   if key != 'Upgrade-Insecure-Requests'}
        headers['Origin'] = 'https://passport.liepin.com'
        headers['Referer'] = 'https://passport.liepin.com/h/account'
        response = self.session.post(self.loginUrl, headers=headers, data=payload,
                                     timeout=10, proxies=self.proxies,
                                     allow_redirects=False)
        return response.status_code, response.text
class Spider(object):
    """Interactive downloader: search QQ Music and save a chosen song."""

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Referer': 'https://y.qq.com/portal/player.html'
        }

    def __get_songs(self, name):
        """Return the list of songs the search API finds for ``name``."""
        num = 10
        url = 'https://c.y.qq.com/soso/fcgi-bin/client_search_cp'
        # Passed as params so the search term is URL-encoded.
        params = {'p': 1, 'n': num, 'w': name, 'format': 'json'}
        response = requests.get(url, params=params, headers=self.headers)
        return response.json()['data']['song']['list']

    def __print_info(self, songs):
        """Print a numbered list of the songs that were found."""
        for index, it in enumerate(songs, start=1):
            NeedPay = '(收费) ' if it['pay']['payplay'] else ' '
            singers = ''.join(x['name'] + ' ' for x in it['singer'])
            print(index, '.', NeedPay, it['songname'], ' ', singers)

    def __get_Sign(self, data):
        """Compute the request signature with sign.js next to this file."""
        # Join with the directory of this file; appending '/../sign.js' to
        # the file path itself is not a valid path on every platform.
        js_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sign.js')
        with open(js_path, 'r', encoding='utf-8') as f:
            js_content = f.read()
        js_exec = execjs.compile(js_content)
        return js_exec.call('getSecuritySign', data)

    def __get_url(self, data):
        """Return the relative download URL (``purl``) for a vkey request."""
        sign = self.__get_Sign(data)
        # '&notice=0' restores a parameter whose '&not' prefix had been
        # turned into the '¬' character.
        url = 'https://u.y.qq.com/cgi-bin/musics.fcg?-=getplaysongvkey38596056557178904'\
              '&g_tk=1129808082'\
              '&sign={}'\
              '&loginUin=18585073516'\
              '&hostUin=0'\
              '&format=json'\
              '&inCharset=utf8'\
              '&outCharset=utf-8'\
              '&notice=0'\
              '&platform=yqq.json'\
              '&needNewCode=0&data='.format(sign) + data
        response = requests.get(url).json()
        return response['req_0']['data']['midurlinfo'][0]['purl']

    def __set_data(self, songmid):
        """Build the JSON request body asking for a song's vkey."""
        data = '{"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch","param":{"guid":"358840384","calltype":0,"userip":""}},'\
               '"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey","param":{"guid":"358840384","songmid":["'+songmid+'"],"songtype":[0],"uin":"18585073516","loginflag":1,"platform":"20"}},"comm":{"uin":18585073516,"format":"json","ct":24,"cv":0}}'
        return data

    def __download_mp3(self, url, filename):
        """Download ``url`` and save it as ``<filename>.m4a`` in the cwd."""
        abspath = os.path.abspath('.')
        response = requests.get(url, headers=self.headers).content
        path = os.path.join(abspath, filename)
        with open(path + '.m4a', 'wb') as f:
            f.write(response)
        print('下载完毕,保存至:%s.m4a' % path)

    @staticmethod
    def _parse_choice(raw, count):
        """Turn the user's 1-based answer into a 0-based index.

        Returns None when ``raw`` is not an integer or is outside
        ``1..count``.
        """
        try:
            choice = int(raw) - 1
        except (TypeError, ValueError):
            return None
        # The old test `choice >=0 & choice<len(songs)` never checked the
        # upper bound because `&` binds tighter than the comparisons.
        return choice if 0 <= choice < count else None

    def run(self):
        """Search / choose / download loop; the answer '0' ends it."""
        while True:
            name = input('搜索歌曲名称:')
            songs = self.__get_songs(name)
            self.__print_info(songs)
            choice = self._parse_choice(input('请输入左边对应数字选择歌曲:'), len(songs))
            if choice is None:
                print('输入错误')
            elif songs[choice]['pay']['payplay']:
                print('无法下载收费歌曲')
            else:
                songmid = songs[choice]['songmid']
                data = self.__set_data(songmid)
                url = 'https://isure.stream.qqmusic.qq.com/' + self.__get_url(data)
                self.__download_mp3(url, songs[choice]['songname'])
            flag = input('如需继续可以按任意键进行搜歌,否则按0结束程序')
            if flag == '0':
                break
        print('程序结束!')
11], 14, 643717713), f = a(f, h, c, r, n[o], 20, -373897302), r = a(r, f, h, c, n[o + 5], 5, -701558691), c = a(c, r, f, h, n[o + 10], 9, 38016083), h = a(h, c, r, f, n[o + 15], 14, -660478335), f = a(f, h, c, r, n[o + 4], 20, -405537848), r = a(r, f, h, c, n[o + 9], 5, 568446438), c = a(c, r, f, h, n[o + 14], 9, -1019803690), h = a(h, c, r, f, n[o + 3], 14, -187363961), f = a(f, h, c, r, n[o + 8], 20, 1163531501), r = a(r, f, h, c, n[o + 13], 5, -1444681467), c = a(c, r, f, h, n[o + 2], 9, -51403784), h = a(h, c, r, f, n[o + 7], 14, 1735328473), r = s(r, f = a(f, h, c, r, n[o + 12], 20, -1926607734), h, c, n[o + 5], 4, -378558), c = s(c, r, f, h, n[o + 8], 11, -2022574463), h = s(h, c, r, f, n[o + 11], 16, 1839030562), f = s(f, h, c, r, n[o + 14], 23, -35309556), r = s(r, f, h, c, n[o + 1], 4, -1530992060), c = s(c, r, f, h, n[o + 4], 11, 1272893353), h = s(h, c, r, f, n[o + 7], 16, -155497632), f = s(f, h, c, r, n[o + 10], 23, -1094730640), r = s(r, f, h, c, n[o + 13], 4, 681279174), c = s(c, r, f, h, n[o], 11, -358537222), h = s(h, c, r, f, n[o + 3], 16, -722521979), f = s(f, h, c, r, n[o + 6], 23, 76029189), r = s(r, f, h, c, n[o + 9], 4, -640364487), c = s(c, r, f, h, n[o + 12], 11, -421815835), h = s(h, c, r, f, n[o + 15], 16, 530742520), r = v(r, f = s(f, h, c, r, n[o + 2], 23, -995338651), h, c, n[o], 6, -198630844), c = v(c, r, f, h, n[o + 7], 10, 1126891415), h = v(h, c, r, f, n[o + 14], 15, -1416354905), f = v(f, h, c, r, n[o + 5], 21, -57434055), r = v(r, f, h, c, n[o + 12], 6, 1700485571), c = v(c, r, f, h, n[o + 3], 10, -1894986606), h = v(h, c, r, f, n[o + 10], 15, -1051523), f = v(f, h, c, r, n[o + 1], 21, -2054922799), r = v(r, f, h, c, n[o + 8], 6, 1873313359), c = v(c, r, f, h, n[o + 15], 10, -30611744), h = v(h, c, r, f, n[o + 6], 15, -1560198380), f = v(f, h, c, r, n[o + 13], 21, 1309151649), r = v(r, f, h, c, n[o + 4], 6, -145523070), c = v(c, r, f, h, n[o + 11], 10, -1120210379), h = v(h, c, r, f, n[o + 2], 15, 718787259), f = v(f, h, c, r, 
n[o + 9], 21, -343485551), r = l(r, e), f = l(f, u), h = l(h, p), c = l(c, i); return [r, f, h, c] }(function(n) { var t, o = []; for (o[(n.length >> 2) - 1] = void 0, t = 0; t < o.length; t += 1) o[t] = 0; for (t = 0; t < 8 * n.length; t += 8) o[t >> 5] |= (255 & n.charCodeAt(t / 8)) << t % 32; return o }(n), 8 * n.length)) } function o(n) { return t(unescape(encodeURIComponent(n))) } return function(n) { var t, o, e = "0123456789abcdef", u = ""; for (o = 0; o < n.length; o += 1) t = n.charCodeAt(o), u += e.charAt(t >>> 4 & 15) + e.charAt(15 & t); return u }(o(n)) } function r(f, h, c, l, g) { g = g || [[this], [{}]]; for (var t = [], o = null, n = [function() { return !0 } , function() {} , function() { g.length = c[h++] } , function() { g.push(c[h++]) } , function() { g.pop() } , function() { var n = c[h++] , t = g[g.length - 2 - n]; g[g.length - 2 - n] = g.pop(), g.push(t) } , function() { g.push(g[g.length - 1]) } , function() { g.push([g.pop(), g.pop()].reverse()) } , function() { g.push([l, g.pop()]) } , function() { g.push([g.pop()]) } , function() { var n = g.pop(); g.push(n[0][n[1]]) } , function() { g.push(g[g.pop()[0]][0]) } , function() { var n = g[g.length - 2]; n[0][n[1]] = g[g.length - 1] } , function() { g[g[g.length - 2][0]][0] = g[g.length - 1] } , function() { var n = g.pop() , t = g.pop(); g.push([t[0][t[1]], n]) } , function() { var n = g.pop(); g.push([g[g.pop()][0], n]) } , function() { var n = g.pop(); g.push(delete n[0][n[1]]) } , function() { var n = []; for (var t in g.pop()) n.push(t); g.push(n) } , function() { g[g.length - 1].length ? 
g.push(g[g.length - 1].shift(), !0) : g.push(void 0, !1) } , function() { var n = g[g.length - 2] , t = Object.getOwnPropertyDescriptor(n[0], n[1]) || { configurable: !0, enumerable: !0 }; t.get = g[g.length - 1], Object.defineProperty(n[0], n[1], t) } , function() { var n = g[g.length - 2] , t = Object.getOwnPropertyDescriptor(n[0], n[1]) || { configurable: !0, enumerable: !0 }; t.set = g[g.length - 1], Object.defineProperty(n[0], n[1], t) } , function() { h = c[h++] } , function() { var n = c[h++]; g[g.length - 1] && (h = n) } , function() { throw g[g.length - 1] } , function() { var n = c[h++] , t = n ? g.slice(-n) : []; g.length -= n, g.push(g.pop().apply(l, t)) } , function() { var n = c[h++] , t = n ? g.slice(-n) : []; g.length -= n; var o = g.pop(); g.push(o[0][o[1]].apply(o[0], t)) } , function() { var n = c[h++] , t = n ? g.slice(-n) : []; g.length -= n, t.unshift(null), g.push(new (Function.prototype.bind.apply(g.pop(), t))) } , function() { var n = c[h++] , t = n ? g.slice(-n) : []; g.length -= n, t.unshift(null); var o = g.pop(); g.push(new (Function.prototype.bind.apply(o[0][o[1]], t))) } , function() { g.push(!g.pop()) } , function() { g.push(~g.pop()) } , function() { g.push(typeof g.pop()) } , function() { g[g.length - 2] = g[g.length - 2] == g.pop() } , function() { g[g.length - 2] = g[g.length - 2] === g.pop() } , function() { g[g.length - 2] = g[g.length - 2] > g.pop() } , function() { g[g.length - 2] = g[g.length - 2] >= g.pop() } , function() { g[g.length - 2] = g[g.length - 2] << g.pop() } , function() { g[g.length - 2] = g[g.length - 2] >> g.pop() } , function() { g[g.length - 2] = g[g.length - 2] >>> g.pop() } , function() { g[g.length - 2] = g[g.length - 2] + g.pop() } , function() { g[g.length - 2] = g[g.length - 2] - g.pop() } , function() { g[g.length - 2] = g[g.length - 2] * g.pop() } , function() { g[g.length - 2] = g[g.length - 2] / g.pop() } , function() { g[g.length - 2] = g[g.length - 2] % g.pop() } , function() { g[g.length - 2] = 
g[g.length - 2] | g.pop() } , function() { g[g.length - 2] = g[g.length - 2] & g.pop() } , function() { g[g.length - 2] = g[g.length - 2] ^ g.pop() } , function() { g[g.length - 2] = g[g.length - 2]in g.pop() } , function() { g[g.length - 2] = g[g.length - 2]instanceof g.pop() } , function() { g[g[g.length - 1][0]] = void 0 === g[g[g.length - 1][0]] ? [] : g[g[g.length - 1][0]] } , function() { for (var e = c[h++], u = [], n = c[h++], t = c[h++], p = [], o = 0; o < n; o++) u[c[h++]] = g[c[h++]]; for (var i = 0; i < t; i++) p[i] = c[h++]; g.push(function n() { var t = u.slice(0); t[0] = [this], t[1] = [arguments], t[2] = [n]; for (var o = 0; o < p.length && o < arguments.length; o++) 0 < p[o] && (t[p[o]] = [arguments[o]]); return r(f, e, c, l, t) }) } , function() { t.push([c[h++], g.length, c[h++]]) } , function() { t.pop() } , function() { return !!o } , function() { o = null } , function() { g[g.length - 1] += String.fromCharCode(c[h++]) } , function() { g.push("") } , function() { g.push(void 0) } , function() { g.push(null) } , function() { g.push(!0) } , function() { g.push(!1) } , function() { g.length -= c[h++] } , function() { g[g.length - 1] = c[h++] } , function() { var n = g.pop() , t = g[g.length - 1]; t[0][t[1]] = g[n[0]][0] } , function() { var n = g.pop() , t = g[g.length - 1]; t[0][t[1]] = n[0][n[1]] } , function() { var n = g.pop() , t = g[g.length - 1]; g[t[0]][0] = g[n[0]][0] } , function() { var n = g.pop() , t = g[g.length - 1]; g[t[0]][0] = n[0][n[1]] } , function() { g[g.length - 2] = g[g.length - 2] < g.pop() } , function() { g[g.length - 2] = g[g.length - 2] <= g.pop() } ]; ; ) try { for (; !n[c[h++]](); ) ; if (o) throw o; return g.pop() } catch (n) { var e = t.pop(); if (void 0 === e) throw n; o = n, h = e[0], g.length = e[1], e[2] && (g[e[2]][0] = o) } } function getSecuritySign(data){ let str = 'abcdefghijklmnopqrstuvwxyz0123456789'; let count = Math.floor(Math.random() * 7 + 10); let sign = 'zza'; for(let i = 0; i < count ; i++){ sign 
class QiushiSpider():
    """Crawl qiushibaike "8hr" listing pages concurrently with a gevent pool.

    Page URLs are placed on a queue; pool workers take one URL each, print
    the titles found on that page, and re-schedule themselves until the
    queue has been fully processed.
    """

    def __init__(self, max_page):
        """Create a spider that fetches pages ``1 .. max_page`` (inclusive)."""
        self.max_page = max_page
        # Worker pool; at most 5 tasks run concurrently.
        self.pool = Pool(5)
        self.base_url = "http://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Queue holding the page URLs that still have to be fetched.
        self.url_queue = Queue()

    def get_url_list(self):
        """Put the URL of every requested page onto ``self.url_queue``."""
        # ``max_page`` is the number of pages the user asked for, so the
        # range must include it (the previous ``range(1, max_page)`` fetched
        # one page too few and nothing at all for ``max_page == 1``).
        for page in range(1, self.max_page + 1):
            self.url_queue.put(self.base_url.format(page))

    def exec_task(self):
        """Fetch one queued URL and print every title found on that page."""
        url = self.url_queue.get()
        try:
            response = requests.get(url, headers=self.headers)
            eroot = etree.HTML(response.text)
            titles = eroot.xpath('//a[@class="recmd-content"]/text()')
            for title in titles:
                item = {"title": title}
                # "Saving" an item currently just means printing it.
                print(item)
        except Exception as exc:
            # Task boundary: report and carry on so one bad page cannot
            # stop the remaining pages from being crawled.
            print("task failed for %s: %s" % (url, exc))
        finally:
            # Always acknowledge the URL, otherwise ``url_queue.join()`` in
            # ``run`` would wait forever after a single failed request.
            self.url_queue.task_done()

    def exec_task_finished(self, result):
        """Callback run after a task completes; schedules the next task.

        ``result`` is the return value of the finished task.
        """
        print("result:", result)
        print("执行任务完成")
        self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)

    def run(self):
        """Queue all page URLs, start the workers and wait until all pages are done."""
        self.get_url_list()
        # Start five self-rescheduling workers (see ``exec_task_finished``).
        for _ in range(5):
            self.pool.apply_async(self.exec_task, callback=self.exec_task_finished)
        self.url_queue.join()
def create_sheet(bozhu):
    """Create the ``weibo`` table if it does not exist yet.

    ``bozhu`` (the blogger name) is accepted for interface compatibility but
    is not used: all posts go into the single ``weibo`` table.
    """
    # ``IF NOT EXISTS`` replaces the previous bare ``except: pass`` which hid
    # every database error, not just "table already exists".
    weibo = '''
        CREATE TABLE IF NOT EXISTS weibo(
            ID VARCHAR (255) NOT NULL PRIMARY KEY,
            text VARCHAR (255),
            attitudes VARCHAR (255),
            comments VARCHAR (255),
            reposts VARCHAR (255)
        )
    '''
    cursor.execute(weibo)
    connection.commit()


def url_get():
    """Drive a PhantomJS browser to search m.weibo.cn for a blogger.

    Prompts for the blogger ID, submits the search, clicks the first result
    and switches to the window that the click opens.
    """
    browser = webdriver.PhantomJS()
    browser.get(url='https://m.weibo.cn/')
    wb_name = browser.find_element_by_class_name("W_input")
    wb_name.send_keys(input('输入博主ID:'))
    sleep(10)
    # A class-name locator accepts a single class only; several classes have
    # to be expressed as a CSS selector.
    search = browser.find_element_by_css_selector('.W_ficon.ficon_search.S_ficon')
    search.click()
    sleep(5)
    bz_num = browser.find_element_by_class_name('name_txt')
    bz_num.click()
    sleep(5)
    # The click opens a new window; continue in that one.
    handles = browser.window_handles
    browser.switch_to.window(handles[1])


def get_page(page, uid='2145291155'):
    """Fetch one page of a blogger's posts from the m.weibo.cn JSON API.

    :param page: 1-based page number.
    :param uid: blogger uid; the container id is ``'107603' + uid`` (this
        matches the previously hard-coded value/containerid pair).
    :return: the decoded JSON payload, or ``None`` when the request fails or
        the status code is not 200.
    """
    params = {
        'type': 'uid',
        'value': uid,
        'containerid': '107603' + uid,
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print('Error', e.args)
        return None
    if response.status_code == 200:
        return response.json()
    return None


def parse_page(json, page_number=None):
    """Yield ``[id, text, attitudes, comments, reposts]`` for each post.

    Every yielded row is also inserted into the ``weibo`` table; rows that
    the database rejects (e.g. duplicate primary key) are still yielded.

    :param json: payload returned by ``get_page`` (may be ``None``).
    :param page_number: page the payload belongs to.  When omitted, the
        module-level ``page`` variable set by the main loop is used, which
        is what this function has always relied on.
    """
    if not json:
        return
    if page_number is None:
        page_number = globals().get('page')
    cards = (json.get('data') or {}).get('cards') or []
    for index, card in enumerate(cards):
        # The second card of the first page is skipped, as before.
        if page_number == 1 and index == 1:
            continue
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an ``mblog`` entry carry no post; previously
            # they crashed the whole run with AttributeError.
            continue
        weibo = [
            mblog.get('id'),
            pq(mblog.get('text')).text(),
            mblog.get('attitudes_count'),
            mblog.get('comments_count'),
            mblog.get('reposts_count'),
        ]
        try:
            sql = '''INSERT INTO weibo (ID,text,attitudes,comments,reposts) VALUES (%s,%s,%s,%s,%s) '''
            cursor.execute(sql, weibo)
            connection.commit()
        except pymysql.MySQLError:
            # Best effort: duplicates (same primary key) and other database
            # rejections are ignored; non-database errors now propagate.
            pass
        yield weibo
browser.get(url='https://m.weibo.cn/') sleep(2) browser.delete_all_cookies() browser.add_cookie({'name': '_T_WM', 'value': '52b0151e9490855bae540e40f881320f', 'path': '/', 'expiry': 1530671807, 'secure': False, 'httpOnly': True}) browser.add_cookie({'name': 'SUB', 'value': '_2A252ENWfDeRhGeVH6FQX9SzKyD6IHXVV-vvXrDV6PUJbkdANLWHmkW1NT0iViQc71Gn2lMbDYa_ioQFHVnRmZaqg', 'path': '/', 'expiry': 1559615823, 'secure': False, 'httpOnly': True}) browser.add_cookie({'name': 'SUHB', 'value': '0LcSnbMGfYxAbx', 'path': '/', 'expiry': 1559615823, 'secure': False, 'httpOnly': False}) browser.add_cookie({'name': 'SCF', 'value': 'AlYBAKMIZxonUEvbmR6JgqyYWHL1yaDreGv8vXPl1FRCO7xm1IoOxBvQbDb-ZqzXTud9qRC3AnLVu7nFx_MvHdc.', 'path': '/', 'expiry': 1843439823, 'secure': False, 'httpOnly': True}) browser.add_cookie({'name': 'SSOLoginState', 'value': '1528079823', 'path': '/','expiry': None, 'secure': False, 'httpOnly': False}) browser.add_cookie({'name': 'MLOGIN', 'value': '1', 'path': '/', 'expiry': 1528083425, 'secure': False, 'httpOnly': False}) browser.add_cookie({'name': 'M_WEIBOCN_PARAMS', 'value': 'uicode%3D20000174%26featurecode%3D20000320%26fid%3Dhotword', 'path': '/', 'expiry': 1528080425, 'secure': False, 'httpOnly': True}) sleep(2) browser.get(url='https://m.weibo.cn/') sleep(40) print(browser.get_cookies()) # elem=browser.find_element_by_class_name('btn btnWhite') # elem.click() # username=browser.find_element_by_id('loginName') # username.send_keys('18401570769') # browser.refresh() # print(browser.page_source) # if browser.find_element_by_class_name() # wb_name = browser.find_element_by_class_name("W_input") # wb_name.send_keys(input('输入博主ID:')) # sleep(10) # search = browser.find_element_by_class_name('W_ficon ficon_search S_ficon') # search.click() # sleep(5) # bz_num = browser.find_element_by_class_name('name_txt') # bz_num.click() # sleep(5) # # 开启了一个新页面,需要跳转到新页面 # handles = browser.window_handles # browser.switch_to_window(handles[1]) 
class TaobaoSpider:
    """Log in to Taobao with a username and password through Chrome."""

    def __init__(self, username, password):
        """Start Chrome with anti-detection options and remember the credentials."""
        options = webdriver.ChromeOptions()
        # Do not load images, to speed up page loads.
        options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
        # Developer mode, to avoid being recognised as automation.
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.web_driver = webdriver.Chrome(options=options)
        self.web_driver_wait = WebDriverWait(self.web_driver, 10)
        self.url = 'https://login.taobao.com/member/login.jhtml'
        self.username = username
        self.password = password

    def login(self):
        """Fill in the login form, print the account name, then close the window.

        Any exception during the flow is printed, the window is closed and a
        failure message is shown.
        """

        def locate(by, value):
            # Wait until the element is present in the DOM and return it.
            return self.web_driver_wait.until(
                expected_conditions.presence_of_element_located((by, value)))

        self.web_driver.get(self.url)
        try:
            # Switch from QR-code login to account/password login.
            locate(By.XPATH, '//*[@id="J_QRCodeLogin"]/div[5]/a[1]').click()
            # Username field.
            locate(By.ID, 'TPL_username_1').send_keys(self.username)
            # Password field.
            locate(By.ID, 'TPL_password_1').send_keys(self.password)
            # Submit button.
            locate(By.XPATH, '//*[@id="J_SubmitStatic"]').click()
            # The account-name tag only shows up after a successful login.
            name_tag = locate(
                By.XPATH, '//*[@id="J_Col_Main"]/div/div[1]/div/div[1]/div[1]/div/div[1]/a/em')
            print(f"登陆成功:{name_tag.text}")
            # Pause for five seconds, then close the browser window.
            time.sleep(5)
            self.web_driver.close()
        except Exception as e:
            print(e)
            self.web_driver.close()
            print("登陆失败")
WebDriverWait(self.browser, 10) # 初始化用户名 self.username = username # 初始化密码 self.password = password def run(self): """登陆接口""" self.browser.get(self.url) try: # 这里设置等待:等待输入框 login_element = self.wait.until( EC.presence_of_element_located((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd'))) login_element.click() sina_login = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login'))) sina_login.click() weibo_user = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input'))) weibo_user.send_keys(self.username) sina_password = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input'))) sina_password.send_keys(self.password) submit = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span'))) submit.click() taobao_name = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick '))) # 登陆成功打印提示信息 print("登陆成功:%s" % taobao_name.text) except Exception: self.browser.close() print("登陆失败") if __name__ == "__main__": username = input("请输入你的微博用户名:") password = input("请输入密码:") spider = Taobao_Spider(username, password) spider.run() ================================================ FILE: tieba/tieba_spider.py ================================================ #!/usr/bin/python3 # -*- coding: utf-8 -*- """ info: author:CriseLYJ github:https://github.com/CriseLYJ/ update_time:2019-3-6 """ """ 请求URL分析 https://tieba.baidu.com/f?kw=魔兽世界&ie=utf-8&pn=50 请求方式分析 GET 请求参数分析 pn每页50发生变化,其他参数固定不变 请求头分析 只需要添加User-Agent """ # 代码实现流程 # 1. 实现面向对象构建爬虫对象 # 2. 
class TieBa_Spier():
    """Download listing pages of one Baidu Tieba forum as local HTML files."""

    def __init__(self, max_pn, kw):
        """Remember the forum keyword ``kw`` and the exclusive ``pn`` limit ``max_pn``."""
        self.max_pn = max_pn
        self.kw = kw
        self.base_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }

    def get_url_list(self):
        """Return the page URLs for ``pn = 0, 50, 100, ...`` below ``max_pn``."""
        urls = []
        offset = 0
        while offset < self.max_pn:
            urls.append(self.base_url.format(self.kw, offset))
            offset += 50
        return urls

    def get_content(self, url):
        """Request ``url`` and return the raw response body as bytes."""
        return requests.get(url=url, headers=self.headers).content

    def save_items(self, content, idx):
        """Write ``content`` (bytes) to ``<idx>.html``; always returns ``None``."""
        target = '{}.html'.format(idx)
        with open(target, 'wb') as out:
            out.write(content)
        return None

    def run(self):
        """Fetch every page in order and save it as ``1.html``, ``2.html``, ..."""
        for number, page_url in enumerate(self.get_url_list(), start=1):
            self.save_items(self.get_content(page_url), number)
# Extracts the image URL from the "image-cover" block of an image page.
_IMAGE_SRC_RE = re.compile(
    r'<div.*?class="image-cover".*?<img.*?src="(.*?)">.*?</div>', re.S)


def get_imageID(term, page):
    """Search tuchong for ``term`` and download every image on result page ``page``.

    :return: the result of ``parse_imgID`` on success, ``None`` when the
        request fails or the status code is not 200.
    """
    print('获取图片ID.....')
    try:
        # ``params`` URL-encodes the search term (spaces, ``&`` ...).
        req = requests.get(
            'https://stock.tuchong.com/api/free/search/',
            params={'term': term, 'page': page},
            headers=headers,
        )
    except RequestException:
        # requests' errors do not derive from the builtin ConnectionError,
        # so the previous ``except ConnectionError`` never matched them.
        return None
    if req.status_code == 200:
        return parse_imgID(req.json())
    return None


def parse_imgID(imageID):
    """Download every image listed in a search-result payload.

    :param imageID: decoded JSON of the search API (``data.hits[].imageId``).
    :return: ``True`` when the payload contained hits, otherwise ``None``.
    """
    print('解析imageID')
    data = (imageID or {}).get('data') or {}
    hits = data.get('hits')
    if not hits:
        return None
    print('存在ID,解析')
    for item in hits:
        get_ImageJPG(item.get('imageId'))
    return True


def get_ImageJPG(id):
    """Open the detail page of image ``id`` and download the image found there.

    Returns ``None`` in every case; a falsy ``id`` is ignored.
    """
    if not id:
        return None
    print('拼接url访问网页')
    try:
        req = requests.get(
            'https://stock.tuchong.com/free/image/?imageId=' + str(id),
            headers=headers,
        )
    except RequestException:
        return None
    if req.status_code == 200:
        return parse_imgURL(req.text)
    return None


def parse_imgURL(html):
    """Find the image URLs in ``html`` and download each one; returns ``None``."""
    if html:
        print('解析HTML图片URL...')
        for item in _IMAGE_SRC_RE.findall(html):
            print("准备下载...", item)
            download_image(item)
    return None


def download_image(url):
    """Download the protocol-relative ``url`` and hand the bytes to ``save_image``."""
    try:
        ir = requests.get('https:' + url, headers=headers)
    except RequestException:
        return None
    if ir.status_code == 200:
        save_image(ir.content)
    return None
open(file_path, 'wb') as f: f.write(content) f.close() print('下载成功----------------------') def main(): term = input('输入想要搜索的内容: ') for i in range(1, 7): get_imageID(term, i) if __name__ == '__main__': main() ================================================ FILE: webWeixin/webWeixin.py ================================================ import os import re import time import sys import subprocess import requests import xml.dom.minidom import json """ info: author:CriseLYJ github:https://github.com/CriseLYJ/ update_time:2019-3-6 """ session = requests.session() headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' } QRImgPath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'webWeixinQr.jpg' uuid = '' tip = 0 base_uri = '' redirect_uri = '' skey = '' wxsid = '' wxuin = '' pass_ticket = '' deviceId = 'e000000000000000' BaseRequest = {} ContactList = [] My = [] SyncKey = '' def getUUID(): global uuid, session url = 'https://login.weixin.qq.com/jslogin' params = { 'appid': 'wx782c26e4c19acffb', 'fun': 'new', 'lang': 'zh_CN', '_': int(time.time()), } response = session.get(url, params=params) data = response.content.decode('utf-8') # print(data) >>> window.QRLogin.code = 200; window.QRLogin.uuid = "oZwt_bFfRg=="; regx = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"' pm = re.search(regx, data) code = pm.group(1) uuid = pm.group(2) if code == '200': return True return False def showQRImage(): global tip url = 'https://login.weixin.qq.com/qrcode/' + uuid params = { 't': 'webwx', '_': int(time.time()), } response = session.get(url, params=params) tip = 1 with open(QRImgPath, 'wb') as f: f.write(response.content) f.close() if sys.platform.find('darwin') >= 0: subprocess.call(['open', QRImgPath]) elif sys.platform.find('linux') >= 0: subprocess.call(['xdg-open', QRImgPath]) else: os.startfile(QRImgPath) print('请使用微信扫描二维码以登录') def waitForLogin(): global tip, 
base_uri, redirect_uri url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % ( tip, uuid, int(time.time())) response = session.get(url) data = response.content.decode('utf-8') # print(data) # window.code=500; regx = r'window.code=(\d+);' pm = re.search(regx, data) code = pm.group(1) if code == '201': # 已扫描 print('成功扫描,请在手机上点击确认以登录') tip = 0 elif code == '200': # 已登录 print('正在登录...') regx = r'window.redirect_uri="(\S+?)";' pm = re.search(regx, data) redirect_uri = pm.group(1) + '&fun=new' base_uri = redirect_uri[:redirect_uri.rfind('/')] # closeQRImage if sys.platform.find('darwin') >= 0: # for OSX with Preview os.system("osascript -e 'quit app \"Preview\"'") elif code == '408': # 超时 pass # elif code == '400' or code == '500': return code def login(): global skey, wxsid, wxuin, pass_ticket, BaseRequest response = session.get(redirect_uri) data = response.content.decode('utf-8') doc = xml.dom.minidom.parseString(data) root = doc.documentElement for node in root.childNodes: if node.nodeName == 'skey': skey = node.childNodes[0].data elif node.nodeName == 'wxsid': wxsid = node.childNodes[0].data elif node.nodeName == 'wxuin': wxuin = node.childNodes[0].data elif node.nodeName == 'pass_ticket': pass_ticket = node.childNodes[0].data # print('skey: %s, wxsid: %s, wxuin: %s, pass_ticket: %s' % (skey, wxsid, # wxuin, pass_ticket)) if not all((skey, wxsid, wxuin, pass_ticket)): return False BaseRequest = { 'Uin': int(wxuin), 'Sid': wxsid, 'Skey': skey, 'DeviceID': deviceId, } return True def webwxinit(): url = base_uri + \ '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % ( pass_ticket, skey, int(time.time())) params = { 'BaseRequest': BaseRequest } h = headers h['ContentType'] = 'application/json; charset=UTF-8' response = session.post(url, data=json.dumps(params), headers=h) data = response.content.decode('utf-8') # print(data) global ContactList, My, SyncKey dic = json.loads(data) ContactList = dic['ContactList'] My = dic['User'] SyncKeyList = [] for 
def webwxgetcontact():
    """Download the contact list and return only ordinary friends.

    Entries are dropped when they are official/service accounts
    (``VerifyFlag`` has bit 8 set), built-in special accounts, group chats
    (``UserName`` contains ``@@``) or the logged-in user (``My``).

    Returns:
        A list of the remaining member dicts, in the order the server sent
        them.
    """
    url = base_uri + \
        '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
            pass_ticket, skey, int(time.time()))

    # Copy the shared headers so the per-request Content-Type does not leak
    # into every later request made with the module-level dict.
    request_headers = dict(headers)
    request_headers['Content-Type'] = 'application/json; charset=UTF-8'

    response = session.get(url, headers=request_headers)
    dic = json.loads(response.content.decode('utf-8'))

    special_users = frozenset((
        "newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage",
        "qmessage", "qqsync", "floatbottle", "lbsapp", "shakeapp",
        "medianote", "qqfriend", "readerapp", "blogapp", "facebookapp",
        "masssendapp", "meishiapp", "feedsapp", "voip", "blogappweixin",
        "weixin", "brandsessionholder", "weixinreminder",
        "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts",
        "notification_messages", "wxitil", "userexperience_alarm",
    ))
    my_name = My['UserName']

    # Build a filtered list in one pass instead of deleting from the list
    # while walking it backwards.
    return [
        member for member in dic['MemberList']
        if member['VerifyFlag'] & 8 == 0            # official / service account
        and member['UserName'] not in special_users  # built-in special account
        and '@@' not in member['UserName']           # group chat
        and member['UserName'] != my_name            # the logged-in user
    ]


def main():
    """Run the QR-code login flow and print the friend list."""
    if not getUUID():
        print('获取uuid失败')
        return

    showQRImage()
    time.sleep(1)

    # Poll until the server reports code 200 (logged in).
    while waitForLogin() != '200':
        pass

    os.remove(QRImgPath)

    if not login():
        print('登录失败')
        return

    # Logged in; from here on query the friend list.
    if not webwxinit():
        print('初始化失败')
        return

    MemberList = webwxgetcontact()
    print('通讯录共%s位好友' % len(MemberList))

    # Sex 0 -> unknown, 1 -> male, anything else -> female.
    sex_names = {0: '未知', 1: '男'}
    for x in MemberList:
        sex = sex_names.get(x['Sex'], '女')
        print('昵称:%s, 性别:%s, 备注:%s, 签名:%s' %
              (x['NickName'], sex, x['RemarkName'], x['Signature']))


if __name__ == '__main__':
    print('开始')
    main()
# -*- coding: utf-8 -*-
# @Author: Kris
# @Mail: criselyj@163.com
# @Date: 2020-08-14 17:40:11
"""Xiami Music login driven through a pyppeteer-controlled browser."""
import asyncio
import os
import random

base_url = 'https://passport.xiami.com/'
current_dir = os.path.dirname(os.path.realpath(__file__))

# Fix: https://github.com/miyakogi/pyppeteer/issues/183 (file permission issue).
cache_dir = os.path.join(current_dir, 'cache')
# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(cache_dir, exist_ok=True)


class XMLogin(object):
    """Fill in and submit the Xiami passport login form in a browser page."""

    url = base_url

    def __init__(self, account, password):
        self.account = account
        self.password = password
        self.browser = None
        self.page = None

    async def send_key(self):
        """Switch to the account/password form and type the credentials."""
        await self.page.click('.login-switch')
        # Per-keystroke delay in the 50-150 range for each field.
        await self.page.type('#account', self.account,
                             {'delay': random.randint(50, 150)})
        await self.page.type('#password', self.password,
                             {'delay': random.randint(50, 150)})

    async def slide(self):
        """Drag the slider captcha to the right.

        Raises:
            SystemExit: with status 1 when any step of the drag fails.
        """
        try:
            await self.page.hover('#captcha')
            await self.page.mouse.down()
            # NOTE(review): pyppeteer documents only a 'steps' option for
            # Mouse.move; 'delay' may be ignored here - confirm.
            await self.page.mouse.move(
                2000, 0, {'delay': random.randint(2000, 4000)})
            await self.page.mouse.up()
        except Exception as e:
            print('error', e)
            # Non-zero status: this is a failure, not a clean exit.
            raise SystemExit(1)

    async def validate(self):
        """Return the text of the ``div#error`` element, or None.

        None is returned when the element is absent or when reading it
        raises; the caller treats a falsy result as a successful login.
        """
        try:
            error_element = await self.page.xpath('//div[@id="error"]')
            if not error_element:
                return None
            text_handle = await error_element[0].getProperty('textContent')
            return await text_handle.jsonValue()
        except Exception:
            # Best effort, as in the original flow: an unreadable error box
            # is reported as "no error message".
            return None

    async def crawl(self):
        """Launch a browser, perform the login and report the outcome.

        The browser is always closed before this coroutine finishes.

        Raises:
            SystemExit: with status 1 when the page shows an error message
                or the slider drag fails.
        """
        # Imported here so the module can be imported without pyppeteer.
        from pyppeteer import launch

        # headless is False for testing; switch to a headless browser in
        # production.
        self.browser = await launch({
            'headless': False,
            'userDataDir': cache_dir,
            'defaultViewport': {'width': 1440, 'height': 1000},
            'args': ['--no-sandbox']
        })
        try:
            self.page = await self.browser.newPage()
            await self.page.goto(self.url)

            # Spoof browser state so automation-tool detection is less likely.
            codes = (
                "() =>{ Object.defineProperties(navigator,{ webdriver:"
                "{ get: () => false } }) }",
                "() =>{ window.navigator.chrome = { runtime: {}, }; }",
                "() =>{ Object.defineProperty(navigator, 'languages', "
                "{ get: () => ['en-US', 'en'] }); }",
                "() =>{ Object.defineProperty(navigator, 'plugins', { "
                "get: () => [1, 2, 3, 4, 5,6], }); }"
            )
            for code in codes:
                await self.page.evaluate(code)

            await self.send_key()
            await asyncio.sleep(random.randint(2, 3))
            await self.slide()

            # Submit the login form.
            await asyncio.sleep(random.randint(2, 3))
            await self.page.click('#submit')

            msg = await self.validate()
            if msg:
                print('[*] 错误信息:', msg)
                raise SystemExit(1)

            print('[*] 登录成功')
            await asyncio.sleep(5)
        finally:
            # Release the browser process on every exit path.
            await self.browser.close()


def main():
    """Prompt for credentials and run the login flow to completion."""
    print('[*] 模拟登陆虾米音乐程序启动...')
    account = input('[*] 请输入账号:')
    password = input('[*] 请输入密码:')
    login = XMLogin(account, password)

    # Create the loop explicitly: asyncio.get_event_loop() without a current
    # loop is deprecated and raises on the newest Python releases.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(login.crawl())


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Simulated login for zhaopingou (qiye.zhaopingou.com).

info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-04-06
"""


class ZhaoPinGouLogin(object):
    """Post account credentials to the login endpoint and return its cookies."""

    def __init__(self, account, password, session=None):
        """Prepare the request data.

        Args:
            account: login name sent as ``userName``.
            password: password sent as ``password``.
            session: optional object with a ``requests.Session``-compatible
                ``post`` method; a new ``requests.Session`` is created when
                omitted.
        """
        self.url = "https://qiye.zhaopingou.com/zhaopingou_interface/security_login?timestamp=1554552162122"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
            # The standard header name is "Referer"; "Refer" is not a
            # recognised HTTP header.
            "Referer": "https://qiye.zhaopingou.com/signin?callback=https%3A%2F%2Fqiye.zhaopingou.com%2Fresume",
            "Host": "qiye.zhaopingou.com"
        }
        self.data = {
            'userName': account,
            'password': password,
            'code': '',
            'clientNo': '',
            'userToken': '',
            'clientType': '2'
        }
        if session is None:
            # Imported lazily so a caller supplying its own session does not
            # need the requests package installed.
            import requests
            session = requests.Session()
        self.session = session

    def get_coolie(self):
        """Log in and return the cookies set by the login response.

        Returns:
            The response's cookies when the JSON body has ``errorCode == 1``,
            otherwise None.
        """
        resp = self.session.post(
            url=self.url,
            headers=self.headers,
            data=self.data
        )
        resp_dict = resp.json()
        # .get() so a payload without "errorCode" counts as a failed login
        # instead of raising KeyError.
        if resp_dict.get("errorCode") != 1:
            print("登陆失败")
            return None

        print("登陆成功")
        # Cookies issued by the successful login response.
        cookies = resp.cookies
        print(cookies)
        return cookies

    # Correctly spelled alias; the original name is kept for existing callers.
    get_cookie = get_coolie

    def run(self):
        """Perform the login and return whatever ``get_coolie`` returns."""
        return self.get_coolie()


if __name__ == '__main__':
    account = input("请输入你的账号:")
    password = input("请输入你的密码:")
    spider = ZhaoPinGouLogin(account, password)
    spider.run()