Showing preview only (308K chars total). Download the full file or copy to clipboard to get everything.
Repository: Kr1s77/awesome-python-login-model
Branch: master
Commit: b458a09bf554
Files: 99
Total size: 14.4 MB
Directory structure:
gitextract_aykf45t2/
├── .gitattributes
├── .gitignore
├── 126email/
│ └── 126email.py
├── 163email/
│ └── 163email.py
├── 163youdao/
│ └── 163youdao.py
├── Github/
│ └── login.py
├── LICENSE
├── NeteaseCloudMusicDownload/
│ └── api.py
├── README-Test.md
├── README-en-us.md
├── README.md
├── baidu/
│ ├── baidu.py
│ ├── requirements.txt
│ └── util.py
├── baidu_translate/
│ ├── Baidufanyi.py
│ └── translate.js
├── bilibili/
│ └── bilibili.py
├── csdn/
│ ├── README
│ └── selenium_csdn.py
├── douban/
│ ├── douban.py
│ └── douban_spider.py
├── facebook/
│ └── facebook.py
├── guoke/
│ ├── guoke.py
│ └── guoke_spider.py
├── jd_login/
│ ├── Method_First/
│ │ ├── Try_selenium.py
│ │ ├── ban.txt
│ │ ├── choice.txt
│ │ └── config.py
│ ├── Method_Second/
│ │ ├── Config.py
│ │ ├── Truekeyword.txt
│ │ └── main.py
│ ├── README.md
│ └── login_by_selenium.py
├── lagou/
│ └── Lagou.py
├── liepin/
│ ├── README.md
│ ├── liepinSpd/
│ │ ├── liepinSpd/
│ │ │ ├── __init__.py
│ │ │ ├── dbhelper.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspider.py
│ │ ├── run_liepin1.py
│ │ └── scrapy.cfg
│ ├── liepinSpd2/
│ │ ├── liepinSpd2/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── liepinJob.py
│ │ ├── run_liepin2.py
│ │ └── scrapy.cfg
│ ├── liepinSpd_500/
│ │ ├── liepinSpd/
│ │ │ ├── __init__.py
│ │ │ ├── dbhelper.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspider.py
│ │ ├── run_liepin1.py
│ │ └── scrapy.cfg
│ ├── liepinSpecialCom/
│ │ ├── liepinSpecialCom/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspecialcom.py
│ │ ├── run_liepinspecialcom.py
│ │ └── scrapy.cfg
│ ├── liepinSpecialComJob/
│ │ ├── liepinSpecialComJob/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspecialcomjob.py
│ │ ├── run_liepinspecialjob.py
│ │ └── scrapy.cfg
│ └── liepin_login.py
├── qqmusic/
│ ├── qqmusic_spider.py
│ └── sign.js
├── qqzone/
│ └── qq_zone.py
├── qsbk/
│ └── qiushibaike.py
├── sina/
│ ├── sina.py
│ └── spider/
│ ├── Ajax_weibo.py
│ └── selenium_test.py
├── taobao/
│ ├── mac_chromedriver/
│ │ └── chromedriver
│ ├── taobao_via_username_password.py
│ └── taobao_via_weibo.py
├── tieba/
│ └── tieba_spider.py
├── tuchong/
│ └── tuchong.py
├── webWeixin/
│ └── webWeixin.py
├── xiamiMusic/
│ ├── README
│ └── api.py
└── zhaopingou/
└── zhaopingou_login.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
*.js linguist-language=python
*.css linguist-language=python
*.html linguist-language=python
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# Mac os
.DS_Store
*/.DS_Store
# pycharm
.idea
.env
================================================
FILE: 126email/126email.py
================================================
import time
from selenium import webdriver
from getpass import getpass
def login():
    """Log in to mail.126.com through a Selenium-driven Chrome window.

    Prompts on the console for the account and (hidden) password, fills in
    the login form and submits it.

    Returns:
        dict: the first cookie reported by the driver after logging in.

    Raises:
        IndexError: if the driver reports no cookies at all.
    """
    # Imported locally so this function does not depend on the file header.
    # ``find_element(By..., ...)`` replaces the ``find_element_by_*`` helpers,
    # which were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    acount_num = input('请输入账号:')
    passwd_str = getpass('请输入密码:')

    driver = webdriver.Chrome()
    url = 'http://mail.126.com/'
    driver.get(url)
    # Fixed pause before touching the page — presumably to let the page and
    # its login iframe finish loading.
    time.sleep(30)

    # The 126 login form is nested in an iframe, so switch into it first.
    elem = driver.find_element(By.CSS_SELECTOR, "iframe[id^='x-URS-iframe']")
    driver.switch_to.frame(elem)

    acount = driver.find_element(By.NAME, 'email')
    acount.clear()
    acount.send_keys(acount_num)

    passwd = driver.find_element(By.NAME, 'password')
    passwd.clear()
    passwd.send_keys(passwd_str)

    time.sleep(3)
    click_button = driver.find_element(By.ID, 'dologin')
    click_button.click()
    time.sleep(5)

    # Only the first cookie is returned (unchanged from the original contract).
    cur_cookies = driver.get_cookies()[0]
    return cur_cookies
if __name__ == '__main__':
    # Script entry point: run the interactive 126 mail login.
    login()
================================================
FILE: 163email/163email.py
================================================
import time
from getpass import getpass
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def login():
    """Log in to mail.163.com through a Selenium-driven Chrome window.

    Prompts on the console for the account and (hidden) password, switches the
    page to password login, fills in the form inside the login iframe and
    submits it.

    Returns:
        list: all cookies reported by the driver after logging in.
    """
    acount_num = input('请输入账号:')
    passwd_str = getpass('请输入密码:')

    driver = webdriver.Chrome()
    url = 'http://mail.163.com/'
    driver.get(url)

    # Wait until the button that switches to password login is clickable,
    # then click the element the wait returned instead of locating it again.
    wait = WebDriverWait(driver, 10)
    wait.until(EC.element_to_be_clickable((By.ID, 'lbNormal'))).click()

    # The 163 login form is nested in an iframe whose id starts with
    # "x-URS-iframe"; wait for it to be available and switch into it.
    wait.until(EC.frame_to_be_available_and_switch_to_it(
        (By.CSS_SELECTOR, "iframe[id^='x-URS-iframe']")))

    # ``find_element(By..., ...)`` replaces the ``find_element_by_*`` helpers,
    # which were removed in Selenium 4.3.
    account_el = driver.find_element(By.XPATH, '//input[@name="email"]')
    account_el.clear()
    account_el.send_keys(acount_num)

    password_el = driver.find_element(By.XPATH, '//input[@name="password"]')
    password_el.clear()
    password_el.send_keys(passwd_str)

    login_el = driver.find_element(By.XPATH, '//a[@id="dologin"]')
    login_el.click()

    # Fixed pause after submitting before the cookies are read.
    time.sleep(10)
    cur_cookies = driver.get_cookies()
    return cur_cookies
if __name__ == '__main__':
    # Script entry point: run the interactive 163 mail login.
    login()
================================================
FILE: 163youdao/163youdao.py
================================================
import time
from selenium import webdriver
# Login page that login() opens.
login_url = "http://account.youdao.com/login?service=dict"

# XPath locators for the form controls that login() fills in and clicks.
xpaths = dict(
    usernameTxtBox=".//*[@id='username']",
    passwordTxtBox=".//*[@id='password']",
    submitButton=".//*[@id='login']/div[2]/div/div[1]/form/p[4]/nobr/input",
)
def login():
    """Log in to the Youdao account page through a Selenium-driven Firefox.

    Opens ``login_url``, prompts on the console for the user name and the
    password, types them into the form located by ``xpaths`` and clicks the
    submit button. Returns nothing.
    """
    # Local imports keep this function independent of the file header.
    # getpass stops the password from being echoed to the terminal, and
    # ``find_element(By..., ...)`` replaces the ``find_element_by_xpath``
    # helper, which was removed in Selenium 4.3.
    from getpass import getpass
    from selenium.webdriver.common.by import By

    mydriver = webdriver.Firefox()
    mydriver.get(login_url)
    mydriver.maximize_window()

    # Clear Username TextBox if already allowed "Remember Me"
    mydriver.find_element(By.XPATH, xpaths['usernameTxtBox']).clear()
    username = input('Please type your user name:\n')
    # Write Username in Username TextBox
    mydriver.find_element(By.XPATH, xpaths['usernameTxtBox']).send_keys(username)

    # Clear Password TextBox if already allowed "Remember Me"
    mydriver.find_element(By.XPATH, xpaths['passwordTxtBox']).clear()
    password = getpass('Please type your password:\n')
    # Write Password in password TextBox
    mydriver.find_element(By.XPATH, xpaths['passwordTxtBox']).send_keys(password)

    # Click Login button
    mydriver.find_element(By.XPATH, xpaths['submitButton']).click()

    # NOTE: this message is printed unconditionally; the result of the login
    # is not checked.
    print('登录成功')
    time.sleep(5)
if __name__ == '__main__':
    # Script entry point: run the interactive Youdao login.
    login()
================================================
FILE: Github/login.py
================================================
# -*- coding: utf-8 -*-
# @Author: CriseLYJ
# @Date: 2020-08-14 12:13:11
import re
import requests
from getpass import getpass
class GithubLogin(object):
    """Log in to GitHub by posting the login form through a requests session."""

    def __init__(self, email, password):
        """Store the credentials and prepare the session, headers and URLs.

        Args:
            email: account name or e-mail sent as the ``login`` form field.
            password: account password sent as the ``password`` form field.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
            'Referer': 'https://github.com/',
            'Host': 'github.com'
        }
        self.session = requests.Session()
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        self.email = email
        self.password = password

    def login_GitHub(self):
        """Post the login form and report the logged-in user name.

        Returns:
            The user name extracted from the response page, or ``None`` when
            the token could not be fetched, the response status is not 200,
            or no user name is present in the response.
        """
        token = self.get_token()
        if token is None:
            # get_token() has already printed the reason; do not post a
            # form with a missing token.
            print('Login Fail')
            return None

        post_data = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': token,
            'login': self.email,
            'password': self.password
        }
        resp = self.session.post(
            self.post_url, data=post_data, headers=self.headers)

        print('StatusCode:', resp.status_code)
        if resp.status_code != 200:
            print('Login Fail')
            return None

        match = re.search(r'"user-login" content="(.*?)"', resp.text)
        # No match used to raise AttributeError on ``match.group``. An empty
        # value is also treated as a failure — presumably that is what the
        # page carries when nobody is signed in; verify against a live page.
        if not match or not match.group(1):
            print('Login Fail')
            return None

        user_name = match.group(1)
        print('UserName:', user_name)
        return user_name

    # Get login token
    def get_token(self):
        """Fetch the login page and extract its ``authenticity_token``.

        Returns:
            The token string, or ``None`` if the page could not be fetched
            with status 200 or contains no token field.
        """
        response = self.session.get(self.login_url, headers=self.headers)
        if response.status_code != 200:
            print('Get token fail')
            return None

        match = re.search(
            r'name="authenticity_token" value="(.*?)"', response.text)
        if not match:
            print('Get Token Fail')
            return None
        return match.group(1)
if __name__ == '__main__':
    # Ask for the account first, then the password (not echoed), and log in.
    login = GithubLogin(input('Account:'), getpass('Password:'))
    login.login_GitHub()
================================================
FILE: LICENSE
================================================
The MIT License
Copyright (c) 2018 CriseLYJ.
https://github.com/CriseLYJ/awesome-python-login-model
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: NeteaseCloudMusicDownload/api.py
================================================
# -*- coding: utf-8 -*-
# @Author: CriseLYJ
# @Date: 2020-08-14 13:48:23
import requests
import math
import random
from Crypto.Cipher import AES
import base64
import codecs
import os
class decrypt_music(object):
    """Build the encrypted ``params`` / ``encSecKey`` form fields for a payload.

    Despite its name the class only encrypts: the payload is AES-CBC
    encrypted twice (first with the fixed key ``g``, then with a random
    16-character key) and the random key is RSA-encrypted with ``e``/``f``.
    """

    def __init__(self, d):
        """Store the payload and generate the per-request random key.

        Args:
            d: the plaintext payload string to encrypt.
        """
        self.d = d
        # RSA public exponent (hex).
        self.e = '010001'
        # RSA modulus (hex).
        self.f = (
            "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5a"
            "a76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46be"
            "e255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
        )
        # Fixed AES key used for the first encryption pass.
        self.g = '0CoJUm6Qyw8W8jud'
        self.random_text = self.get_random_str()

    def get_random_str(self):
        """Return a random 16-character string of ASCII letters and digits."""
        # Named ``alphabet`` so the builtin ``str`` is not shadowed.
        alphabet = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
        return ''.join(random.choice(alphabet) for _ in range(16))

    def aes_encrypt(self, text, key):
        """AES-CBC encrypt ``text`` with ``key`` and return base64 bytes.

        Args:
            text: plaintext string; it is padded to a multiple of 16 bytes
                with ``pad`` copies of ``chr(pad)``.
            key: AES key as a string.

        Returns:
            bytes: base64 encoding of the ciphertext.
        """
        iv = b'0102030405060708'
        pad = 16 - len(text.encode()) % 16
        text = text + pad * chr(pad)
        # fix: https://github.com/Kr1s77/awesome-python-login-model/issues/100#issuecomment-673897848
        # error: TypeError: Object type <class 'str'> cannot be passed to C code
        encryptor = AES.new(key.encode(), AES.MODE_CBC, iv)
        msg = base64.b64encode(encryptor.encrypt(text.encode()))
        return msg

    def rsa_encrypt(self, value, text, modulus):
        """Textbook-RSA encrypt the reversed ``text``.

        Args:
            value: public exponent as a hex string.
            text: string to encrypt; it is reversed before being encoded.
            modulus: modulus as a hex string.

        Returns:
            str: lowercase hex result, left-padded with zeros to 256 chars.
        """
        text = text[::-1]
        base = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)
        # Three-argument pow() does modular exponentiation directly instead
        # of materialising the enormous ``base ** exponent`` first.
        rs = pow(base, int(value, 16), int(modulus, 16))
        return format(rs, 'x').zfill(256)

    def get_data(self):
        """Return the form fields for the stored payload.

        Returns:
            dict: ``params`` (base64 bytes of the doubly AES-encrypted
            payload) and ``encSecKey`` (hex RSA encryption of the random key).
        """
        params = self.aes_encrypt(self.d, self.g)
        params = self.aes_encrypt(params.decode('utf-8'), self.random_text)
        enc_sec_key = self.rsa_encrypt(self.e, self.random_text, self.f)
        return {
            'params': params,
            'encSecKey': enc_sec_key
        }
class Spider(object):
    """Interactive console client that searches music.163.com and saves MP3s."""

    def __init__(self):
        """Prepare the request headers sent with every HTTP call."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Cookie': '_iuqxldmzr_=32; _ntes_nnid=8d4ef0883a3bcc9d3a2889b0bf36766a,1533782432391; _ntes_nuid=8d4ef0883a3bcc9d3a2889b0bf36766a; __utmc=94650624; WM_TID=GzmBlbRkRGQXeQiYuDVCfoEatU6VSsKC; playerid=19729878; __utma=94650624.1180067615.1533782433.1533816989.1533822858.9; __utmz=94650624.1533822858.9.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; WM_NI=S5gViyNVs14K%2BZoVerGK69gLlmtnH5NqzyHcCUY%2BiWm2ZaHATeI1gfsEnK%2BQ1jyP%2FROzbzDV0AyJHR4YQfBetXSRipyrYCFn%2BNdA%2FA8Mv80riS3cuMVJi%2BAFgCpXTiHBNHE%3D; WM_NIKE=9ca17ae2e6ffcda170e2e6ee84b674afedfbd3cd7d98b8e1d0f554f888a4abc76990b184badc4f89e7af8ece2af0fea7c3b92a91eba9b7ec738e8abdd2b741e986a1b7e87a8595fadae648b0b3bc8fcb3f8eafb69acb69818b97ccec5dafee9682cb4b98bb87d2e66eb19ba2acaa5bf3b6b7b1ae5a8da6ae9bc75ef49fb7abcb5af8879f87c16fb8889db3ec7cbbae97a4c566e992aca2ae4bfc93bad9b37aab8dfd84f8479696a7ccc44ea59dc0b9d7638c9e82a9c837e2a3; JSESSIONID-WYYY=sHwCKYJYxz6ODfURChA471BMF%5CSVf3%5CTc8Qcy9h9Whj6CfMxw4YWTMV7CIx5g6rqW8OBv04YGHwwq%2B%5CD1N61qknTP%2Fym%2BHJZ1ylSH1EabbQASc9ywIT8YvOr%2FpMgvmm1cbr2%2Bd6ssMYXuTlpOIrKqp%5C%2FM611EhmfAfU47%5CSQWAs%2BYzgY%3A1533828139236'
        }

    @staticmethod
    def _search_payload(name):
        """Return the JSON search payload for keyword ``name``.

        The payload is serialised with ``json.dumps`` so quotes or
        backslashes in the keyword are escaped instead of corrupting the JSON.
        For plain keywords the output is identical to the old hand-written
        template.
        """
        import json  # local import: the file header does not import json

        fields = {
            'hlpretag': '<span class="s-fc7">',
            'hlposttag': '</span>',
            's': name,
            'type': '1',
            'offset': '0',
            'total': 'true',
            'limit': '30',
            'csrf_token': '',
        }
        return json.dumps(fields, separators=(',', ':'), ensure_ascii=False)

    def __get_songs(self, name):
        """Search for ``name`` and return the ``result`` part of the reply."""
        d = self._search_payload(name)
        wyy = decrypt_music(d)
        data = wyy.get_data()
        url = 'https://music.163.com/weapi/cloudsearch/get/web?csrf_token='
        response = requests.post(url, data=data, headers=self.headers).json()
        return response['result']

    def __get_mp3(self, id):
        """Return the ``url`` field of the first entry for song ``id``."""
        d = '{"ids":"[%s]","br":320000,"csrf_token":""}' % id
        wyy = decrypt_music(d)
        data = wyy.get_data()
        url = 'https://music.163.com/weapi/song/enhance/player/url?csrf_token='
        response = requests.post(url, data=data, headers=self.headers).json()
        print(response)
        return response['data'][0]['url']

    def __download_mp3(self, url, filename):
        """Download ``url`` and save it as ``<filename>.mp3`` in the cwd.

        Path separators in ``filename`` are replaced with ``_`` so a title
        such as "A/B" cannot point into a non-existent directory.
        """
        safe_name = filename.replace('/', '_').replace('\\', '_')
        path = os.path.join(os.path.abspath('.'), safe_name) + '.mp3'
        response = requests.get(url, headers=self.headers).content
        with open(path, 'wb') as f:
            f.write(response)
        # The extension is part of the path being reported; previously
        # ``'...' % path + '.mp3'`` appended it after the whole sentence.
        print('下载完毕,可以在%s 路径下查看' % path)

    def __print_info(self, songs):
        """Print a numbered list of ``songs`` and return ``(name, id)`` pairs.

        Args:
            songs: iterable of dicts with ``name``, ``id`` and a non-empty
                ``ar`` list whose first entry has a ``name``.

        Returns:
            list: one ``(name, id)`` tuple per song, in the printed order.
        """
        songs_list = []
        for num, song in enumerate(songs):
            print(num, '歌曲名字:', song['name'], '作者:', song['ar'][0]['name'])
            songs_list.append((song['name'], song['id']))
        return songs_list

    def run(self):
        """Run the search / choose / download loop until the user enters 0."""
        while True:
            name = input('请输入你需要下载的歌曲:')
            songs = self.__get_songs(name)
            # A missing or zero count both mean there is nothing to list.
            if not songs.get('songCount'):
                print('没有搜到此歌曲,请换个关键字')
            else:
                songs = self.__print_info(songs['songs'])
                num = input('请输入需要下载的歌曲,输入左边对应数字即可')
                # Reject non-numeric and out-of-range choices instead of
                # crashing with ValueError / IndexError.
                try:
                    index = int(num)
                except ValueError:
                    index = -1
                if not 0 <= index < len(songs):
                    print('输入无效,请输入左边对应的数字')
                else:
                    filename, song_id = songs[index]
                    url = self.__get_mp3(song_id)
                    if not url:
                        print('歌曲需要收费,下载失败')
                    else:
                        self.__download_mp3(url, filename)
            flag = input('如需继续可以按任意键进行搜歌,否则按0结束程序')
            if flag == '0':
                break
        print('程序结束!')
if __name__ == '__main__':
    # Script entry point: start the interactive downloader.
    spider = Spider()
    spider.run()
================================================
FILE: README-Test.md
================================================
## Test
### Bilibili自动登录测试正常,成功率98%


### web微信


### 图虫Spider


### 淘宝web
- taobao.py为模拟登录
- 剩下的文件为爬虫
### Github

### 新增链家Spider

```
1. 爬取淘宝各子标签,按销量排名商品信息,按分类保存至MongoDB
2. 通过pandas进行数据分析
3. 将商品在各省分布、销量排行、地图分布等通过matplotlib绘图显示
```
### guoke.spider使用需谨慎,下载的比较快!10秒能下载一堆,截图我就不展示了,已经删除,东西太多了😝
### 微博
- sina.py为模拟登录
- spider文件夹中为爬虫
```
1. 输入要爬取的博主ID,获取ajax请求
2. 解析json数据,爬取博主所有微博,保存至MySQL
```
### 网易云音乐
- 新增网易云音乐下载,之前的一个小demo应该还可以用,Crypto包应该挺难搞的,安装之后还是导入不了,推荐去百度一下,百度上的这个解决方法有很多,我就不多赘述了嘿嘿!
### 知乎
- 知乎登录没有问题,不过要手动输入验证码
- 知乎登录遇到“execjs._exceptions.ProgramError: TypeError: 'exports' 未定义”
- 原因以及解决办法:
```
1. 由于是你本地的JScript引擎只有一个默认的JScript,所以会造成json未定义的错误。
2. execjs会自动使用当前电脑上的运行时环境
3. 解决办法:安装一个nodejs的V8引擎就可以了
```

### 糗事百科


### 百度翻译
- 输入英语自动翻译

================================================
FILE: README-en-us.md
================================================
<h2 align="center"><code>🐍Website_login_mode</code></h2>
<br>
<p align="center">
<img src="https://github.com/CriseLYJ/flask-video-streaming-recorder/blob/master/img/main.jpg?raw=true"
alt="Master">
</p>
<br>
<p align="center">"<i>Did you know all your doors were locked?</i>" - Riddick (The Chronicles of Riddick)</p>
<br>
<div align="center">
<sub>Created by
<a href="https://criselyj.github.io/">CriseLYJ</a>
</div>
<br>
****
# 🌟Website_login_mode
This project collects simulated-login methods for a number of major websites, along with some crawler programs. Some log in through selenium, some simulate the login directly from captured requests, and some use scrapy. I hope it helps beginners. The project is meant for studying and sharing simulated-login techniques and crawlers for large websites, and I will keep updating it.
## Simulate login to some common websites and crawl corresponding information
## About
Simulated login is mostly done either by logging in directly or by using selenium+webdriver. Some websites, such as QQ Zone and bilibili, are very difficult to log in to directly; with selenium it is relatively easy.
Although selenium is used for the login step, for efficiency we can keep the cookies obtained after logging in and then use requests or scrapy for data collection, so the speed of data collection can be guaranteed.
## Completed
- [x] [Facebook](https://www.facebook.com/)
- [x] [无需身份验证即可抓取Twitter前端API](https://twitter.com/)
- [x] [微博网页版](http://weibo.com)
- [x] [知乎](http://zhihu.com)
- [x] [QQZone](https://qzone.qq.com/)
- [x] [CSDN](https://www.csdn.net/)
- [x] [淘宝](https://www.taobao.com)
- [x] [Baidu](https://www.baidu.com)
- [x] [果壳](https://www.guokr.com/)
- [x] [JingDong 模拟登录和自动申请京东试用](https://www.jd.com/)
- [x] [163mail](https://mail.163.com/)
- [x] [拉钩](https://www.lagou.com/)
- [x] [Bilibili](https://www.bilibili.com/)
- [x] [豆瓣](https://www.douban.com/)
- [x] [Baidu2](https://www.baidu.com)
- [x] [猎聘网](https://www.liepin.com/)
- [x] [微信网页版登录并获取好友列表](https://wx.qq.com/)
- [x] [Github](https://github.com/)
- [x] [爬取图虫相应的图片](https://tuchong.com/)
## show
### Bilibili automatic login test is normal, the success rate is 98%

### Web WeChat

### 图虫spider


### TaoBaoweb
- taobao.py: simulated login
- The remaining files are crawlers
### Github

```
1. Climb the sub-labels of Taobao, rank the product information by sales, and save to MongoDB by category.
2. Data analysis by pandas
3. Display the distribution of goods in each province, sales ranking, map distribution, etc. through matplotlib
```
### Use guoke.spider with caution: it downloads very fast! It can download a pile of files in 10 seconds. I won't show screenshots; they have been deleted because there were too many 😝
### Sina
- sina.py: Log in for the simulation
- spider: Folder in the crawler
```
1. Enter the blogger ID to crawl and get an ajax request
2. Parse the json data, crawl all of the blogger's posts, and save them to MySQL
```
## tips of pull request
- Welcome everyone to come pull request 💗
## Problems
- About verification codes: the methods used in this project do not handle verification codes (captchas). Recognizing complex captchas is still quite difficult at present. In my experience, the best approach when writing crawlers is to avoid captchas as much as possible.
- Broken code: if the code stops working because a website changed its policy or layout, please open an issue. If you have already fixed it, feel free to submit a PR, thank you!
## Another
- If you have any website that is difficult to log in, such as a website that uses selenium+webdriver and can't log in, please feel free to give me an issue.
- If the repo is helpful to everyone, give a star encouragement.
## something to add
1. After working on the project for a while, I found that the code style, ease of use, extensibility, and readability all have problems, so the most important next step is to refactor the code so that everyone can more easily build small features of their own.
2. If you feel that the login of a website is very representative, please feel free to ask in the issue
3. If the login to the site is very interesting, I will add it in a later update.
4. The login mechanism of the website may change frequently, so when the current simulated login rule cannot be used, please submit it in the issue.
- If you have a lot of attention, I will continue to maintain this repository to bring more things and refactor the code.
## Acknowledgments
- Thanks for all!
## Written at the end
- I need your support.
- And I think you can give me a 🌟``star``!
================================================
FILE: README.md
================================================
<h2 align="center"><code>🎉Life is fantastic🥳!~</code></h2>
<br>
<p align="center">
<img src="https://github.com/CriseLYJ/flask-video-streaming-recorder/blob/master/img/main.jpg?raw=true"
alt="Master">
</p>
<br>
<p align="center">"<i>Did you know all your doors were locked?</i>" - Riddick (The Chronicles of Riddick)</p>
<br>
<p align="center">
<a href="https://github.com/CriseLYJ/awesome-python-login-model/tree/master">
<img src="https://img.shields.io/badge/Branch-master-green.svg?longCache=true"
alt="Branch">
</a>
<a href="https://github.com/CriseLYJ/awesome-python-login-model/stargazers">
<img src="https://img.shields.io/github/stars/CriseLYJ/awesome-python-login-model.svg?label=Stars&style=social"
alt="Stars">
</a>
<a href="https://github.com/CriseLYJ/awesome-python-login-model/network/members">
<img src="https://img.shields.io/github/forks/CriseLYJ/awesome-python-login-model.svg?label=Forks&style=social"
alt="Forks">
</a>
<a href="https://github.com/CriseLYJ/awesome-python-login-model/blob/master/LICENSE">
<img src="https://img.shields.io/badge/License-MIT-blue.svg?longCache=true"
alt="License">
</a>
<a href="https://github.com/sindresorhus/awesome">
<img src="https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg"
alt="Awesome">
</a>
</p>
<br>
<div align="center">
<sub>Created by
<a href="https://Kr1s77.github.io/">@kris</a>
</div>
<br>
****
## 传送门
- [x] [4G 代理](https://github.com/Kr1s77/FgSurfing)
- [x] [异常处理回调,直接 hook 所有函数,和类](https://github.com/Kr1s77/abnormalities)
给个 🌟 再走吧...
## 💕Website login model
一些爬虫示例程序,以及模拟登陆程序,模拟登陆基于 selenium,有些模拟登录基于 js 逆向,持续更新,有问题可以直接提交 Issues,欢迎提交 PR, 测试通过可以直接 merge,文中所有程序都是使用 ``python3`` 编写 :-)
## About
模拟登陆基本采用的是直接登录或者使用selenium+webdriver的方式,有的网站直接登录难度很大,比如qq空间,bilibili等如果采用selenium就相对轻松一些。
虽然在登录的时候采用的是selenium,为了效率,我们可以在登录过后得到的cookie维护起来,然后调用requests或者scrapy等进行数据采集,这样数据采集的速度可以得到保证。
## WebDriver
[Chrome](https://chromedriver.chromium.org/)
[FireFox](https://github.com/mozilla/geckodriver/releases/)
## Completed
- [x] [虾米音乐](https://www.xiami.com/)
- [x] [Facebook](https://www.facebook.com/)
- [x] [微博网页版](http://weibo.com)
- [x] [知乎](http://zhihu.com)
- [x] [QQZone](https://qzone.qq.com/)
- [x] [CSDN](https://www.csdn.net/)
- [x] [淘宝-接口修复完成-可用](https://login.taobao.com/member/login.jhtml)
- [x] [CSDN--已重构](https://www.csdn.net/)
- [x] [Baidu](https://www.baidu.com)
- [x] [果壳](https://www.guokr.com/)
- [x] [JingDong 模拟登录和自动申请京东试用](https://www.jd.com/)
- [x] [163mail](https://mail.163.com/)
- [x] [拉钩](https://www.lagou.com/)
- [x] [Bilibili](https://www.bilibili.com/)
- [x] [豆瓣](https://www.douban.com/)
- [x] [豆瓣spider](https://www.douban.com/)
- [x] [Baidu](https://www.baidu.com)
- [x] [猎聘网](https://www.liepin.com/)
- [x] [微信网页版登录并获取好友列表](https://wx.qq.com/)
- [x] [Github](https://github.com/)
- [x] [爬取图虫相应的图片](https://tuchong.com/)
- [x] [网易云音乐](https://music.163.com/)
- [x] [糗事百科--改为协程版](https://www.qiushibaike.com/)
- [x] [百度贴吧spider](https://tieba.baidu.com/)
- [x] [百度翻译](https://fanyi.baidu.com/)
## catalogue
- [x] [虾米音乐](https://github.com/Kr1s77/awesome-python-login-model/tree/master/xiamiMusic)
- [x] [Facebook模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/facebook)
- [x] [微博网页版模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/sina)
- [x] [QQZone模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/qqzone)
- [x] [CSDN模拟登录--已恢复](https://github.com/Kr1s77/awesome-python-login-model/blob/master/csdn)
- [x] [淘宝爬虫--重构中](https://github.com/Kr1s77/awesome-python-login-model/tree/master/taobao)
- [x] [Baidu模拟登录一](https://github.com/Kr1s77/awesome-python-login-model/tree/master/baidu)
- [x] [果壳爬虫程序](https://github.com/Kr1s77/awesome-python-login-model/tree/master/guoke)
- [x] [JingDong 模拟登录和自动申请京东试用](https://github.com/Kr1s77/awesome-python-login-model/tree/master/jd_login)
- [x] [163mail--已恢复](https://github.com/Kr1s77/awesome-python-login-model/blob/master/163email/163email.py)
- [x] [拉钩模拟登录--已失效](https://github.com/Kr1s77/awesome-python-login-model/blob/master/lagou/Lagou.py)
- [x] [Bilibili模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/bilibili/bilibili.py)
- [x] [豆瓣](https://github.com/Kr1s77/awesome-python-login-model/blob/master/douban/douban.py)
- [x] [Baidu2模拟登录](https://github.com/Kr1s77/awesome-python-login-model/blob/master/baidu2/baidu.py)
- [x] [猎聘网模拟登录](https://github.com/Kr1s77/awesome-python-login-model/tree/master/liepin)
- [x] [微信网页版登录并获取好友列表](https://github.com/Kr1s77/awesome-python-login-model/blob/master/webWeixin/webWeixin.py)
- [x] [Github模拟登录两种解决方案都可行](https://github.com/Kr1s77/awesome-python-login-model/tree/master/Github)
- [x] [爬取图虫想要的图片](https://github.com/Kr1s77/awesome-python-login-model/blob/master/tuchong/tuchong.py)
- [x] [网易云音乐downloader](https://github.com/Kr1s77/awesome-python-login-model/blob/master/NeteaseCloudMusicDownload/api.py)
- [x] [糗事百科爬虫](https://github.com/Kr1s77/awesome-python-login-model/blob/master/qsbk/qiushibaike.py)
- [x] [淘宝登陆-访问](https://login.taobao.com/member/login.jhtml)
# Test
> [Please touch here to view test images](./README-Test.md)
## Informations
- 为感谢你们的支持,准备写一套免费爬虫的教程,保证你学会以后可以爬取市面上大部分的网站,[教程地址](https://github.com/CriseLYJ/-Python-crawler-starts-from-zero)
## tips of pull request
- 欢迎大家一起来 pull request 💗
## Problems
- 关于验证码:本项目所用的方法都没有处理验证码,识别复杂验证码的难度就目前来说,还是比较大的。以我的心得来说,做爬虫最好的方式就是尽量规避验证码。
- 代码失效:由于网站策略或者样式改变,导致代码失效,请给我提issue,如果你已经解决,可以提PR,谢谢!
- 正在对部分代码进行优化。。。
- 如果该repo对大家有帮助,记得 star 哦。
## Acknowledgments
> [@deepforce](https://github.com/deepforce) | [@cclauss](https://github.com/cclauss) | [ksoeasyxiaosi](https://github.com/ksoeasyxiaosi) | [JasonJunJun](https://github.com/JasonJunJun) | [MediocrityXT](https://github.com/MediocrityXT)
- 感谢以上开发者的支持和贡献。
## 联系我
- 欢迎反馈!
- Email : criselyj@163.com
## 注意:
- 本项目仅用于学习和交流
> 欢迎任何人参与和完善:一个人可以走的很快,但是一群人却可以走的更远
================================================
FILE: baidu/baidu.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import sys
import time
from uuid import uuid4
from getpass import getpass
import requests
from util import *
# Python 2 compatibility: rebind ``input`` to ``raw_input`` on interpreters
# older than 3.0.
if sys.version_info[0] < 3:
    input = raw_input
class BaiduLogin(object):
    """Username/password login against passport.baidu.com via a requests session."""

    def __init__(self):
        """Create the HTTP session, a fresh ``gid`` and empty token/key slots."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/601.2.7 (KHTML, like Gecko) Version/9.0.1 Safari/601.2.7',
            'referer': 'https://pan.baidu.com/',
        }
        self.sess = requests.session()
        self.gid = str(uuid4()).upper()
        self.token = None
        self.key = None
        self.public_key = None

    def _init_cookies(self):
        """Initialise the session cookies by requesting pan.baidu.com.

        :return:
        """
        self.sess.get(url='https://pan.baidu.com/', headers=self.headers)

    def _get_token(self):
        """Fetch the login token and store it in ``self.token``.

        :return:
        """
        url = 'https://passport.baidu.com/v2/api/?getapi'
        payload = {
            'getapi': '',
            'tpl': 'mn',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'class': 'login',
            'gid': self.gid,
            'loginversion': 'v4',
            'logintype': 'dialogLogin',
            'traceid': '',
            'callback': 'bd__cbs__pivyke',
        }
        resp = self.sess.get(url=url, params=payload, headers=self.headers)
        # Single quotes are swapped for double quotes before the embedded
        # object is parsed as JSON.
        js = parse_json(resp.text.replace("'", '"'))
        self.token = js['data']['token']

    def _get_public_key(self):
        """Fetch the RSA key id and public key into ``self.key`` / ``self.public_key``.

        :return:
        """
        url = 'https://passport.baidu.com/v2/getpublickey'
        payload = {
            'token': self.token,
            'tpl': 'mn',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'gid': self.gid,
            'loginversion': 'v4',
            'traceid': '',
            'callback': 'bd__cbs__h02h0j'
        }
        resp = self.sess.get(url=url, params=payload, headers=self.headers)
        js = parse_json(resp.text.replace("'", '"'))
        self.key, self.public_key = js.get('key'), js.get('pubkey')

    @staticmethod
    def _parse_login_result(text):
        """Extract the ``key=value&...`` result embedded in a login response.

        :param text: body of the login response
        :return: ``(result, fields)`` where ``result`` is the raw query string
            and ``fields`` the parsed dict, or ``(None, {})`` when the
            response does not contain the expected ``href += "..."`` snippet
        """
        m = re.search(r'.*href \+= "(.*)"\+accounts', text)
        if m is None:
            return None, {}
        result = m.group(1)
        # Split on the first "=" only so values that contain "=" (for
        # example URLs) no longer raise ValueError.
        fields = dict(
            pair.split('=', 1) for pair in result.split('&') if '=' in pair)
        return result, fields

    def login(self, username, password, retry=4):
        """Log in with a user name and password.

        :param username: user name
        :param password: password
        :param retry: maximum number of login posts; a new post is made
            after each verification-code prompt
        :return:
        :raises LoginError: on a risk-control block, wrong credentials, an
            unknown error code, an unrecognised response, or when all
            retries are used up
        """
        self._init_cookies()
        self._get_token()
        self._get_public_key()

        url = 'https://passport.baidu.com/v2/api/?login'
        data = {
            'staticpage': 'https://www.baidu.com/cache/user/html/v3Jump.html',
            'charset': 'UTF-8',
            'token': self.token,
            'tpl': 'netdisk',
            'subpro': 'netdisk_web',
            'apiver': 'v3',
            'tt': str(int(time.time() * 1000)),
            'codestring': '',
            'safeflg': '0',
            'u': 'https://www.baidu.com/',
            'isPhone': 'false',
            'detect': '1',
            'gid': self.gid,
            'quick_user': '0',
            'logintype': 'dialogLogin',
            'logLoginType': 'pc_loginDialog',
            'idc': '',
            'loginmerge': 'true',
            'splogin': 'rate',
            'username': username,
            'password': encrypt_pwd(password, self.public_key),
            'rsakey': self.key,
            'crypttype': '12',
            'ppui_logintime': 254896,
            'countrycode': '',
            'loginversion': 'v4',
            'traceid': '',
            'callback': 'parent.bd__pcbs__oxzeyj'
        }

        for _ in range(retry):
            resp = self.sess.post(url=url, headers=self.headers, data=data)
            result, d = self._parse_login_result(resp.text)
            if result is None:
                # Previously this surfaced as AttributeError on ``m.group``.
                raise LoginError("Unknown error:" + "unexpected login response")

            err_no = d.get('err_no')
            if err_no == '0':
                print('Login success!')
                return
            elif err_no in ['6', '257']:
                # A verification code is required: download the image, show
                # it, and retry with the code the user types in.
                code_string = d.get('codeString')
                data['codestring'] = code_string
                resp = self.sess.get(
                    url='https://passport.baidu.com/cgi-bin/genimage?{}'.format(code_string),
                    headers=self.headers
                )
                image_path = os.path.join(os.getcwd(), 'vcode-login.jpg')
                save_image(resp, image_path)
                open_image(image_path)
                verify_code = input('Please enter the verify code for login(return change):')
                data['verifycode'] = verify_code
            elif err_no == '120021':
                raise LoginError("Account is in risk, please do security verification first!")
            elif err_no in ['4', '7']:
                raise LoginError('Error username or password!')
            else:
                raise LoginError("Unknown error:" + result)

        raise LoginError('Login Fail!')
class LoginError(Exception):
    """Raised by BaiduLogin.login when the login cannot be completed."""
if __name__ == '__main__':
    # Read the credentials (password is not echoed) and attempt the login.
    username = input("Username: ")
    password = getpass("Password: ")
    BaiduLogin().login(username=username, password=password)
================================================
FILE: baidu/requirements.txt
================================================
requests>=2.20.0
pycryptodome>=3.6.6
================================================
FILE: baidu/util.py
================================================
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import json
from base64 import b64encode
from Crypto.PublicKey import RSA
from Crypto.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5
def encrypt_pwd(password, public_key):
    """Encrypt ``password`` with ``public_key`` and return it as base64 text.

    The password is UTF-8 encoded, encrypted with the PKCS#1 v1.5 cipher
    built from the imported RSA key, and the ciphertext is base64 encoded.
    """
    cipher = Cipher_pkcs1_v1_5.new(RSA.importKey(public_key))
    encrypted = cipher.encrypt(password.encode('utf-8'))
    return b64encode(encrypted).decode('utf-8')
def open_image(image_file):
    """Open ``image_file`` with the platform's viewer.

    Windows uses the file association, Linux runs ``eog`` and every other
    platform runs ``open``. The path is passed as a separate argument rather
    than spliced into a shell command, so paths containing spaces or shell
    metacharacters work and are never interpreted by a shell. A viewer that
    cannot be started is reported on stdout instead of raising.
    """
    import subprocess  # local import: only this function needs it

    try:
        if os.name == "nt":
            os.startfile(image_file)  # for Windows
        elif os.uname()[0] == "Linux":
            subprocess.call(["eog", image_file])  # for Linux
        else:
            subprocess.call(["open", image_file])  # for Mac
    except OSError as err:
        print('Unable to open image %s: %s' % (image_file, err))
def save_image(resp, image_file):
    """Stream the body of *resp* into the binary file *image_file*.

    :param resp: response object exposing ``iter_content(chunk_size=...)``.
    :param image_file: destination path; it is created or overwritten.
    """
    with open(image_file, 'wb') as out:
        out.writelines(resp.iter_content(chunk_size=1024))
def parse_json(s):
    """Decode the JSON object embedded in *s*.

    Everything before the first ``{`` and after the last ``}`` is discarded
    before decoding.

    :param s: text containing a JSON object.
    :return: the decoded object.
    """
    start = s.find('{')
    stop = s.rfind('}')
    return json.loads(s[start:stop + 1])
================================================
FILE: baidu_translate/Baidufanyi.py
================================================
# !/usr/bin/python3
# -*- coding: utf-8 -*-
"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
"""
"""
请求url分析 :https://fanyi.baidu.com/basetrans
请求方式分析 :POST
请求参数分析 : {
query: hello
from: en
to: zh
token: 6f5c83b84d69ad3633abdf18abcb030d
sign: 54706.276099}
请求头分析
"""
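# Implementation outline
# 1. Build the crawler as an object-oriented class
# 2. The four crawl steps
# 2.1 Build the URL
# 2.2 Send the request and receive the response
# 2.3 Extract the data from the response
# 2.4 Save the data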
import requests
import js2py
# Shared js2py context; make_sign() loads translate.js into it.
context = js2py.EvalJs()
# Translation mode
# 0: English -> Chinese   1: Chinese -> English
translating_mode = 0
class BaiDuTranslater(object):
    """
    Crawler for Baidu Translate.
    """

    def __init__(self, query):
        """Keep the text to translate together with the fixed request settings.

        :param query: text to be translated.
        """
        self.query = query
        self.url = "https://fanyi.baidu.com/basetrans"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
            "Referer": "https://fanyi.baidu.com/",
            "Cookie": "BAIDUID=714BFAAF02DA927F583935C7A354949A:FG=1; BIDUPSID=714BFAAF02DA927F583935C7A354949A; PSTM=1553390486; delPer=0; PSINO=5; H_PS_PSSID=28742_1463_21125_18559_28723_28557_28697_28585_28640_28604_28626_22160; locale=zh; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1553658863,1553766321,1553769980,1553770442; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1553770442; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1553766258,1553766321,1553769980,1553770442; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1553770442",
        }

    def make_sign(self):
        """Compute the ``sign`` request field for ``self.query``.

        Reads ``translate.js`` (relative to the current working directory),
        executes it in the module-level js2py context and calls its ``a``
        function with the query.

        :return: whatever the JavaScript ``a`` function returns.
        """
        with open("translate.js", "r", encoding="utf-8") as script:
            context.execute(script.read())
        return context.a(self.query)

    def make_data(self, sign):
        """Build the POST form for the translation request.

        The language pair follows the module-level ``translating_mode``:
        0 translates en -> zh, any other value zh -> en.

        :param sign: signature produced by :meth:`make_sign`.
        :return: dict of form fields.
        """
        if translating_mode == 0:
            source_lang, target_lang = "en", "zh"
        else:
            source_lang, target_lang = "zh", "en"
        return {
            "query": self.query,
            "from": source_lang,
            "to": target_lang,
            "token": "6f5c83b84d69ad3633abdf18abcb030d",
            "sign": sign,
        }

    def get_content(self, data):
        """POST *data* and return the ``dst`` field of the first ``trans`` entry."""
        response = requests.post(url=self.url, headers=self.headers, data=data)
        payload = response.json()
        return payload["trans"][0]["dst"]

    def run(self):
        """Sign the query, send it and print the translation."""
        form = self.make_data(self.make_sign())
        print(self.get_content(form))
# Script entry point: ask for the direction and the text, then translate it.
if __name__ == '__main__':
    # Rebinds the module-level translating_mode that make_data() reads.
    translating_mode = int(input("请输入翻译模式(0:英译中 1:中译英):"))
    query = input("请输入您要翻译的内容:")
    translater = BaiDuTranslater(query)
    translater.run()
================================================
FILE: baidu_translate/translate.js
================================================
// Seed string for the sign computation: a() splits it on "." into two numbers.
var i = "320305.131321201"
/**
 * Scramble the number r according to the operation string o.
 *
 * o is consumed three characters at a time:
 *   - 3rd char: shift amount ("a" and above map to charCode - 87, otherwise Number(char))
 *   - 2nd char: "+" selects an unsigned right shift (>>>), anything else a left shift (<<)
 *   - 1st char: "+" adds the shifted value to r and masks with 4294967295,
 *               anything else XORs the shifted value into r
 *
 * Returns the scrambled r.
 */
function n(r, o) {
    for (var t = 0; t < o.length - 2; t += 3) {
        var e = o.charAt(t + 2);
        e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
        e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
        r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
    }
    return r
}
/**
 * Compute the sign string for the text r.
 *
 * Steps: shorten long input to 30 units, expand it into a byte array,
 * fold every byte into an accumulator with n(), then post-process the
 * accumulator with the two numbers taken from the seed string i.
 *
 * Returns "<w>.<w ^ S>", where w is the accumulator modulo 1e6 and S is the
 * first number of the seed.
 *
 * NOTE(review): e (used in the surrogate-pair branch) and o (used when i is
 * null) are not defined in this file; those paths would throw as written.
 */
function a(r) {
    // t: every surrogate pair found in r, or null when there is none.
    var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === t) {
        var a = r.length;
        // Longer than 30: keep the first 10, the middle 10 and the last 10 characters.
        a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Rebuild the text as a list u in which each surrogate pair is one element,
        // then apply the same first/middle/last truncation on that list.
        for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++)
            "" !== C[h] && u.push.apply(u, e(C[h].split(""))),
            h !== f - 1 && u.push(t[h]);
        var g = u.length;
        g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join(""))
    }
    // d spells "gtk"; it is only used as a lookup key when i is null.
    var l = void 0
      , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    l = null !== i ? i : (i = o.common[d] || "") || "";
    // S and s are the two numbers of the seed; c receives 1-4 bytes per code
    // point of r (UTF-8 style bit layout).
    for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
        var p = r.charCodeAt(F);
        128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
        c[v++] = p >> 18 | 240,
        c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
        c[v++] = p >> 6 & 63 | 128),
        c[v++] = 63 & p | 128)
    }
    // A spells "+-a^+6" and b spells "+-3^+b+-f": the operation strings passed to n().
    for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
        w += c[D],
        w = n(w, A);
    // Final mix, XOR with s, convert a negative value to its unsigned form, reduce modulo 1e6.
    return w = n(w, b),
    w ^= s,
    0 > w && (w = (2147483647 & w) + 2147483648),
    w %= 1e6,
    w.toString() + "." + (w ^ S)
}
================================================
FILE: bilibili/bilibili.py
================================================
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
from PIL import Image
from io import BytesIO
from time import sleep
from getpass import getpass
import random
"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-3-7
"""
class BiliBili():
    """
    Log in to bilibili and handle the slider captcha.
    The display scaling must be 100%, otherwise the captcha image is
    cropped from the wrong place.
    """

    def __init__(self, username, password):
        """
        Start Chrome, open the login page and remember the credentials.
        """
        chrome_opts = webdriver.ChromeOptions()
        # Developer-mode switch, set to avoid being detected
        chrome_opts.add_experimental_option('excludeSwitches',
                                            ['enable-automation'])
        self.browser = webdriver.Chrome(options=chrome_opts)
        self.url = 'https://passport.bilibili.com/login'
        self.browser.get(self.url)
        self.wait = WebDriverWait(self.browser, 5, 0.2)
        self.username = username
        self.password = password

    def get_button(self):
        """
        Wait for the slider knob to become clickable.
        :return: the slider knob element
        """
        knob_clickable = EC.element_to_be_clickable((By.CLASS_NAME, 'gt_slider_knob'))
        return self.wait.until(knob_clickable)

    def get_screenshot(self, button):
        """
        Take two page screenshots:
        1. while the mouse hovers over the knob
        2. while the knob is pressed and held
        :param button: the slider knob
        :return: tuple (hover screenshot, held screenshot) as PIL images
        """
        ActionChains(self.browser).move_to_element(button).perform()
        hovered = Image.open(BytesIO(self.browser.get_screenshot_as_png()))
        ActionChains(self.browser).click_and_hold(button).perform()
        held = Image.open(BytesIO(self.browser.get_screenshot_as_png()))
        return (hovered, held)

    def get_position(self, button):
        """
        Locate the captcha image on the page.
        :param button: the slider knob (hovered first)
        :return: top, bottom, left, right of the captcha box
        """
        ActionChains(self.browser).move_to_element(button).perform()
        box = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'gt_box')))
        sleep(2)
        location = box.location
        size = box.size
        print(location, size)
        top = location['y']
        left = location['x']
        return top, top + size['height'], left, left + size['width']

    def get_geetest_image(self, button, name1='captcha1.png', name2='captcha2.png'):
        """
        Crop the captcha out of both screenshots and save the crops.
        :param button: the slider knob
        :param name1: file name for the original captcha (hover screenshot)
        :param name2: file name for the captcha with the gap (held screenshot)
        :return: tuple (original captcha, captcha with gap)
        """
        top, bottom, left, right = self.get_position(button)
        print('验证码位置', top, bottom, left, right)
        crop_box = (left, top, right, bottom)
        shots = self.get_screenshot(button)
        captcha1 = shots[0].crop(crop_box)
        captcha1.save(name1)
        captcha2 = shots[1].crop(crop_box)
        captcha2.save(name2)
        return (captcha1, captcha2)

    def login(self):
        """
        Open the login page and type the account and the password.
        :return: None
        """
        self.browser.get(self.url)
        user_box = self.wait.until(EC.element_to_be_clickable((By.ID, 'login-username')))
        pass_box = self.wait.until(EC.element_to_be_clickable((By.ID, 'login-passwd')))
        sleep(1)
        user_box.send_keys(self.username)
        sleep(1)
        pass_box.send_keys(self.password)

    def is_pixel_equal(self, img1, img2, x, y):
        """
        Compare one pixel of the two captcha images.
        The pixel read is (x-1, y); the first three channels must each
        differ by less than 100.
        :param img1: original captcha
        :param img2: captcha with the gap
        :param x: x coordinate
        :param y: y coordinate
        :return: True when the pixels are considered equal
        """
        threshold = 100
        pixel1 = img1.load()[x-1, y]
        pixel2 = img2.load()[x-1, y]
        return all(abs(pixel1[ch] - pixel2[ch]) < threshold for ch in range(3))

    def get_gap(self, img1, img2):
        """
        Find the horizontal offset of the gap.
        Columns left of x=60 are skipped to ignore the first piece.
        :param img1: original captcha
        :param img2: captcha with the gap
        :return: x of the first differing column, or 60 when none differs
        """
        start = 60
        differing = (
            x
            for x in range(start, img1.size[0])
            for y in range(img1.size[1])
            if not self.is_pixel_equal(img1, img2, x, y)
        )
        return next(differing, start)

    def get_track(self, distance):
        """
        Build the list of per-step x offsets for dragging the slider.
        The slider accelerates for the first two thirds of the distance,
        then decelerates, overshoots by 10 and is pulled back by four
        negative steps.
        :param distance: x coordinate returned by get_gap
        :return: list of integer offsets
        """
        track = []
        travelled = 0
        midpoint = distance * 2 / 3
        t = 0.2
        speed = 0
        distance += 10  # overshoot the target, then step back
        while travelled < distance:
            a = random.randint(1, 3) if travelled < midpoint else -random.randint(3, 5)
            v0 = speed
            speed = v0 + a * t
            move = v0 * t + 0.5 * a * t * t
            travelled += move
            track.append(round(move))
        track.extend(-random.randint(2, 3) for _ in range(2))
        track.extend(-random.randint(1, 4) for _ in range(2))
        print(track)
        return track

    def move_button(self, button, track):
        """
        Drag the slider along the given track and release it.
        :param button: the slider knob
        :param track: list of x offsets
        :return: None
        """
        ActionChains(self.browser).click_and_hold(button).perform()
        for offset in track:
            ActionChains(self.browser).move_by_offset(xoffset=offset, yoffset=0).perform()
            sleep(0.0005)
        sleep(0.5)
        ActionChains(self.browser).release().perform()

    def crack(self):
        """
        Run the whole flow:
        1. type the account and the password
        2. get the slider knob
        3. capture the two captcha images
        4. compute the drag track
        5. drag the slider, at most three times
        :return: None
        """
        self.login()
        button = self.get_button()
        captcha = self.get_geetest_image(button)
        left = self.get_gap(captcha[0], captcha[1])
        print(left)
        track = self.get_track(left)
        # Retry the drag when the verification fails, three attempts at most
        for _attempt in range(3):
            self.move_button(button, track)
            try:
                success = self.wait.until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'gt_info_type'), '验证通过:'))
                print(success)
            except TimeoutException:
                print('fail')
                continue
            print('success')
            return None
# Script entry point: ask for the credentials and run the captcha flow.
if __name__ == '__main__':
    ACCOUNT = input('请输入您的账号:')
    PASSWORD = getpass('请输入您的密码:')
    test = BiliBili(ACCOUNT, PASSWORD)  # pass the account and the password
    test.crack()
================================================
FILE: csdn/README
================================================
csdn login module
========================
@upload and test date: 2020-08-17
@use module: pyppeteer==0.2.2
@author: Kris
$ pip install pyppeteer==0.2.2
================================================
FILE: csdn/selenium_csdn.py
================================================
# -*- coding: utf-8 -*-
# @Author: Kris
# @Mail: criselyj@163.com
# @Date: 2020-08-14 17:40:11
import os
import random
from getpass import getpass
import asyncio
from pyppeteer import launch
base_url = 'https://passport.csdn.net/login'
current_dir = os.path.dirname(os.path.realpath(__file__))
# Fix: https://github.com/miyakogi/pyppeteer/issues/183 (file permission problem).
cache_dir = os.path.join(current_dir, 'cache')
# exist_ok=True replaces the exists()/mkdir() pair, which could fail when the
# directory appears between the check and the creation.
os.makedirs(cache_dir, exist_ok=True)
class Api(object):
    """Drive a pyppeteer browser through the CSDN account/password login form."""

    def __init__(self, account, password):
        """Remember the credentials; the browser and page are created by crawl()."""
        self.account = account
        self.password = password
        self.url = base_url
        self.browser = None
        self.page = None

    async def send_key(self):
        """Switch to the second login tab, type the credentials and submit the form."""
        await asyncio.sleep(random.randint(2, 3))
        tabs = await self.page.xpath('//ul/li[@class="text-tab border-right"][2]/a')
        await tabs[0].click()
        account_inputs = await self.page.xpath('//div[@class="form-group"]/div/input[1]')
        # per-keystroke delay, drawn separately for each field
        account_typing = {'delay': random.randint(100, 200) - 50}
        await account_inputs[0].type(self.account, account_typing)
        password_typing = {'delay': random.randint(100, 200) - 50}
        await self.page.type('#password-number', self.password, password_typing)
        await self.page.click('button[data-type=account]')
        await asyncio.sleep(random.randint(5, 10))

    async def crawl(self):
        """Launch the browser, open the login page, patch navigator and log in."""
        # headless is False for testing;
        # it can be switched to a headless browser for production
        launch_options = {
            'headless': False,
            'userDataDir': cache_dir,
            'defaultViewport': {'width': 1440, 'height': 1000},
            'args': ['--no-sandbox']
        }
        self.browser = await launch(launch_options)
        self.page = await self.browser.newPage()
        await self.page.goto(self.url)
        # Fake the browser state to counter automation-tool detection
        stealth_snippets = (
            "() =>{ Object.defineProperties(navigator,{ webdriver:"
            "{ get: () => false } }) }",
            "() =>{ window.navigator.chrome = { runtime: {}, }; }",
            "() =>{ Object.defineProperty(navigator, 'languages', "
            "{ get: () => ['en-US', 'en'] }); }",
            "() =>{ Object.defineProperty(navigator, 'plugins', { "
            "get: () => [1, 2, 3, 4, 5,6], }); }"
        )
        for snippet in stealth_snippets:
            await self.page.evaluate(snippet)
        await self.send_key()
def main():
    """Prompt for the CSDN credentials and run the login coroutine to completion."""
    print('[*] 模拟登陆 CSDN 程序启动...')
    account = input('[*] 请输入账号:')
    password = getpass('[*] 请输入密码:')
    api = Api(account, password)
    asyncio.get_event_loop().run_until_complete(api.crawl())
# Run the interactive login only when the file is executed as a script.
if __name__ == '__main__':
    main()
================================================
FILE: douban/douban.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-04-04
"""
"""
模拟登陆豆瓣
"""
class DouBanLogin(object):
    """Log in to Douban through the mobile basic-login endpoint."""

    def __init__(self, account, password):
        """Prepare the login request.

        :param account: Douban account name.
        :param password: account password.
        """
        self.url = "https://accounts.douban.com/j/mobile/login/basic"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        # Form fields posted to the login endpoint
        self.data = {
            "ck": "",
            "name": account,
            "password": password,
            "remember": "true",
            "ticket": ""
        }
        self.session = requests.Session()

    def get_cookie(self):
        """Post the credentials through the session.

        :return: True when the response reports ``status == "success"``;
                 otherwise False, after printing the raw response so the
                 failure is visible instead of being silently ignored.
        """
        html = self.session.post(
            url=self.url,
            headers=self.headers,
            data=self.data
        ).json()
        # .get(): a response without "status" counts as a failure, not a KeyError
        if html.get("status") == "success":
            print("恭喜你,登陆成功")
            return True
        print("登陆失败:", html)
        return False

    def get_user_data(self):
        """Fetch and print the user's home page to show the login worked."""
        # TODO: put the URL of your own user home page here
        url = "这里填写你用户主页的url"
        # fetch the user information page
        html = self.session.get(url).text
        print(html)

    def run(self):
        """Log in and, only when that succeeded, fetch the user page."""
        if self.get_cookie():
            self.get_user_data()
# Script entry point: ask for the credentials and run the login.
if __name__ == '__main__':
    # Imported here because the module header does not import getpass.
    from getpass import getpass

    account = input("请输入你的账号:")
    # getpass keeps the password from being echoed on the terminal.
    password = getpass("请输入你的密码:")
    login = DouBanLogin(account, password)
    login.run()
================================================
FILE: douban/douban_spider.py
================================================
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import json
import requests
# 定义请求url
# Request URL
url = "https://movie.douban.com/j/search_subjects"
# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
}
# Build the query for each page (page_start = 0, 20, ..., 80) and send the request
for page_start in range(0, 100, 20):
    params = {
        "type": "movie",
        "tag": "热门",
        "sort": "recommend",
        "page_limit": "20",
        "page_start": page_start
    }
    response = requests.get(
        url=url,
        headers=headers,
        params=params
    )
    # Option 1: let the response object decode the JSON
    # results = response.json()
    # Option 2: decode manually
    # take the response body
    content = response.content
    # decode the JSON body into Python objects
    results = json.loads(content)
    # print the title and rating of every movie in the result
    for movie in results["subjects"]:
        print(movie["title"], movie["rate"])
================================================
FILE: facebook/facebook.py
================================================
from __future__ import print_function
import argparse
import requests
import pyquery
def login(session, email, password):
    """
    Log in to m.facebook.com and collect the session identifiers.

    :param session: requests-style session used for every call.
    :param email: account e-mail address.
    :param password: account password.
    :return: ``(fb_dtsg, c_user, xs)`` on success, ``False`` when the login
             response does not carry the ``c_user`` cookie.
    """
    # Load the landing page first (the original note says: get the cookie)
    session.get('https://m.facebook.com')
    # Try to log in
    credentials = {'email': email, 'pass': password}
    response = session.post('https://m.facebook.com/login.php',
                            data=credentials, allow_redirects=False)
    cookies = response.cookies
    if 'c_user' not in cookies:
        return False
    # The c_user cookie means the login succeeded
    homepage_resp = session.get('https://m.facebook.com/home.php')
    dom = pyquery.PyQuery(homepage_resp.text.encode('utf8'))
    fb_dtsg = dom('input[name="fb_dtsg"]').val()
    return fb_dtsg, cookies['c_user'], cookies['xs']
# Script entry point: log in with the credentials given on the command line.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Login to Facebook')
    parser.add_argument('email', help='Email address')
    parser.add_argument('password', help='Login password')
    args = parser.parse_args()
    session = requests.session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
    })
    # login() returns False on failure; unpacking that directly raised
    # "TypeError: cannot unpack non-iterable bool" instead of reporting it.
    result = login(session, args.email, args.password)
    fb_dtsg, user_id, xs = result if result else (None, None, None)
    if user_id:
        print('{0}:{1}:{2}'.format(fb_dtsg, user_id, xs))
    else:
        print('Login Failed')
================================================
FILE: guoke/guoke.py
================================================
import requests
import re
# Headers (including a captured cookie) for the sign-in page and captcha requests.
headers_login = {
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'account.guokr.com',
    'Pragma': 'no-cache',
    'Cookie': '__utmt=1; __utma=253067679.2102330349.1540780238.1540780238.1541122809.2; __utmb=253067679.12.9.1541122812936; __utmc=253067679; __utmz=253067679.1540780238.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=253067679.|1=Is%20Registered=No=1; session=afcf1b0f-c71b-43d2-8046-f60ae28f9b45',
    'Referer': 'https://account.guokr.com/sign_in/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.61 Safari/537.36'
}
session = requests.Session()
url = 'https://account.guokr.com/sign_in/'
# Load the sign-in page and pull the CSRF token and the captcha id out of its HTML.
resp = session.get(url, headers=headers_login)
html = resp.text
csrf_token = re.search(r'id="csrf_token[\s\S]*?(\d+[\s\S]*?)"', html).group(1)
captcha_rand = re.search(r'id="captchaRand[\s\S]*?(\d+)', html).group(1)
# Download the captcha image so the user can read it from captcha.jpg.
img_url = 'https://account.guokr.com/captcha/' + captcha_rand
with open('captcha.jpg', 'wb') as fw:
    fw.write(session.get(img_url, headers=headers_login).content)
username = input('请输入用户名:')
password = input('请输入密码:')
captcha = input('请输入验证码 : ')
# Login form.
# NOTE(review): 'permanent' is sent as 'y ' with a trailing space - confirm this is intended.
data = {
    'csrf_token': csrf_token,
    'username': username,
    'password': password,
    'captcha': captcha,
    'captcha_rand': captcha_rand,
    'permanent': 'y ',
}
# NOTE(review): this POST does not send headers_login, unlike the requests above.
response = session.post(url, data=data)
# Keep the login response for inspection.
with open('response.html', 'w', encoding='utf-8') as fw:
    fw.write(response.text)
# print(response.cookies)
# print(session.cookies)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.61 Safari/537.36'
}
# Fetch a (hard-coded) user home page with the same session and save it.
homepage = 'https://www.guokr.com/i/0210199872/'
with open('homepage.html', 'w', encoding='utf-8') as fw:
    res = session.get(homepage, headers=headers)
    fw.write(res.text)
================================================
FILE: guoke/guoke_spider.py
================================================
# -*- coding: utf-8 -*-
import requests
from urllib.parse import urlencode
from requests import codes
import os
from multiprocessing.pool import Pool
from bs4 import BeautifulSoup as bsp
import json
import time
import re
"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-3-7
"""
def get_index(offset):
    """Fetch one page of the article index.

    :param offset: value sent as the ``offset`` query parameter.
    :return: the decoded JSON body on HTTP 200; ``None`` for any other
             status or when the connection fails.
    """
    query = urlencode({
        'retrieve_type': "by_subject",
        'limit': "20",
        'offset': offset
    })
    url = 'http://www.guokr.com/apis/minisite/article.json?' + query
    try:
        resp = requests.get(url)
        return resp.json() if resp.status_code == codes.ok else None
    except requests.ConnectionError:
        return None
# 解析出文章的url
def get_url(json):
    """Yield the article URLs contained in an index response.

    :param json: decoded index response (a dict), or ``None`` when the index
                 request failed; ``None`` and responses without a ``result``
                 list yield nothing instead of raising AttributeError.
    :return: generator over the ``url`` of every entry whose ``cell_type``
             is ``None``.
    """
    if not json:
        return
    for item in json.get('result') or []:
        # entries carrying a cell_type are skipped
        if item.get('cell_type') is not None:
            continue
        yield item.get('url')
"""
try:
result=json.load(json)
if result:
for i in result.get('result'):
yield i.get('url')
"""
# 解析文章详情页
def get_text(url):
    """Download an article page and yield its parsed content once.

    :param url: address of the article page.
    :return: generator yielding a single dict with the keys ``title``,
             ``autor`` and ``article``.
    """
    html = requests.get(url).text
    print(html)
    soup = bsp(html, 'lxml')
    title = soup.find('h1', id='articleTitle').get_text()
    autor = soup.find('div', class_="content-th-info").find('a').get_text()
    paragraphs = soup.find('div', class_="document").find_all('p')
    # keep only the paragraphs that contain neither an image nor a link
    plain_text = [p.get_text() for p in paragraphs if not (p.find('img') or p.find('a'))]
    yield {"title": title, "autor": autor, "article": '\n'.join(plain_text)}
def save_article(content):
    """Write one article to ``<title>.txt`` in the current directory.

    The file holds the title, the author and the article text, one per line
    group.  Characters that are not allowed in file names are replaced by
    ``_`` so that titles containing e.g. ``/`` or ``?`` can still be saved.

    :param content: dict with the keys ``title``, ``autor`` and ``article``;
                    an entry without a title is skipped.
    """
    title = content.get('title')
    if not title:
        print('Skipped article without a title: %s' % content)
        return
    file_name = re.sub(r'[\\/:*?"<>|\r\n]+', '_', str(title)).strip() + '.txt'
    body = '\n'.join([str(title), str(content.get('autor')), str(content.get('article'))])
    try:
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(body)
    except OSError as exc:
        # File errors are OSError; the previous requests.ConnectionError
        # handler could never match a failure of open()/write().
        print('Failed to save article %s: %s' % (file_name, exc))
    else:
        print('Downloaded article path is %s' % file_name)
def main(offset):
    """Download and save every article listed on the index page at *offset*."""
    index = get_index(offset)
    for url in get_url(index):
        for art in get_text(url):
            # print(art)
            save_article(art)
# Index pages GROUP_START..GROUP_END (inclusive) are crawled.
GROUP_START = 0
GROUP_END = 7
if __name__ == '__main__':
    for i in range(GROUP_START, GROUP_END + 1):
        # get_index() requests 20 entries per page.
        # NOTE(review): the reason for the extra +18 in the offset is not visible here.
        main(offset=i * 20 + 18)
        # pause one second between pages
        time.sleep(1)
================================================
FILE: jd_login/Method_First/Try_selenium.py
================================================
#coding=utf-8
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import os
from pyquery import PyQuery as pq
from config import settings as SET
import re
# browser_for_login is a regular (visible) browser, used only for logging in
browser_for_login = webdriver.Chrome()
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Headless browser that performs the applications.  The options are passed
# with `options=`; the `chrome_options=` keyword is deprecated in Selenium.
browser = webdriver.Chrome(options=chrome_options)
wait = WebDriverWait(browser,10)
# Upper bound and running count of applied products
total_num_of_products = SET['total_products']
total_num_of_products_cur = 0
# Keyword lists filled by deal_file() and read by check_name()
choice_list=[]
ban_list=[]
#所有的sleep为了是减慢速度, 防止被检查异常
def do_try(url):
    """Apply for one trial product in the second browser tab.

    Every return path first switches back to the listing tab (window 0).

    :param url: address of the product's trial page.
    :return: False when the button text is not '申请试用'; True when the
             application was submitted or one of the waits timed out.
    """
    apply_selector = '#product-intro > div.info > div.try-info.clearfix.bigImg > div.info-detail.chosen > div > div.btn-wrap > a'
    follow_selector = 'body > div.ui-dialog > div.ui-dialog-content > div > div > div.btn > a.y'
    try:
        browser.switch_to.window(browser.window_handles[1])
        browser.get(url)
        time.sleep(2)
        apply_button = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, apply_selector))
        )
        # Any other caption means the application failed or was already made,
        # so go back to the product list
        if apply_button.text!='申请试用':
            browser.switch_to.window(browser.window_handles[0])
            return False
        apply_button.click()
        # Wait for the follow-the-shop dialog and confirm it; when no follow is
        # required this wait may raise a timeout
        follow_button = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, follow_selector))
        )
        time.sleep(1)
        follow_button.click()
        time.sleep(2)
        browser.switch_to.window(browser.window_handles[0])
        return True
    except TimeoutException:
        # on a timeout simply return to the product list
        browser.switch_to.window(browser.window_handles[0])
        return True
def get_try(page):
    """Apply for the eligible trial products listed on one page.

    Products already applied for, products rejected by check_name() and
    products cheaper than the configured price limit are skipped.  The
    function returns early once the global application quota is reached.

    :param page: 1-based number of the listing page.
    """
    global total_num_of_products_cur
    url='https://try.jd.com/activity/getActivityList'+'?page='+str(page)
    browser.get(url)
    time.sleep(2)
    # Parse the page with PyQuery; products that were already applied for carry
    # the extra class "applied", so removing them skips those products
    doc = pq(browser.page_source)
    doc('.applied').remove()
    items = list(doc('.root61 .container .w .goods-list .items .con .clearfix .item').items())
    # 'price_limit' may be absent from the settings; a missing key means
    # "no lower bound" instead of a KeyError
    price_limit = float(SET.get('price_limit', 0))
    for item in items:
        # product title, used for the keyword filter
        title = item('.p-name').text()
        if not check_name(title):
            continue
        try:
            price = float(item('.p-price').text()[6:])
        except ValueError:
            # no numeric price on the page: treat it as 0 instead of crashing
            price = 0.0
        if price < price_limit:
            continue
        try_url = 'https:'+item('.link').attr('href')
        print('价格: ',price)
        print(title)
        time.sleep(1)
        if do_try(try_url):
            total_num_of_products_cur +=1
            print("申请成功")
        else:
            print("申请失败")
        print('')
        # stop once the requested number of products has been reached
        if total_num_of_products_cur >= total_num_of_products:
            return
def Control_try(total_page):
    """Open a second tab and process listing pages 1..total_page.

    Stops as soon as the global count of applied products reaches the limit.

    :param total_page: number of listing pages to process.
    """
    browser.execute_script('window.open()')
    browser.switch_to.window(browser.window_handles[0])
    last_page = total_page+1
    for page_no in range(1,last_page):
        print('开始申请第'+str(page_no)+'页')
        get_try(page_no)
        quota_reached = total_num_of_products_cur >= total_num_of_products
        if quota_reached:
            return
        print('第'+str(page_no)+'页申请完成')
#成功登录后将browser_for_login的cookies取出放到无头browser中即可
def login():
    """Wait for a manual login in the visible browser, then copy its cookies.

    The visible browser is polled every two seconds until its URL is the
    JD home page; its cookies are then added to the headless browser.
    """
    browser_for_login.get('https://passport.jd.com/new/login.aspx')
    while True:
        if browser_for_login.current_url == 'https://www.jd.com/':
            break
        time.sleep(2)
    login_cookies = browser_for_login.get_cookies()
    browser_for_login.close()
    browser.get('https://www.jd.com')
    for entry in login_cookies:
        browser.add_cookie(entry)
    browser.get('https://www.jd.com')
def auto_showdown():
    """Shut the computer down after five seconds when SET['auto_shutdown'] is True."""
    if SET['auto_shutdown'] != True:
        return
    print('\n5秒后将自动关机')
    time.sleep(5)
    os.system('shutdown -s -t 1')
def deal_file():
    """Load the keyword lists used by check_name().

    ``choice.txt`` is read into the global ``choice_list`` when
    ``SET['choice']`` is True and ``ban.txt`` into ``ban_list`` when
    ``SET['ban']`` is True.
    """
    global choice_list
    global ban_list
    if SET['choice']==True:
        choice_list = _read_keywords('choice.txt')
    if SET['ban']==True:
        ban_list = _read_keywords('ban.txt')


def _read_keywords(path):
    """Return the non-empty keywords stored in *path*.

    Keywords are separated by space, ``|``, ``.``, ``,``, ``!`` or newline.
    Empty tokens (produced by consecutive separators or a trailing newline)
    are dropped, because an empty keyword is a substring of every title.
    """
    with open(path,'r') as f:
        return [word for word in re.split('[ |.|,|!|\n]',f.read()) if word]
def check_name(title):
    """Decide from its title whether a product should be applied for.

    A title passes when it contains at least one keyword of ``choice_list``
    (or that list has no keywords) and none of the keywords of ``ban_list``.
    Empty keywords are ignored: an empty string is contained in every title,
    so a stray empty entry would otherwise select or ban every product.

    :param title: product title.
    :return: True when the product passes both filters.
    """
    wanted = [word for word in choice_list if word]
    if wanted and not any(word in title for word in wanted):
        return False
    return not any(word in title for word in ban_list if word)
# Script entry point: load the keyword files, log in, apply, then clean up.
if __name__ == '__main__':
    deal_file()
    login()
    # apply on the first SET['total_num_of_page'] pages
    Control_try(SET['total_num_of_page'])
    browser.close()
    print('申请完成')
    auto_showdown()
================================================
FILE: jd_login/Method_First/ban.txt
================================================
Ȥ ˬ
================================================
FILE: jd_login/Method_First/choice.txt
================================================
ž ȳ Ǯ
================================================
FILE: jd_login/Method_First/config.py
================================================
#coding:utf-8
settings = {
    'auto_shutdown':False,  # shut the computer down when finished; default False
    'total_products':300,  # upper bound on the number of products to apply for; default 300
    'total_num_of_page':50,  # apply on the first total_num_of_page listing pages
    'price_limit':0,  # products cheaper than this are skipped; 0 disables the filter
    'choice':False,  # select products by name; when True, create choice.txt
                     # and put the wanted product names in it. Default False
    'ban':False  # filter products out by name; when True, create ban.txt and put
                 # the unwanted product names in it (separate names with , . !
                 # a space or a newline). Default False
}
================================================
FILE: jd_login/Method_Second/Config.py
================================================
"""
Config.py
配置文件
"""
settings = {
    # maximum number of applications per day
    'maxApplyNum': 300,
    # trial categories
    # 737 home appliances, 652 phones & digital, 670 computers & office,
    # 1620 home & furnishing, 1315 clothing, shoes & bags, 12218 fresh food,
    # 5025 watches & luxury, 15901 household cleaning, 1320 food & drink
    'cids': [
        '737', '652', '670', '1620', '1315',
        '12218', '5025', '15901', '1320',
    ],
    # lower price bound for a product to be applied for, in yuan
    'goodPrice': 30,
    # longest wait for a browser button, in seconds
    'waitTime': 10,
    # shut the computer down after the run; True means shut down
    'shutdown': False,
}
================================================
FILE: jd_login/Method_Second/Truekeyword.txt
================================================
/ 表带
手表 手机 华为 huawei mate vivo oppo 小米 苹果 apple MacBook 电脑 笔记本 ipad/ 套 膜 钢化 全包 壳 支架 防水袋
自拍杆 三脚架 内存卡 /
/ 流量卡 手机卡 不限速 上网卡 日租卡 无限流量 0月租 纯流量 电信号码 移动号码 联通号码
/ 苹果皮 智能机器人 手机电池 机器人盒子 儿童麦克风
/ 腕带 充电器底座 专用电插排
/ 补光灯 手机声卡 有线话筒 美颜灯
/ 游戏手柄 吃鸡神器 手机散热器 吃鸡按键 吃鸡辅助 手游充电线 走位神器
单反 微单 相机 / 清洁棒 除灰 手柄
苹果 MacBook / 专用键盘布
/ 运动臂包 壁虎支架
/ 手链 项链 耳钉 耳环 珠宝 耳坠 吊坠 平安扣 菩提 手串 佛珠 戒指 手镯 挂件 文玩 镯子 骨链 脚链 尾戒 弥勒佛 档位珠 转运珠 木料 阿梵尼 束发带 领针 胸针 银饰 金饰
/ 头部按摩器 头部按摩仪 脑部按摩爪 肩颈按摩器 电动按摩捶 洁面仪 腰部按摩器 全身按摩垫 颈部按摩器 颈椎腰按摩器
/ 足疗机
/ 离子精华导入仪 美容仪 补水仪 蒸脸器 眼部按摩仪 点歌机 电陶炉专用清洁刮刀 颈部护颈按摩仪 洗脸仪 洁面仪 颈椎按摩器 脖子肩颈仪 按摩护颈仪
/ 电视架 电视挂架 电视机支架 电视支架 外机架 室外空调支架 托架配件 指环
/ 收款播放器 收钱提示音响
/ 丰胸 乳房 下垂 胸部 乳腺 美胸 文胸
/ 电热护颈 颈部热敷
/ 冲牙器 水牙线 洗牙器 洁牙机
/ 滤芯 滤网 过滤网 空调滤
/ 洗衣机罩
/ 吸顶喇叭 定压喇叭 时序器
/ 时序器 转换插头 切换器 监控摄像头 监控电源 电源适配器
/ 财务软件 用友 管理软件 服务手册
/ 硒鼓 粉盒 粉仓 感光鼓 文档保护页 底片夹 扫描配件 幻灯片片夹 墨水 搓纸轮 分页器 碳粉 墨粉
/ 方块机 显示器支架 电脑架
/ 键盘膜 卡套挂绳 桌牌 党员牌 口取纸 分类纸
/ 机顶盒 挂历 台历
/ 水族胶 莫斯胶 啫喱胶 珊瑚胶 水草胶
/ 拼装模型 海绵胶带 标签纸 贴纸
车 / 贴 蜡划 车痕 修复液 车漆 钥匙包 流氓灯 大灯 手机架 支架 方向盘套 车把套 密封 缝隙 封条 除积碳 保险杠 防撞 防擦 门锁 脚垫 烟灰缸 烟缸 遮挡 摆件 车饰 抛光 喷雾 镀膜 饰品 挂件 钥匙 后视镜 雨眉 洗车液
/ 雨刮
/ 精油 除臭 除味 香包 香囊 硅藻 除味剂 碳包 香水 添加剂 燃油宝 清洗剂 竹炭包 补胎液 活矿石
/ 马桶盖 毛巾架 浴巾架 喷头 浴室置物架 铰链 合页 西餐垫 挂钩 阀 水管 接头 衣钩 拉手 暖气管 软管
/ 打泡器 起泡器 口诀表 机油 汽油 循环泵 继电器 插座面板 墙壁电源 金属开孔器 接线端子 电线连接器 展架 kt板 海报架
/ 全包装修 全国承建 乡墅 装饰画 挂画 福字 防水涂料 补漏 缝剂 拉手 补漆 保护条 墙角 壁纸 墙纸 补墙漆 翻新漆 背景墙 地漏 落水器
/ 鞋 裤 背心 夹克 外套 羽绒服 护腰带 童装 鞋拔 耳勺 掏耳朵 爆炸盒子 手工相册 嵌甲 甲沟炎 雪地靴 男靴 女靴 睡衣 泳衣
/ 上饵器 老花镜 衬衫 打火机 帽 剑 眼镜 烟嘴 外套 针织衫 运动服 毛衣 打底衫 羊绒衫 T恤 靴 蕾丝手套 棉服 羽绒服
/ 暖宫贴 宫寒贴 痛经 暖宫 暖宝宝 暖贴 暖身贴 发热贴 被子固定器 修复颈椎 中药热疗 颈椎病
/ 空调套 空调罩 挂机罩
/ 十字绣 钻石绣 草坪 人造草皮 圆顶吊顶 充气泵 打气泵 轨道 滑道 阻尼缓冲 防雾剂 吸顶灯
/ 灭火器 预付款 防水插座 灯带 投光灯 厂房灯 广告牌 射灯 室外灯 院灯 路灯 灯笼 玄关灯 过道灯 走廊灯 吊灯
/ 仿古电话 点烟器 贴墙仪 火花塞
/ 面膜 唇膏 香盒
/ 弹弓 小拼图 牙套矫正器
/ 刮痧板 刮脸板 电子烟 镜框 营养师 减肥 减脂 烟油 烟液 烟雾 长筒袜 高筒袜 半截袜子 过膝袜 甲沟 童袜 棉衣 戒烟贴 平衡球 训练梯 敏捷梯 烟嘴
/ 信息素 棉袜 袜子 车包 荧光粉 发光粉末 夜光沙 领带夹 内衣 檀香 供香 挡风板 防风罩 风衣 永生花 迷彩雨披 高尔夫球 望远镜 连帽 烟盒 纸手表
/ 舌帽 字母灯 溜冰鞋 太阳花 饰唯美 太阳镜
/ 缓冲门 阻尼弹簧 门吸 地吸 松动剂 润滑剂 情趣灯 防撞胶粒 胶垫 防撞贴
/ 海参 闸蟹 阳澄湖 章鱼 八爪鱼 鱿鱼 海螺
/ 蛋挞皮 牙签机
/ 灭鼠药 杀鼠剂 蟑螂药 灭蟑螂 杀蟑螂 除蟑螂 捕鼠器 灭鼠器 驱鼠器 甲醛检测 检测甲醛 防静电喷雾 边夹 发卡 bb夹 发箍 儿童头饰 发饰
/ 除味盒 去味剂 除臭剂 除味剂 蔬果净 湿巾
恒源祥 /
/ 连衣裙 冬季衣服 毛呢大衣
/ 芡实茶 祛湿茶 花茶包 薏米茶 除湿气茶 薏仁茶 姜茶 枳椇子 润甘元 茶叶 菊花茶 水果茶 绿茶 蒙顶山茶 粉墨茶 向阳汤 玫瑰花茶 养生茶 荷叶茶 普洱茶 柑普茶 昆仑菊花 云雾茶 葛根茶 菊苣根茶 枸杞茶 葛根片 胖大海 金银花 罗汉果 金花砖茶 崖黑茶 黑茶 大红袍 武夷岩茶 柑普茶 荷叶茶 润喉茶
/ 餐厅用酒 原浆老酒 朗姆酒 鸡尾酒 烹饪酒 黄酒 花雕酒 梅子酒 果酒 低度酒 米酒 饭酒 甜酒 花酒 梅酒 女士酒 竹筒酒 百年井窖 塞罕坝酒 苦荞酒 白兰地 预调酒 秘酿 威士忌 浒魄酒 冰酒 相公寨酒 高粱酒 养生酒 老酒
/ 碱粉 碱面 碳酸钠
/ 成人益生菌 调理肠胃饮品
/ 胖子卫衣 加肥加大 宽松大码 情趣礼品 半身裙 纱裙 牙齿矫正器
/ 笔记本电脑充电器
================================================
FILE: jd_login/Method_Second/main.py
================================================
"""
京东试用自动申请程序,每天仅需执行一次即可
"""
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from pyquery import PyQuery as pq
import json
import os
import getpass
import base64
#载入自己编写的配置文件
from Config import settings
# Module-level globals
# Headless Chrome browser that performs the applications
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# do not print unimportant log messages
chrome_options.add_argument('log-level=2')
# The options are passed with `options=`; the `chrome_options=` keyword is
# deprecated in Selenium.
browser = webdriver.Chrome(options = chrome_options)
# longest wait for the headless browser
wait = WebDriverWait(browser, settings['waitTime'])
# Visible Chrome browser used for logging in
browser_login = webdriver.Chrome()
# longest wait for the login browser
wait_login = WebDriverWait(browser_login, settings['waitTime'])
def readCookies():
    """
    Load the cookies stored in cookies.json.

    :return: the decoded JSON content, or False when the file does not exist.
    """
    if not os.path.exists("cookies.json"):
        # no cookies file
        print("cookies文件不存在!")
        return False
    with open("cookies.json","r") as stored:
        return json.load(stored)
def writeCookies(cookies):
    """
    Serialise *cookies* as JSON into cookies.json, replacing any previous content.

    :param cookies: JSON-serialisable cookie data.
    """
    with open("cookies.json", "w") as target:
        json.dump(cookies, target)
def closeSW(iApplyNum):
    """
    Finish the run: save the cookies, quit the browser, log the number of
    applications and exit the program (optionally shutting the computer down).

    :param iApplyNum: number of applications to record in log.txt.
    """
    # wait 5 seconds
    time.sleep(5)
    # save the browser cookies to the cookies file, then close the browser
    writeCookies(browser.get_cookies())
    browser.quit()
    with open("log.txt", 'a') as log:
        # record the time and the number of applications
        log.write( time.ctime() + " 申请数量:" + str(iApplyNum) + '\n')
    # shut the computer down when configured
    shutdown_requested = settings['shutdown'] == True
    if shutdown_requested:
        os.system("shutdown -s -f")
    # leave the program
    exit()
def genekeys():
    """Parse Truekeyword.txt into the keyword rules used by goodJudge().

    Every non-blank line has the form ``<include words> / <exclude words>``
    with words separated by single spaces; either side may be empty.

    The file is closed after reading, a final line without a trailing
    newline keeps its last character, and a line without ``/`` is treated
    as having no exclude words instead of raising IndexError.

    :return: list of ``[include_words, exclude_words]`` pairs (lists of str).
    """
    keys = []
    with open("Truekeyword.txt", 'r' ,encoding='UTF-8' ) as f:
        for line in f:
            line = line.rstrip('\n')
            if line.strip() == '':
                continue
            parts = line.split('/')
            include = parts[0]
            exclude = parts[1] if len(parts) > 1 else ''
            keys.append([_split_words(include), _split_words(exclude)])
    return keys


def _split_words(text):
    """Split one side of a keyword rule on spaces; blank text gives an empty list."""
    text = text.strip()
    return text.split(' ') if text else []
def goodJudge(goodName, goodPrice, keys):
    """
    Decide from the product name and price whether to apply for the product.

    A rule applies to a product when its include list is empty or one of its
    include words occurs in the name; an applying rule rejects the product
    when one of its exclude words occurs in the name.

    :param goodName: product name.
    :param goodPrice: product price; below settings['goodPrice'] is rejected.
    :param keys: rules as returned by genekeys().
    :return: True when the product should be applied for.
    """
    if goodPrice < settings['goodPrice']:
        return False
    for key in keys:
        rule_applies = key[0] == [] or any(
            tk != '' and tk in goodName for tk in key[0]
        )
        if rule_applies and any(tk != '' and tk in goodName for tk in key[1]):
            return False
    return True
def do_try(url):
    """
    Apply for the trial of a single product in the second browser tab.

    The page source is no longer fetched and parsed here: the parsed
    document was never used.

    :param url: address of the product's trial page.
    :return: True when both the apply button and the follow button were
             clicked; False when the button does not read '申请试用' or a
             wait timed out.
    """
    apply_selector = '#product-intro > div.info > div.try-info.clearfix.bigImg > div.info-detail.chosen > div > div.btn-wrap > a'
    follow_selector = 'body > div.ui-dialog > div.ui-dialog-content > div > div > div.btn > a.y'
    try:
        # switch to tab 1 and open the product page
        browser.switch_to.window(browser.window_handles[1])
        browser.get(url)
        # pause 2 seconds
        time.sleep(2)
        # the apply-for-trial button
        button = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, apply_selector))
        )
        # any other caption: skip this product
        if button.text!='申请试用':
            return False
        button.click()
        # the follow-and-apply button of the dialog
        button2 = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, follow_selector))
        )
        time.sleep(1)
        button2.click()
        # the application for this product is complete
        time.sleep(2)
        return True
    except TimeoutException:
        # a wait timed out: give up on this product
        return False
def get_try(cid, iApplyNum, maxApplyNum, keys):
# Walk the trial listing pages of category `cid` and apply for suitable items.
#   cid         - category id, appended to the listing URL as a string
#   iApplyNum   - number of successful applications so far
#   maxApplyNum - application limit; closeSW() is called once it is reached
#   keys        - keyword rules handed to goodJudge()
# Returns the updated application count.
browser.get('https://try.jd.com/activity/getActivityList?page=1&cids='+cid)
# Grab the HTML source of the listing page
html = browser.page_source
# Parse it with pyquery
doc = pq(html)
# CSS selector: find the pager element that holds the total page count
pageitem = doc('.root61 .container .w .p-wrap .p-skip').items()
# Crude workaround for namespace handling (original author's note):
# drop the first two characters and keep the text up to the first newline
pagestr = list(pageitem)[0].text()
pagestr = pagestr[2:]
pagestr = pagestr[0:pagestr.find('\n')]
pagenum = int(pagestr)
# NOTE(review): prints pagenum+1 while the loop below visits range(pagenum) pages - confirm which is intended
print("商品总页数:" + str(pagenum+1) )
for i in range(pagenum):
if i >=1:
# Move on to the next listing page (page 1 is already loaded)
browser.get('https://try.jd.com/activity/getActivityList?page='+str(i+1)+'&cids='+cid)
# Pause 2 seconds
time.sleep(2)
html = browser.page_source
doc = pq(html)
# CSS selector: find the product entries on this page
items = doc('.root61 .container .w .goods-list .items .con .clearfix .item').items()
# Turn the iterator into a list
items=list(items)
# Handle every product
for item in items:
# Button text '已申请' marks an item that was already applied for
if item('.try-item .try-button').text() == '已申请':
# Skip items that were already applied for
continue
# Product name
itemname = item('.try-item .p-name').text()
# Product price text
itempricetext = item('.try-item .p-price').text()
# Trim the surrounding text
# No '¥' in the text means no price is shown; treat the price as 0
if itempricetext.find('¥') == -1:
itemprice = 0
else:
itempricetext = itempricetext[itempricetext.find('¥')+1:]
# Numeric price, passed to goodJudge as goodPrice
itemprice = float(itempricetext)
if goodJudge(itemname, itemprice, keys) == False:
# Rejected by price or keyword rules
continue
itemurl = item('.try-item .link')
# Apply for this product
if do_try('https:'+itemurl.attr('href')) == True:
print("申请成功 " +str(itemprice) + " " + itemname)
iApplyNum = iApplyNum + 1
# Pause (the code sleeps 2 seconds; the original comment said 3)
time.sleep(2)
browser.switch_to.window(browser.window_handles[0])
if iApplyNum >= maxApplyNum:
print("已经成功申请" + str(maxApplyNum) + "件商品 申请结束")
closeSW(iApplyNum)
time.sleep(2)
print(cid+'类:第'+str(i+1)+'页申请完成')
return iApplyNum
def trycid():
    """
    Run the application pass over every configured category.

    Reads the keyword rules via genekeys() and the limit and category ids
    from settings, opens the trial site plus a second browser tab, then
    calls get_try() once per category.

    Returns the number of successful applications.
    """
    keywordRules = genekeys()
    # JD limits the number of applications per day; the cap is configured.
    dailyLimit = settings['maxApplyNum']
    # Trial categories to visit.
    categories = settings['cids']
    applied = 0
    browser.get('https://try.jd.com/')
    browser.get('https://try.jd.com/activity/getActivityList')
    # Open a new tab through JavaScript, then keep working in the first tab.
    browser.execute_script('window.open()')
    browser.switch_to.window(browser.window_handles[0])
    for category in categories:
        applied = get_try(category, applied, dailyLimit, keywordRules)
    return applied
def _load_saved_login(path="login.txt"):
    """Return (username, password) decoded from the two base64 lines in *path*."""
    with open(path) as f:
        stored = f.read().split('\n')
    username = base64.b64decode(stored[0].encode()).decode()
    password = base64.b64decode(stored[1].encode()).decode()
    return username, password


def _save_login(username, password, path="login.txt"):
    """Write username and password to *path*, one base64-encoded value per line."""
    # NOTE(review): base64 is an encoding, not encryption. Anyone who can
    # read this file can recover the password.
    with open(path, "w") as f:
        f.write(base64.b64encode(username.encode()).decode() + "\n")
        f.write(base64.b64encode(password.encode()).decode())


def login():
    """
    Log in to JD in the visible browser and hand the session to `browser`.

    Steps: load any cookies returned by readCookies(), open the login page,
    fill in the credentials (from login.txt when it exists, otherwise
    prompted for), wait until the logged-in page element appears, store the
    credentials on first success, copy all cookies into `browser` and quit
    the login browser.
    """
    # The site must be visited once before cookies can be added.
    browser_login.get('https://jd.com')
    cookies = readCookies()
    if cookies:
        for cookie in cookies:
            browser_login.add_cookie(cookie)
    # Go straight to the login page.
    browser_login.get('https://passport.jd.com/login.aspx')
    # Switch to the account/password tab. find_element(By...) is used
    # because the find_element(s)_by_* helpers were removed in Selenium 4.
    button_login = browser_login.find_element(
        By.CSS_SELECTOR,
        '#content > div.login-wrap > div.w > div > div.login-tab.login-tab-r > a')
    button_login.click()
    time.sleep(2)
    # Credentials: reuse the stored ones, otherwise ask the user.
    has_saved_login = os.path.exists("login.txt")
    if has_saved_login:
        username, password = _load_saved_login()
    else:
        username = input("请输入京东用户名:")
        password = getpass.getpass("请输入京东密码(输入不会显示在屏幕上):")
    browser_login.find_element(By.NAME, 'loginname').send_keys(username)
    browser_login.find_element(By.NAME, 'nloginpwd').send_keys(password)
    button_logOK = browser_login.find_element(By.ID, 'loginsubmit')
    time.sleep(2)
    button_logOK.click()
    # Keep waiting until the logged-in element is present. Each timeout
    # simply starts another wait, so this blocks until login completes.
    while True:
        try:
            wait_login.until(
                EC.presence_of_element_located((By.CSS_SELECTOR,
                                                '#ttbar-login > div.dt.cw-icon > a'))
            )
            break
        except TimeoutException:
            continue
    print('登陆成功!')
    time.sleep(2)
    # First successful login: remember the credentials for the next run.
    if not has_saved_login:
        _save_login(username, password)
    # Move the session to the working browser: cookies is a list of dicts.
    cookies = browser_login.get_cookies()
    browser.get('https://www.jd.com')
    for cookie in cookies:
        browser.add_cookie(cookie)
    browser_login.quit()
if __name__ == '__main__':
# Log in first
login()
# Run the applications; iApplyNum is the number of successful ones
iApplyNum = trycid()
# Finished: save state, log the count and exit
closeSW(iApplyNum)
================================================
FILE: jd_login/README.md
================================================
#### Jd Spider.
================================================
FILE: jd_login/login_by_selenium.py
================================================
# tested on ubuntu15.04
import time
from selenium import webdriver
login_url = 'https://passport.jd.com/new/login.aspx'

driver = webdriver.PhantomJS()
try:
    driver.get(login_url)
    # Give the login page time to load.
    time.sleep(5)

    account = driver.find_element_by_id('loginname')
    password = driver.find_element_by_id('nloginpwd')
    submit = driver.find_element_by_id('loginsubmit')

    account.clear()
    password.clear()
    # Placeholders: replace with real credentials before running.
    account.send_keys('yourname')
    password.send_keys('yourpassword')

    submit.click()
    # Wait for the login request to finish.
    time.sleep(5)

    # Session cookies of the logged-in browser; this script does not save them.
    cookies = driver.get_cookies()
finally:
    # quit() ends the WebDriver session and its browser process, also when a
    # step above fails. close() would only close the current window.
    driver.quit()
================================================
FILE: lagou/Lagou.py
================================================
# -*- coding:utf-8 -*-
import re
import os
import time
import json
import sys
import subprocess
import requests
import hashlib
from bs4 import BeautifulSoup
"""
info:
author:CriseLYJ
github:https://github.com/CriseLYJ/
update_time:2019-3-6
"""
class Lagou_login(object):
    """Log in to lagou.com through its passport endpoints with a requests session."""

    def __init__(self):
        self.session = requests.session()
        # The captcha image is saved next to this script.
        self.CaptchaImagePath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'captcha.jpg'
        self.HEADERS = {'Referer': 'https://passport.lagou.com/login/login.html',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36'
                                      ' (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36'
                                      ' Core/1.53.4882.400 QQBrowser/9.7.13059.400',
                        'X-Requested-With': 'XMLHttpRequest'}

    def encryptPwd(self, passwd):
        """Return the hashed password: md5, wrapped in a fixed salt, md5 again."""
        passwd = hashlib.md5(passwd.encode('utf-8')).hexdigest()
        # 'veenike' is a hard-coded value found in the site's JS file.
        passwd = 'veenike' + passwd + 'veenike'
        passwd = hashlib.md5(passwd.encode('utf-8')).hexdigest()
        return passwd

    def getTokenCode(self):
        """
        Fetch the login page and return its anti-forgery token and code.

        Returns a dict with the keys 'X-Anit-Forge-Token' and
        'X-Anit-Forge-Code', ready to be merged into the request headers.
        """
        login_page = 'https://passport.lagou.com/login/login.html'
        data = self.session.get(login_page, headers=self.HEADERS)
        soup = BeautifulSoup(data.content, "lxml", from_encoding='utf-8')
        # The login page carries the values in a script element, e.g.:
        #   window.X_Anti_Forge_Token = 'dde4db4a-888e-47ca-8277-0c6da6a8fc19';
        #   window.X_Anti_Forge_Code = '61142241';
        # The code below reads them from the second script element, taking
        # the token from its second line and the code from its third.
        anti_token = {'X-Anit-Forge-Token': 'None',
                      'X-Anit-Forge-Code': '0'}
        anti = soup.findAll('script')[1].getText().splitlines()
        anti = [str(x) for x in anti]
        anti_token['X-Anit-Forge-Token'] = re.findall(r'= \'(.+?)\'', anti[1])[0]
        anti_token['X-Anit-Forge-Code'] = re.findall(r'= \'(.+?)\'', anti[2])[0]
        return anti_token

    def getCaptcha(self):
        """Download the captcha image, open it for the user and return what they type."""
        captchaImgUrl = 'https://passport.lagou.com/vcode/create?from=register&refresh=%s' % time.time()
        # Download first, then write inside a with-block: a failed request no
        # longer leaves an empty file behind and the handle is always closed.
        image = self.session.get(captchaImgUrl, headers=self.HEADERS).content
        with open(self.CaptchaImagePath, 'wb') as f:
            f.write(image)
        # Show the image with the platform's default viewer.
        if sys.platform.find('darwin') >= 0:
            subprocess.call(['open', self.CaptchaImagePath])
        elif sys.platform.find('linux') >= 0:
            subprocess.call(['xdg-open', self.CaptchaImagePath])
        else:
            os.startfile(self.CaptchaImagePath)
        captcha = input("请输入当前地址(% s)的验证码: " % self.CaptchaImagePath)
        print('你输入的验证码是:% s' % captcha)
        return captcha

    def login(self, user, passwd, captchaData=None, token_code=None):
        """
        Post the login form.

        user:        phone number or e-mail address.
        passwd:      password already hashed with encryptPwd().
        captchaData: captcha text, when the server asked for one.
        token_code:  anti-forgery headers; fetched with getTokenCode() when None.

        Returns the raw response body on success (state 1) and False on
        failure. When the server answers with state 10010, a captcha is
        requested from the user and the login is retried with the
        submitCode/submitToken from that response.
        """
        postData = {'isValidate': 'true',
                    'password': passwd,
                    # Send the captcha only when one was supplied.
                    'request_form_verifyCode': (captchaData if captchaData is not None else ''),
                    'submit': '',
                    'username': user
                    }
        login_url = 'https://passport.lagou.com/login/login.json'
        # Add the anti-forgery token to a copy of the default headers.
        login_headers = self.HEADERS.copy()
        token_code = self.getTokenCode() if token_code is None else token_code
        login_headers.update(token_code)
        # Example failure body:
        # {"content":{"rows":[]},"message":"该帐号不存在或密码错误,请重新输入","state":400}
        response = self.session.post(login_url, data=postData, headers=login_headers)
        data = json.loads(response.content.decode('utf-8'))
        if data['state'] == 1:
            return response.content
        elif data['state'] == 10010:
            print(data['message'])
            captchaData = self.getCaptcha()
            token_code = {'X-Anit-Forge-Code': data['submitCode'], 'X-Anit-Forge-Token': data['submitToken']}
            return self.login(user, passwd, captchaData, token_code)
        else:
            print(data['message'])
            return False
if __name__ == "__main__":
    # Local import: only this interactive entry point needs it.
    import getpass

    username = input("请输入你的手机号或者邮箱\n >>>:")
    # getpass keeps the typed password from being echoed on the terminal.
    passwd = getpass.getpass("请输入你的密码\n >>>:")
    lg = Lagou_login()
    # The site expects the hashed password, not the plain text.
    passwd = lg.encryptPwd(passwd)
    data = lg.login(username, passwd)
    if data:
        print(data)
        print('登录成功')
    else:
        print('登录不成功')
================================================
FILE: liepin/README.md
================================================
# scrapy_liepin
scrapy爬猎聘,通过公司名搜索公司职位
================================================
FILE: liepin/liepinSpd/liepinSpd/__init__.py
================================================
================================================
FILE: liepin/liepinSpd/liepinSpd/dbhelper.py
================================================
import pymysql
from scrapy.utils.project import get_project_settings#引入settings配置
class DBHelper():
    """
    Small pymysql helper configured from the Scrapy project settings.

    Every operation opens its own connection, runs one statement and closes
    the cursor and connection again, also when the statement fails.
    """

    def __init__(self):
        # Connection parameters come from the MYSQL_* project settings.
        self.settings = get_project_settings()
        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']

    def connectMysql(self):
        """Return a connection to the server without selecting a database."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               charset='utf8')
        return conn

    def connectDatabase(self):
        """Return a connection with the configured database selected."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               db=self.db,
                               charset='utf8')
        return conn

    def _run(self, conn, sql, params=None, commit=False):
        """Execute one statement on *conn*; always close cursor and connection."""
        try:
            cur = conn.cursor()
            try:
                if params is None:
                    cur.execute(sql)
                else:
                    cur.execute(sql, params)
                if commit:
                    conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()

    def createDatabase(self):
        """Create the configured database when it does not exist yet."""
        # The database name comes from the project settings, not user input.
        self._run(self.connectMysql(), "create database if not exists " + self.db)

    def createTable(self, sql):
        """Run the given CREATE TABLE statement in the configured database."""
        self._run(self.connectDatabase(), sql)

    def insert(self, sql, *params):
        """Run an INSERT with *params* bound to its placeholders, then commit."""
        self._run(self.connectDatabase(), sql, params, commit=True)

    def update(self, sql, *params):
        """Run an UPDATE with *params* bound to its placeholders, then commit."""
        self._run(self.connectDatabase(), sql, params, commit=True)

    def delete(self, sql, *params):
        """Run a DELETE with *params* bound to its placeholders, then commit."""
        self._run(self.connectDatabase(), sql, params, commit=True)
#测试数据库操作
class TestDBHelper():
    """Manual smoke tests that run DBHelper operations against the configured server."""

    def __init__(self):
        self.dbHelper = DBHelper()

    def testCreateDatebase(self):
        """Create the configured database."""
        self.dbHelper.createDatabase()

    def testCreateTable(self):
        """Create the scratch table used by the other tests."""
        sql = "create table testtable(id int primary key auto_increment,name varchar(50),url varchar(200))"
        self.dbHelper.createTable(sql)

    def testInsert(self):
        """Insert one row into testtable."""
        sql = "insert into testtable(name,url) values(%s,%s)"
        params = ("test", "test")
        self.dbHelper.insert(sql, *params)

    def testUpdate(self):
        """Update the row with id 1."""
        sql = "update testtable set name=%s,url=%s where id=%s"
        params = ("update", "update", "1")
        self.dbHelper.update(sql, *params)

    def testDelete(self):
        """Delete the row with id 1."""
        sql = "delete from testtable where id=%s"
        # One-element tuple: the bare ("1") is a plain string and would be
        # unpacked character by character for any longer id.
        params = ("1",)
        self.dbHelper.delete(sql, *params)
# Manual entry point: builds the helper; un-comment one of the calls below
# to run that operation against the configured MySQL server.
if __name__=="__main__":
testDBHelper=TestDBHelper()
#testDBHelper.testCreateDatebase() #
#testDBHelper.testCreateTable() #
#testDBHelper.testInsert() #
#testDBHelper.testUpdate() #
#testDBHelper.testDelete() #
================================================
FILE: liepin/liepinSpd/liepinSpd/items.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# Scraped record for one company. The field names match the columns that
# LiepinspdPipeline inserts into the company_base_info table.
class LiepinspdItem(scrapy.Item):
# define the fields for your item here like:
as_of_date = scrapy.Field()
ticker = scrapy.Field()
company_name = scrapy.Field()
stage = scrapy.Field()
size = scrapy.Field()
city = scrapy.Field()
industry = scrapy.Field()
comp_clearfix = scrapy.Field()
# converted with float() by the pipeline
rate_num = scrapy.Field()
# converted with int() by the pipeline
job_count = scrapy.Field()
# converted with float() by the pipeline
registered_capital = scrapy.Field()
spider_time = scrapy.Field()
origin_site = scrapy.Field()
================================================
FILE: liepin/liepinSpd/liepinSpd/middlewares.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import scrapy
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
class LiepinspdSpiderMiddleware(object):
    """
    Pass-through spider middleware.

    Every hook forwards its input unchanged; the only action taken is a log
    line when a spider is opened. Hooks that are not defined are treated by
    Scrapy as if the middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each Request, dict or Item the spider produced, unchanged."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Leave exceptions raised by the spider or by process_spider_input() unhandled."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged (requests only, no items)."""
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
class LiepinspdDownloaderMiddleware(object):
    """
    Pass-through downloader middleware.

    Requests, responses and exceptions are handed on untouched; the only
    action taken is a log line when a spider is opened. Hooks that are not
    defined are treated by Scrapy as if the middleware did not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """
        Let the request continue through the downloader chain.

        A hook of this kind may return None (continue), a Response, a
        Request, or raise IgnoreRequest; this one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """
        Hand the downloaded response on unchanged.

        A hook of this kind may return a Response, a Request, or raise
        IgnoreRequest; this one always returns the response it was given.
        """
        return response

    def process_exception(self, request, exception, spider):
        """
        Leave download exceptions to the rest of the chain.

        Returning None continues exception processing; returning a Response
        or Request would stop the process_exception() chain.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """
    Set a random User-Agent header on every outgoing request.

    The candidate strings are taken from the USER_AGENTS project setting.
    """

    def __init__(self, user_agent):
        # Sequence of candidate User-Agent strings, stored as given.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's USER_AGENTS setting."""
        candidates = crawler.settings.get('USER_AGENTS')
        return cls(user_agent=candidates)

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random candidate."""
        request.headers['User-Agent'] = random.choice(self.user_agent)
================================================
FILE: liepin/liepinSpd/liepinSpd/pipelines.py
================================================
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors
# class LiepinspdPipeline(object):
# def __init__(self, dbpool):
# self.dbpool = dbpool
#
# @classmethod
# def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
# """
# 数据库建立连接
# :param settings: 配置参数
# :return: 实例化参数
# """
#
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# port = settings['MYSQL_PORT'],
# cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
# )
# # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # 返回实例化参数
# return cls(dbpool)
#
# def process_item(self, item, spider):
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
#
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作,并不需要commit,twisted会自动commit
#
# insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industy,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insert_sql,
# (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['stage']),
# str(item['size']), str(item['city']), str(item['industy']), str(item['comp_clearfix']),
# int(item['job_count']), float(item['rate_num']), float(item['registered_capital']),item['spider_time'],item['origin_site'],))
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
import pymysql
class LiepinspdPipeline(object):
    """
    Synchronous item pipeline: inserts every item into the
    company_base_info table over one pymysql connection that stays open
    for the lifetime of the spider.
    """

    def __init__(self):
        # NOTE(review): host and credentials are hard-coded here and repeat
        # the MYSQL_* values in settings.py; they should not live in source.
        # Keyword arguments are used because newer PyMySQL releases no
        # longer accept positional connection arguments. charset='utf8' is
        # what the original author's note asked for when storing Chinese text.
        self.conn = pymysql.connect(host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
                                    user='cn_ainvest_db',
                                    password='cn_ainvest_sd3a1',
                                    database='special_data',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """
        Insert *item* as one row and commit.

        Returns the item so that any pipeline running after this one
        receives it. On a database error the transaction is rolled back and
        the error is re-raised.
        """
        insert_sql = """
            insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industry,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        values = (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['stage']),
                  str(item['size']), str(item['city']), str(item['industry']), str(item['comp_clearfix']),
                  int(item['job_count']), float(item['rate_num']), float(item['registered_capital']),
                  item['spider_time'], item['origin_site'],)
        try:
            self.cursor.execute(insert_sql, values)
            # Nothing is stored until the transaction is committed.
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared connection.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
================================================
FILE: liepin/liepinSpd/liepinSpd/settings.py
================================================
# -*- coding: utf-8 -*-
# Scrapy settings for liepinSpd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# NOTE(review): presumably the Liepin company ids the spider works through - confirm against the spider.
# NOTE(review): '7939262' is listed twice.
COMPANYLIST=['7894126', '7941798', '5464493', '8280653', '8657147', '5696000', '6918711', '8801813', '7909112', '929719', '8635277', '9208490', '9427534', '7873563', '869131', '1983198', '8521820', '8441886', '9425884', '8269623', '8143143', '8144649', '8571478', '8646314', '9086358', '8361354', '8090600', '9652027', '9662729', '8029798', '8024700', '9274661', '8614537', '1852098', '845611', '7910884', '1947829', '6657987', '8463020', '8130349', '8323671', '723421', '1573297', '9582057', '1866404', '1074696', '8586065', '4811624', '857922', '7975388', '7931578', '6615613', '8243943', '682357', '8916773', '1050201', '950043', '7939262', '1730543', '9469426', '7883086', '8628525', '7868218', '8096323', '7862738', '7023768', '8862767', '9538671', '7953390', '515361', '2104592', '993518', '8212985', '1766564', '892388', '8646248', '9857531', '1043007', '8042835', '8980779', '571837', '7862722', '7935093', '8130825', '9111311', '8051561', '9107424', '856576', '7862125', '7947928', '854827', '4209085', '859352', '7931740', '7939262', '548548', '7916182', '8354065', '9740398', '8155722', '2331894', '884195', '9651734', '8534019', '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625', '4450360', '540933', '4817469']
# Headers sent with every request unless overridden. The User-Agent given here
# is replaced per request by MyUserAgentMiddleware (see DOWNLOADER_MIDDLEWARES).
DEFAULT_REQUEST_HEADERS = {
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9'
}
BOT_NAME = 'liepinSpd'
# MySQL connection parameters, read by DBHelper through get_project_settings().
# NOTE(review): a real host name and credentials are committed in plain text
# here (and repeated in pipelines.py); consider loading them from the
# environment and rotating the password.
MYSQL_HOST = 'rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com'
MYSQL_DBNAME = 'special_data'
MYSQL_USER = 'cn_ainvest_db'
MYSQL_PASSWD = 'cn_ainvest_sd3a1'
MYSQL_PORT = 3306
SPIDER_MODULES = ['liepinSpd.spiders']
NEWSPIDER_MODULE = 'liepinSpd.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpd (+http://www.yourdomain.com)'
# Obey robots.txt rules (disabled for this project)
ROBOTSTXT_OBEY = False
# Pool of User-Agent strings; MyUserAgentMiddleware picks one at random for
# every request. Each entry must end with a comma: adjacent string literals
# without one are silently concatenated into a single bogus user agent.
USER_AGENTS = [
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
    "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle the crawl: delay of 3 (seconds, per the Scrapy settings reference)
# between consecutive requests to the same website.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'liepinSpd.middlewares.LiepinspdSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares: switch off Scrapy's built-in User-Agent middleware
# and register the project's MyUserAgentMiddleware in its place.
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd.middlewares.LiepinspdDownloaderMiddleware': 543,
    # The built-in middleware lives in ``scrapy.downloadermiddlewares`` (plural).
    # The previous singular spelling matched no registered middleware, so the
    # built-in one was never actually disabled.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpd.middlewares.MyUserAgentMiddleware': 400,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines enabled for the liepinSpd project, keyed by import path.
ITEM_PIPELINES = {'liepinSpd.pipelines.LiepinspdPipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
================================================
FILE: liepin/liepinSpd/liepinSpd/spiders/__init__.py
================================================
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
================================================
FILE: liepin/liepinSpd/liepinSpd/spiders/lpspider.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import re
from datetime import datetime
import pandas as pd
import time
from liepinSpd.items import LiepinspdItem
class LiepinSpdier(scrapy.Spider):
    """Scrape basic company information from liepin.com company pages.

    One start URL is built per distinct id in ``companylist``; each response
    is turned into a single ``LiepinspdItem`` by :meth:`parse`.
    """

    name = 'liepin'
    companylist = ['7894126', '7941798', '5464493', '8280653', '8657147', '5696000', '6918711', '8801813', '7909112', '929719', '8635277', '9208490', '9427534', '7873563', '869131', '1983198', '8521820', '8441886', '9425884', '8269623', '8143143', '8144649', '8571478', '8646314', '9086358', '8361354', '8090600', '9652027', '9662729', '8029798', '8024700', '9274661', '8614537', '1852098', '845611', '7910884', '1947829', '6657987', '8463020', '8130349', '8323671', '723421', '1573297', '9582057', '1866404', '1074696', '8586065', '4811624', '857922', '7975388', '7931578', '6615613', '8243943', '682357', '8916773', '1050201', '950043', '7939262', '1730543', '9469426', '7883086', '8628525', '7868218', '8096323', '7862738', '7023768', '8862767', '9538671', '7953390', '515361', '2104592', '993518', '8212985', '1766564', '892388', '8646248', '9857531', '1043007', '8042835', '8980779', '571837', '7862722', '7935093', '8130825', '9111311', '8051561', '9107424', '856576', '7862125', '7947928', '854827', '4209085', '859352', '7931740', '7939262', '548548', '7916182', '8354065', '9740398', '8155722', '2331894', '884195', '9651734', '8534019', '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625', '4450360', '540933', '4817469']
    # ``companylist`` contains a repeated id and Scrapy does not de-duplicate
    # start URLs, so repeats are dropped here (order preserved) to avoid
    # scraping and storing the same company twice.
    start_urls = [
        f'https://www.liepin.com/company/{company}/'
        for company in dict.fromkeys(companylist)
    ]

    # GBK-encoded CSV providing the 股票简称 / 股票代码 columns used for ticker
    # matching. A raw string keeps the backslashes literal without relying on
    # invalid escape sequences.
    # NOTE(review): machine-specific absolute path; consider a project setting.
    ticker_csv_path = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Lazily populated cache of the CSV above (see _load_ticker_table).
    _ticker_table = None

    def _load_ticker_table(self):
        """Return the ticker lookup table, reading the CSV only on first use."""
        if self._ticker_table is None:
            self._ticker_table = pd.read_csv(self.ticker_csv_path, encoding='gbk')
        return self._ticker_table

    def parse(self, response):
        """Parse one company page and yield a populated ``LiepinspdItem``.

        A page lacking one of the expected elements makes the corresponding
        lookup raise (``IndexError`` for the XPath/tag lookups,
        ``AttributeError`` for the regex searches); no item is yielded then.
        The ``ticker`` field is left unset when no CSV row matches.
        """
        text = response.text

        # Basic company information.
        company_name = response.xpath('//div[@class="name-and-welfare"]//h1/text()')[0].extract()
        # The summary tags are read positionally: stage, size, city, industry.
        comp_sum_tag = response.xpath('//div[@class="comp-summary-tag"]/a/text()').extract()
        stage = comp_sum_tag[0]
        size = comp_sum_tag[1]
        city = comp_sum_tag[2]
        industry = comp_sum_tag[3]
        # Company tags: the extracted list is stored as its string representation.
        comp_clearfix = str(response.xpath('//ul[@class="comp-tag-list clearfix"]//span/text()').extract())
        # Resume handling rate: parsed as an integer and stored as a fraction.
        rate_num = response.xpath('//p[@class="rate-num"]//span/text()')[0].extract()
        rate_num = int(rate_num) / 100
        job_count = int(re.search(r'<small data-selector="total">. 共([0-9]+) 个', text).group(1))
        # Registered capital in 万元; 0.0 when the page does not state it that way.
        if '注册资本' in text and '万元人民币' in text:
            registered_capital = float(re.search(r'<li>注册资本:(.*?)万元人民币</li>', text).group(1))
        else:
            registered_capital = 0.0
        origin_site = re.search(r'"wapUrl":"(.*?)",', text).group(1)

        item = LiepinspdItem()
        # Ticker matching: a row matches when every character of its stock
        # short name (股票简称) occurs in the company name. Later matching rows
        # overwrite earlier ones.
        data = self._load_ticker_table()
        try:
            for short_name, ticker in zip(data['股票简称'].values, data['股票代码'].values):
                if all(char in company_name for char in short_name):
                    item['ticker'] = ticker
        except Exception:
            # Best effort: keep whatever was matched so far, but record the
            # failure with its traceback instead of hiding it.
            self.logger.exception('ticker匹配错误')

        now = datetime.now()
        item['as_of_date'] = now.strftime("%Y-%m-%d %H:%M:%S")
        item['company_name'] = company_name
        item['stage'] = stage
        item['size'] = size
        item['city'] = city
        item['industry'] = industry
        item['comp_clearfix'] = comp_clearfix
        item['rate_num'] = rate_num
        item['job_count'] = job_count
        item['registered_capital'] = registered_capital
        item['spider_time'] = now.date()
        item['origin_site'] = origin_site
        yield item
================================================
FILE: liepin/liepinSpd/run_liepin1.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 获取settings.py模块的设置
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from liepinSpd.spiders.lpspider import LiepinSpdier
# Load the Scrapy project settings (settings.py) and build a crawler process from them.
settings = get_project_settings()
process = CrawlerProcess(settings=settings)
# More than one spider class can be registered here.
process.crawl(LiepinSpdier)
# Start crawling; this call blocks until the crawl has finished.
process.start()
================================================
FILE: liepin/liepinSpd/scrapy.cfg
================================================
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = liepinSpd.settings
[deploy]
#url = http://localhost:6800/
project = liepinSpd
================================================
FILE: liepin/liepinSpd2/liepinSpd2/__init__.py
================================================
================================================
FILE: liepin/liepinSpd2/liepinSpd2/items.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
# Item carrying one scraped record of the liepinSpd2 project.
# Liepinspd2Pipeline reads every active field declared below and writes it to
# the same-named column of the ``job_info`` table.
class Liepinspd2Item(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
as_of_date = scrapy.Field()
ticker = scrapy.Field()
company_name = scrapy.Field()
job_name = scrapy.Field()
job_label = scrapy.Field()
salary = scrapy.Field()
city = scrapy.Field()
education = scrapy.Field()
work_year = scrapy.Field()
pub_time = scrapy.Field()
job_describe = scrapy.Field()
# origin_site is currently disabled; the active pipeline does not insert it.
# origin_site = scrapy.Field()
function = scrapy.Field()
spider_time = scrapy.Field()
================================================
FILE: liepin/liepinSpd2/liepinSpd2/middlewares.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import time
from scrapy import signals
import scrapy
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
from common.proxy_set import Proxies_set
class Liepinspd2SpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook is optional: when one is not defined, Scrapy behaves as if the
    middleware leaves the passed objects unmodified. The hooks defined here do
    exactly that.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response entering the spider.

        Should return None or raise an exception; this one always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook for the spider's results after it processed ``response``.

        Must produce an iterable of Request, dict or Item objects; every
        element of ``result`` is forwarded unchanged.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or ``process_spider_input``.

        Should return None or an iterable of Response, dict or Item objects;
        this one returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Hook for the spider's start requests (no response is associated).

        Must produce only requests, not items; each one is forwarded unchanged.
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class Liepinspd2DownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Every hook is optional: when one is not defined, Scrapy behaves as if the
    middleware leaves the passed objects unmodified. The hooks defined here do
    exactly that.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Must either return None (continue processing the request), return a
        Response or Request object, or raise IgnoreRequest (in which case the
        process_exception() methods of installed downloader middleware are
        called). This one always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for the response returned from the downloader.

        Must either return a Response object, return a Request object, or
        raise IgnoreRequest. This one returns ``response`` unchanged.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions raised by a download handler or process_request().

        Must either return None (continue processing the exception) or return
        a Response/Request object, which stops the process_exception() chain.
        This one returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a randomly chosen User-Agent per request.

    The candidate values come from the ``USER_AGENTS`` setting.
    """

    def __init__(self, user_agent):
        # ``user_agent`` is the pool of User-Agent strings to choose from.
        # A missing setting arrives as None; normalise it to an empty pool so
        # process_request can skip cleanly instead of crashing.
        self.user_agent = user_agent or []

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler's ``USER_AGENTS`` setting."""
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random pool entry.

        Returns None so the request continues through the remaining
        middlewares. With an empty pool the request is left untouched.
        """
        if not self.user_agent:
            return None
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
        # Logged at debug level rather than printed to stdout on every request.
        spider.logger.debug('User-Agent: %s', agent)
        return None
# class ProxyMiddleware(object):
# '''
# 设置Proxy
# '''
#
# def __init__(self, ip):
# self.ip = ip
#
# @classmethod
# def from_crawler(cls, crawler):
# return cls(ip=crawler.settings.get('PROXIES'))
#
# def process_request(self, request, spider):
# ip = random.choice(self.ip)
# request.meta['proxy'] = ip
# import random
# import scrapy
# from scrapy import log
# logger = logging.getLogger()
class ProxyMiddleware(object):
    """Downloader middleware that sends requests through a random proxy.

    Proxies are read from a text file, one per line, on every lookup.
    """

    # Text file listing one proxy per line. A raw string keeps the backslashes
    # literal without relying on invalid escape sequences.
    # NOTE(review): machine-specific absolute path; consider a project setting.
    proxy_file = r'G:\workspace\common\proxies.txt'
    # Maximum number of times a request is re-issued with a new proxy after a
    # non-200 response; prevents endless retries of permanently failing URLs.
    max_proxy_retries = 5

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy to the outgoing request."""
        proxy = self.get_random_proxy()
        spider.logger.debug('this is request ip:%s', proxy)
        request.meta['proxy'] = proxy

    def process_response(self, request, response, spider):
        """Retry non-200 responses through a different proxy.

        Returns ``response`` when its status is 200 or when the retry budget
        (``max_proxy_retries``) is used up; otherwise returns a copy of the
        request carrying a new proxy.
        """
        if response.status == 200:
            return response
        retries = request.meta.get('proxy_retry_times', 0)
        if retries >= self.max_proxy_retries:
            # Give up and hand the failing response on to the rest of the chain.
            return response
        proxy = self.get_random_proxy()
        spider.logger.debug('this is response ip:%s', proxy)
        # The URL has already been seen by the scheduler's duplicate filter, so
        # the retry must opt out of filtering or it would be dropped silently.
        retry_request = request.replace(dont_filter=True)
        retry_request.meta['proxy'] = proxy
        retry_request.meta['proxy_retry_times'] = retries + 1
        return retry_request

    def get_random_proxy(self):
        """Return a random non-blank line of ``proxy_file``.

        Re-reads the file once per second until it contains at least one
        proxy. NOTE(review): ``time.sleep`` blocks the calling thread while
        waiting.
        """
        while True:
            with open(self.proxy_file, 'r') as f:
                # Blank lines are skipped so an empty proxy is never returned.
                proxies = [line.strip() for line in f if line.strip()]
            if proxies:
                return random.choice(proxies)
            time.sleep(1)
================================================
FILE: liepin/liepinSpd2/liepinSpd2/pipelines.py
================================================
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors
import time
# class Liepinspd2Pipeline(object):
# def __init__(self, dbpool):
# self.dbpool = dbpool
#
# @classmethod
# def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
# """
# 数据库建立连接
# :param settings: 配置参数
# :return: 实例化参数
# """
#
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
# )
# # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # 返回实例化参数
# return cls(dbpool)
#
# def process_item(self, item, spider):
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
#
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作,并不需要commit,twisted会自动commit
# insert_sql = "insert into liepin_job(as_of_date,ticker,company_name,job_name,salary,city,education,work_year,pub_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insert_sql, (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['job_name']),
# str(item['salary']),str(item['city']),str(item['education']),str(item['work_year']),str(item['pub_time']),str(item['origin_site'])))
#
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
class Liepinspd2Pipeline(object):
    """Item pipeline that synchronously inserts each item into ``job_info``."""

    # The placeholders are filled, in order, from the item fields of the same name.
    insert_sql = (
        "insert into job_info(as_of_date,ticker,company_name,job_name,job_label,"
        "salary,city,education,work_year,pub_time,job_describe,spider_time,function) "
        "VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self, host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com', user='cn_ainvest_db',
                 password='cn_ainvest_sd3a1', database='special_data'):
        """Open the MySQL connection and create the cursor used for inserts.

        The defaults reproduce the previously hard-coded connection values so
        a no-argument construction behaves as before.
        SECURITY NOTE(review): real credentials should not be committed to the
        source; supply them from settings or the environment instead.
        """
        # Keyword arguments: newer PyMySQL releases no longer accept these
        # values positionally.
        # NOTE(review): the original comment advises adding charset='utf8'
        # when Chinese text is stored - confirm the connection charset.
        self.conn = pymysql.connect(host=host, user=user, password=password, database=database)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` into ``job_info``, commit, and return the item.

        Returning the item is required so later pipelines and the feed
        exporters still receive it. On failure the transaction is rolled back
        and the exception is re-raised.
        """
        # as_of_date and spider_time are passed as-is; everything else as str.
        params = (
            item['as_of_date'], str(item['ticker']), str(item['company_name']),
            str(item['job_name']), str(item['job_label']), str(item['salary']),
            str(item['city']), str(item['education']), str(item['work_year']),
            str(item['pub_time']), str(item['job_describe']), item['spider_time'],
            str(item['function']),
        )
        try:
            self.cursor.execute(self.insert_sql, params)
            # Without a commit the row is not persisted.
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
================================================
FILE: liepin/liepinSpd2/liepinSpd2/settings.py
================================================
# -*- coding: utf-8 -*-
# Scrapy settings for liepinSpd2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity and the package Scrapy searches for / creates spiders in.
BOT_NAME = 'liepinSpd2'
SPIDER_MODULES = ['liepinSpd2.spiders']
NEWSPIDER_MODULE = 'liepinSpd2.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpd2 (+http://www.yourdomain.com)'
# robots.txt rules are NOT obeyed by this project.
ROBOTSTXT_OBEY = False
# MySQL connection settings.
# NOTE(review): the active Liepinspd2Pipeline connects with its own hard-coded
# values and does not read these; only the commented-out asynchronous pipeline
# references MYSQL_* settings, and it looks up MYSQL_PASSWORD, not MYSQL_PASSWD.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'day0123'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123'
# Headers applied to requests by default. The User-Agent given here is
# overwritten per request by MyUserAgentMiddleware, which is enabled in
# DOWNLOADER_MIDDLEWARES.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
USER_AGENTS = [
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
"MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
"Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
"Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
"Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
"Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
"Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
"Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
"Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
]
# Static pool of proxies written as "host:port" strings.
# NOTE(review): nothing active in this project appears to read this setting -
# only the commented-out ProxyMiddleware variant looked up PROXIES, while the
# live ProxyMiddleware reads its proxies from a text file.
PROXIES=['27.25.194.221:9999', '113.121.147.180:9999', '111.177.170.22:9999', '116.209.53.31:9999', '111.177.189.211:9999', '111.177.188.174:9999', '111.177.181.31:9999', '211.152.33.24:48749', '125.123.142.33:9999', '125.126.192.172:9999', '58.55.206.201:9999', '58.55.202.19:9999', '171.80.174.156:9999', '183.148.133.134:9999', '111.177.172.24:9999', '124.94.199.7:9999', '121.61.1.161:9999', '58.55.192.211:9999', '183.148.133.148:9999', '59.62.164.224:9999', '111.177.165.34:9999', '111.177.178.183:9999', '121.61.25.243:9999', '27.25.196.242:9999', '117.91.232.146:9999', '111.177.178.107:9999', '111.177.188.158:9999', '111.177.179.103:9999', '111.177.181.81:9999', '183.148.133.158:9999', '110.52.235.25:9999', '111.177.187.63:9999', '111.177.172.18:9999', '111.177.178.175:9999', '116.209.54.63:9999', '183.148.140.20:9999', '116.209.52.115:9999', '117.90.2.139:9999', '111.177.177.212:9999', '119.102.189.134:9999', '119.102.188.140:9999', '119.102.188.156:9999', '121.61.2.196:9999', '49.86.180.90:9999', '219.139.141.112:9999', '111.177.189.26:9999', '111.177.191.179:9999', '122.192.174.244:9999', '111.177.167.67:9999', '125.123.139.143:9999', '125.126.210.203:9999', '125.123.140.229:9999', '171.41.84.191:9999', '111.177.185.8:9999', '110.52.235.27:9999', '123.163.117.72:9999', '111.181.35.17:9999', '113.121.146.190:9999', '111.176.29.245:9999', '116.209.58.5:9999', '111.177.175.161:9999', '113.122.169.65:9999', '121.61.2.8:808', '121.61.0.140:9999', '111.176.23.161:9999', '116.209.54.236:9999', '171.41.85.124:9999', '125.126.209.156:9999', '180.119.68.211:9999', '111.177.191.214:9999', '58.50.1.139:9999', '59.62.166.108:9999', '115.151.2.63:9999', '111.177.179.41:9999', '171.41.84.200:9999', '115.151.5.40:53128', '59.62.164.163:9999', '121.61.2.128:9999', '116.209.54.117:9999', '111.177.161.26:9999', '125.123.140.246:9999', '111.181.35.55:9999', '125.123.143.70:9999', '171.41.85.163:9999', '112.85.130.88:9999', '121.61.0.165:9999', '171.80.136.10:9999', 
'111.177.188.81:9999', '115.151.2.101:9999', '171.41.85.201:9999', '113.121.145.6:9999', '121.61.0.98:9999', '171.41.86.14:9999', '111.177.172.77:9999', '111.177.171.222:9999', '110.52.235.11:9999', '111.176.28.141:9999', '183.148.145.122:9999', '110.52.235.206:9999', '111.177.189.246:9999']
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle the crawl: delay of 3 (seconds, per the Scrapy settings reference)
# between consecutive requests to the same website.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'liepinSpd2.middlewares.Liepinspd2SpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# Downloader middlewares: switch off Scrapy's built-in User-Agent middleware
# and register the project's random User-Agent middleware in its place.
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd2.middlewares.Liepinspd2DownloaderMiddleware': 543,
    # The built-in middleware lives in ``scrapy.downloadermiddlewares`` (plural),
    # the same module middlewares.py imports it from. The previous singular
    # spelling matched no registered middleware, so nothing was disabled.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'liepinSpd2.middlewares.MyUserAgentMiddleware': 400,
    # 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': None,
    # 'liepinSpd2.middlewares.ProxyMiddleware': 125,
    # 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines enabled for the liepinSpd2 project, keyed by import path.
ITEM_PIPELINES = {'liepinSpd2.pipelines.Liepinspd2Pipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
================================================
FILE: liepin/liepinSpd2/liepinSpd2/spiders/__init__.py
================================================
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
================================================
FILE: liepin/liepinSpd2/liepinSpd2/spiders/liepinJob.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import re
import json
from datetime import datetime
import pandas as pd
import time
'''修改DEFAULT_CIPHERS'''
from twisted.internet.ssl import AcceptableCiphers
from scrapy.core.downloader import contextfactory
contextfactory.DEFAULT_CIPHERS = AcceptableCiphers.fromOpenSSLCipherString('DEFAULT:!DH')
from liepinSpd2.items import Liepinspd2Item
class LiepinSpdier(scrapy.Spider):
    """Crawl the job postings of a fixed list of companies on liepin.com.

    Flow: company page (``parse``) -> paginated job lists (``parse_list``)
    -> job detail pages (``parse_job``), each yielding one ``Liepinspd2Item``.
    """

    name = 'liepin'
    companylist=['7894126', '7941798', '5464493', '8280653', '8657147', '5696000', '6918711', '8801813', '7909112', '929719', '8635277', '9208490', '9427534', '7873563', '869131', '1983198', '8521820', '8441886', '9425884', '8269623', '8143143', '8144649', '8571478', '8646314', '9086358', '8361354', '8090600', '9652027', '9662729', '8029798', '8024700', '9274661', '8614537', '1852098', '845611', '7910884', '1947829', '6657987', '8463020', '8130349', '8323671', '723421', '1573297', '9582057', '1866404', '1074696', '8586065', '4811624', '857922', '7975388', '7931578', '6615613', '8243943', '682357', '8916773', '1050201', '950043', '7939262', '1730543', '9469426', '7883086', '8628525', '7868218', '8096323', '7862738', '7023768', '8862767', '9538671', '7953390', '515361', '2104592', '993518', '8212985', '1766564', '892388', '8646248', '9857531', '1043007', '8042835', '8980779', '571837', '7862722', '7935093', '8130825', '9111311', '8051561', '9107424', '856576', '7862125', '7947928', '854827', '4209085', '859352', '7931740', '7939262', '548548', '7916182', '8354065', '9740398', '8155722', '2331894', '884195', '9651734', '8534019', '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625', '4450360', '540933', '4817469']
    # One start URL per distinct company id. dict.fromkeys drops repeated ids
    # (the list contains '7939262' twice) while keeping the original order.
    start_urls = [
        f'https://www.liepin.com/company/{company}/'
        for company in dict.fromkeys(companylist)
    ]
    # CSV with the columns '股票简称' (stock short name) and '股票代码' (ticker).
    # Raw string: same value as before, without relying on invalid escapes.
    ticker_csv_path = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # Lazily loaded DataFrame, see _load_ticker_table().
    _ticker_table = None

    def parse(self, response):
        """Read the page count of a company page and request every job-list page."""
        text = response.text
        # Total number of job-list pages, embedded in an inline script.
        page_match = re.search(r'var totalPage = ([0-9]+);', text)
        comp_match = re.search(r'"pcUrl":"https://www.liepin.com/company/([0-9]+)/', text)
        if page_match is None or comp_match is None:
            # Layout changed / anti-bot page: skip instead of crashing on None.
            self.logger.warning('No pagination info found on %s', response.url)
            return
        total_page = int(page_match.group(1))
        comp_id = comp_match.group(1)
        for i in range(1, total_page + 1):
            print(f'第{i}页')
            url = f'https://www.liepin.com/company/{comp_id}/pn{i}'
            yield scrapy.Request(url, callback=self.parse_list)

    def parse_list(self, response):
        """Follow every job link found on one job-list page."""
        for url in response.xpath('//div[@class="job-info"]/a/@href').extract():
            # urljoin leaves absolute URLs untouched and resolves relative ones,
            # which scrapy.Request would otherwise reject.
            yield scrapy.Request(response.urljoin(url), callback=self.parse_job)

    def _load_ticker_table(self):
        """Return the ticker lookup table, reading the CSV only once per spider."""
        if self._ticker_table is None:
            self._ticker_table = pd.read_csv(self.ticker_csv_path, encoding='gbk')
        return self._ticker_table

    def parse_job(self, response):
        """Extract one job posting from a job detail page and yield it as an item."""
        item = Liepinspd2Item()
        text = response.text
        as_of_date = datetime.now()
        company_name = response.xpath('//div[@class="title-info"]//a/@title')[0].extract()
        job_name = response.xpath('//div[@class="title-info"]/h1/@title')[0].extract()
        # Salary / city / experience / education
        job_label = response.xpath('//li[@data-title=""]/span/text()').extract()
        salary = response.xpath('//p[@class="job-item-title"]/text()')[0].extract().strip(' \r\n')
        city = response.xpath('//p[@class="basic-infor"]//a/text()')[0].extract()
        qualifications = response.xpath('//div[@class="job-qualifications"]/span/text()')
        work_year = qualifications[1].extract()
        education = qualifications[0].extract()
        pub_time = response.xpath('//p[@class="basic-infor"]/time/@title')[0].extract()
        job_describe = ' '.join(response.xpath('//div[@class="content content-word"]/text()').extract())
        # The department label is not guaranteed to be on the page; a missing
        # one must not discard the whole posting.
        function_match = re.search(r'所属部门:</span><label>(.*?)</label></li>', text)
        function = function_match.group(1) if function_match else ''

        data = self._load_ticker_table()
        try:
            # A row matches when every character of its stock short name occurs
            # in the company name. There is no break: the last matching row wins.
            for i in data.index:
                short_name = data.loc[i, '股票简称']
                if all(ch in company_name for ch in short_name):
                    item['ticker'] = data.loc[i, '股票代码']
        except Exception:
            print('ticker匹配错误')

        item['as_of_date'] = as_of_date
        item['company_name'] = company_name
        item['job_name'] = job_name
        item['job_label'] = job_label
        item['salary'] = salary
        item['city'] = city
        item['education'] = education
        item['work_year'] = work_year
        item['pub_time'] = datetime.strptime(pub_time, u"%Y年%m月%d日").date()
        item['job_describe'] = job_describe
        item['function'] = function
        item['spider_time'] = datetime.now().date()
        yield item
# except BaseException as e:
# print('111error and pass')
# time.sleep(1)
# company_name = response.xpath('//div[@class="name-and-welfare"]//h1/text()')[0].extract()
# # print(company_name)
# job_names=response.xpath('//div[@class="job-info"]/a[@class="title"]/text()').extract()
# #薪资/城市/经验/学历
# condition_clearfixs=response.xpath('//p[@class="condition clearfix"]/@title').extract()
# pub_times=response.xpath('//p[@class="time-info clearfix"]/time/@title').extract()
# urls=response.xpath('//div[@class="job-info"]/a/@href').extract()
# for job_name, condition_clearfix, pub_time,url in zip(job_names, condition_clearfixs, pub_times,urls):
# # try:
# item['job_name']=job_name.replace('\r','').replace('\n','').replace('\t','').replace(' ','')
# item['salary']=condition_clearfix.split('_')[0]
# item['city']=condition_clearfix.split('_')[1]
# item['education']=condition_clearfix.split('_')[2]
# item['work_year']=condition_clearfix.split('_')[3]
# item['pub_time']=pub_time#最后确定一下格式
# data = pd.read_csv('G:\workspace\y2019m01\/first_lagou\company300.csv', encoding='gbk')
# try:
# for i in range(len(data)):
# n = 0
# for j in data.loc[i, '股票简称']:
# if j in company_name:
# n += 1
# if n == len(data.loc[i, '股票简称']):
# item['ticker'] = data.loc[i, '股票代码']
# print(n, item['ticker'], company_name)
# except BaseException as e:
# item['ticker'] = 'None'
# print('ticker匹配错误')
# item['as_of_date'] = as_of_date
# item['company_name'] = company_name
# item['spider_time'] = datetime.strptime(str(datetime.now())[:10], '%Y-%m-%d').date()
# item['origin_site'] = url
# print(item['pub_time'],item['ticker'],item['company_name'])
# yield item
================================================
FILE: liepin/liepinSpd2/run_liepin2.py
================================================
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# Load the settings defined in the project's settings.py module
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from liepinSpd2.spiders.liepinJob import LiepinSpdier
# Build a crawler process configured with the project settings.
settings = get_project_settings()
process = CrawlerProcess(settings=settings)
# More spider classes can be registered here with further crawl() calls
process.crawl(LiepinSpdier)
# Start crawling; this call blocks until the crawl has finished
process.start()
================================================
FILE: liepin/liepinSpd2/scrapy.cfg
================================================
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = liepinSpd2.settings
[deploy]
#url = http://localhost:6800/
project = liepinSpd2
================================================
FILE: liepin/liepinSpd_500/liepinSpd/__init__.py
================================================
================================================
FILE: liepin/liepinSpd_500/liepinSpd/dbhelper.py
================================================
import pymysql
from scrapy.utils.project import get_project_settings#引入settings配置
class DBHelper():
    """Small pymysql helper configured from the Scrapy project settings.

    Connection parameters come from the MYSQL_HOST, MYSQL_PORT, MYSQL_USER,
    MYSQL_PASSWD and MYSQL_DBNAME settings. Every operation opens its own
    connection and always closes cursor and connection, even when the
    statement fails.
    """

    def __init__(self):
        self.settings = get_project_settings()  # project settings object
        self.host = self.settings['MYSQL_HOST']
        self.port = self.settings['MYSQL_PORT']
        self.user = self.settings['MYSQL_USER']
        self.passwd = self.settings['MYSQL_PASSWD']
        self.db = self.settings['MYSQL_DBNAME']

    def connectMysql(self):
        """Return a connection to the server without selecting a database."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               charset='utf8')
        return conn

    def connectDatabase(self):
        """Return a connection with the configured database selected."""
        conn = pymysql.connect(host=self.host,
                               port=self.port,
                               user=self.user,
                               passwd=self.passwd,
                               db=self.db,
                               charset='utf8')
        return conn

    def _execute(self, conn, sql, params=None, commit=False):
        """Run one statement on ``conn`` and always release cursor and connection.

        ``params`` is forwarded to ``cursor.execute`` only when it is not None,
        so statements without placeholders are executed exactly as written.
        Exceptions from execute/commit propagate to the caller.
        """
        try:
            cur = conn.cursor()
            try:
                if params is None:
                    cur.execute(sql)
                else:
                    cur.execute(sql, params)
                if commit:
                    conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()

    def createDatabase(self):
        """Create the configured database if it does not exist yet."""
        sql = "create database if not exists " + self.db
        self._execute(self.connectMysql(), sql)

    def createTable(self, sql):
        """Execute the given CREATE TABLE statement in the configured database."""
        self._execute(self.connectDatabase(), sql)

    def insert(self, sql, *params):
        """Execute an INSERT with ``params`` bound to its placeholders and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def update(self, sql, *params):
        """Execute an UPDATE with ``params`` bound to its placeholders and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)

    def delete(self, sql, *params):
        """Execute a DELETE with ``params`` bound to its placeholders and commit."""
        self._execute(self.connectDatabase(), sql, params, commit=True)
# Manual checks for the database helper
class TestDBHelper():
    """Manual smoke tests that run each DBHelper operation against ``testtable``."""

    def __init__(self):
        self.dbHelper = DBHelper()

    def testCreateDatebase(self):
        """Create the configured database."""
        self.dbHelper.createDatabase()

    def testCreateTable(self):
        """Create the ``testtable`` table used by the other checks."""
        sql = "create table testtable(id int primary key auto_increment,name varchar(50),url varchar(200))"
        self.dbHelper.createTable(sql)

    def testInsert(self):
        """Insert one row into ``testtable``."""
        sql = "insert into testtable(name,url) values(%s,%s)"
        params = ("test", "test")
        self.dbHelper.insert(sql, *params)

    def testUpdate(self):
        """Update the row with id 1."""
        sql = "update testtable set name=%s,url=%s where id=%s"
        params = ("update", "update", "1")
        self.dbHelper.update(sql, *params)

    def testDelete(self):
        """Delete the row with id 1."""
        sql = "delete from testtable where id=%s"
        # One-element tuple: ("1") is just the string "1", and unpacking a
        # string splats its characters, which breaks for ids like "10".
        params = ("1",)
        self.dbHelper.delete(sql, *params)
# Script entry point for the manual checks. Only the helper is instantiated
# (which loads the project settings); uncomment the call you want to run.
if __name__=="__main__":
    testDBHelper=TestDBHelper()
    #testDBHelper.testCreateDatebase() #
    #testDBHelper.testCreateTable() #
    #testDBHelper.testInsert() #
    #testDBHelper.testUpdate() #
    #testDBHelper.testDelete() #
================================================
FILE: liepin/liepinSpd_500/liepinSpd/items.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LiepinspdItem(scrapy.Item):
    """Company-level record produced by the liepinSpd spiders.

    The field names match the columns that LiepinspdPipeline inserts into
    the ``company_base_info`` table.
    """
    # define the fields for your item here like:
    as_of_date = scrapy.Field()
    ticker = scrapy.Field()
    company_name = scrapy.Field()
    stage = scrapy.Field()
    size = scrapy.Field()
    city = scrapy.Field()
    industry = scrapy.Field()
    comp_clearfix = scrapy.Field()
    rate_num = scrapy.Field()  # converted with float() by the pipeline
    job_count = scrapy.Field()  # converted with int() by the pipeline
    registered_capital = scrapy.Field()  # converted with float() by the pipeline
    spider_time = scrapy.Field()
    origin_site = scrapy.Field()
================================================
FILE: liepin/liepinSpd_500/liepinSpd/middlewares.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import scrapy
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
class LiepinspdSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards what it receives unchanged; hooks that are not
    needed could be removed, in which case Scrapy behaves as if the
    middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results (requests, dicts or items) untouched."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Leave spider exceptions to other handlers (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the start requests untouched; only requests may be produced here."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class LiepinspdDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Requests, responses and exceptions are handed on unchanged; hooks that
    are not needed could be removed, in which case Scrapy behaves as if the
    middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the downloader chain.

        Returning None means "continue processing"; a hook may instead return
        a Response or Request object, or raise IgnoreRequest.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response on unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Leave download exceptions to the rest of the chain (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that sets a randomly chosen User-Agent per request."""

    def __init__(self, user_agent):
        """Store the pool of User-Agent strings.

        ``user_agent`` is the value of the USER_AGENTS setting. A missing
        setting (None) becomes an empty pool and a single string becomes a
        one-element pool, so random.choice never picks a single character.
        """
        if user_agent is None:
            user_agent = []
        elif isinstance(user_agent, str):
            user_agent = [user_agent]
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from the crawler's USER_AGENTS setting."""
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random pool entry."""
        if not self.user_agent:
            # Nothing configured: keep whatever header the request already has
            # instead of failing every single request in random.choice.
            return None
        request.headers['User-Agent'] = random.choice(self.user_agent)
================================================
FILE: liepin/liepinSpd_500/liepinSpd/pipelines.py
================================================
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors
# class LiepinspdPipeline(object):
# def __init__(self, dbpool):
# self.dbpool = dbpool
#
# @classmethod
# def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
# """
# 数据库建立连接
# :param settings: 配置参数
# :return: 实例化参数
# """
#
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# port = settings['MYSQL_PORT'],
# cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
# )
# # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # 返回实例化参数
# return cls(dbpool)
#
# def process_item(self, item, spider):
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
#
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作,并不需要commit,twisted会自动commit
#
# insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industy,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insert_sql,
# (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['stage']),
# str(item['size']), str(item['city']), str(item['industy']), str(item['comp_clearfix']),
# int(item['job_count']), float(item['rate_num']), float(item['registered_capital']),item['spider_time'],item['origin_site'],))
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
import pymysql
class LiepinspdPipeline(object):
    """Synchronous pipeline that inserts every item into ``company_base_info``."""

    def __init__(self, host=None, user=None, password=None, database=None, port=3306):
        """Open the MySQL connection and cursor used for all inserts.

        When no connection parameters are given they are read from the
        project settings (MYSQL_HOST, MYSQL_USER, MYSQL_PASSWD, MYSQL_DBNAME,
        MYSQL_PORT), so credentials are kept in one place instead of being
        duplicated in this class.
        """
        if host is None:
            from scrapy.utils.project import get_project_settings
            settings = get_project_settings()
            host = settings['MYSQL_HOST']
            user = settings['MYSQL_USER']
            password = settings['MYSQL_PASSWD']
            database = settings['MYSQL_DBNAME']
            port = settings.get('MYSQL_PORT', 3306)
        # Keyword arguments: PyMySQL 1.x no longer accepts positional ones.
        # charset is required because the stored text contains Chinese.
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, port=port, charset='utf8')
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline from the settings of the running crawler."""
        settings = crawler.settings
        return cls(host=settings['MYSQL_HOST'],
                   user=settings['MYSQL_USER'],
                   password=settings['MYSQL_PASSWD'],
                   database=settings['MYSQL_DBNAME'],
                   port=settings.get('MYSQL_PORT', 3306))

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it for later pipelines.

        Raises whatever the database driver or the int()/float() conversions
        raise; a failed insert is rolled back before the exception propagates.
        """
        insert_sql = """
        insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industry,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """
        values = (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['stage']),
                  str(item['size']), str(item['city']), str(item['industry']), str(item['comp_clearfix']),
                  int(item['job_count']), float(item['rate_num']), float(item['registered_capital']),
                  item['spider_time'], item['origin_site'],)
        try:
            self.cursor.execute(insert_sql, values)
            # Without a commit nothing is persisted.
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the shared connection.
            self.conn.rollback()
            raise
        # Scrapy passes the return value on to the next pipeline component.
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
================================================
FILE: liepin/liepinSpd_500/liepinSpd/settings.py
================================================
# -*- coding: utf-8 -*-
# Scrapy settings for liepinSpd project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# liepin.com company ids to crawl. dict.fromkeys removes repeated ids
# ('7939262' is listed twice) while keeping the original order, so no
# company is requested twice.
COMPANYLIST=list(dict.fromkeys(['7894126', '7941798', '5464493', '8280653', '8657147', '5696000', '6918711', '8801813', '7909112', '929719', '8635277', '9208490', '9427534', '7873563', '869131', '1983198', '8521820', '8441886', '9425884', '8269623', '8143143', '8144649', '8571478', '8646314', '9086358', '8361354', '8090600', '9652027', '9662729', '8029798', '8024700', '9274661', '8614537', '1852098', '845611', '7910884', '1947829', '6657987', '8463020', '8130349', '8323671', '723421', '1573297', '9582057', '1866404', '1074696', '8586065', '4811624', '857922', '7975388', '7931578', '6615613', '8243943', '682357', '8916773', '1050201', '950043', '7939262', '1730543', '9469426', '7883086', '8628525', '7868218', '8096323', '7862738', '7023768', '8862767', '9538671', '7953390', '515361', '2104592', '993518', '8212985', '1766564', '892388', '8646248', '9857531', '1043007', '8042835', '8980779', '571837', '7862722', '7935093', '8130825', '9111311', '8051561', '9107424', '856576', '7862125', '7947928', '854827', '4209085', '859352', '7931740', '7939262', '548548', '7916182', '8354065', '9740398', '8155722', '2331894', '884195', '9651734', '8534019', '7855573', '9617356', '886895', '2431058', '1939058', '8246296', '9145034', '8161625', '4450360', '540933', '4817469']))
# Headers sent with every request by default. The 'User-Agent' entry is
# replaced per request by MyUserAgentMiddleware when that middleware is enabled.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
BOT_NAME = 'liepinSpd'
# MySQL connection parameters read by DBHelper.
# NOTE(review): host, user and password are committed in plain text; consider
# rotating them and loading them from the environment instead.
MYSQL_HOST = 'rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com'
MYSQL_DBNAME = 'special_data'
MYSQL_USER = 'cn_ainvest_db'
MYSQL_PASSWD = 'cn_ainvest_sd3a1'
MYSQL_PORT = 3306
SPIDER_MODULES = ['liepinSpd.spiders']
NEWSPIDER_MODULE = 'liepinSpd.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpd (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately NOT obeyed here (False).
ROBOTSTXT_OBEY = False
# Pool of User-Agent strings; MyUserAgentMiddleware picks one at random for
# every request. Each entry must end with a comma: adjacent string literals
# without one are silently concatenated into a single invalid User-Agent.
USER_AGENTS = [
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
    "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
    "Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3",
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
    'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'liepinSpd.middlewares.LiepinspdSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'liepinSpd.middlewares.LiepinspdDownloaderMiddleware': 543,
    # Disable Scrapy's built-in User-Agent middleware so that only the
    # rotating middleware below sets the header. The package name is
    # ``scrapy.downloadermiddlewares`` (plural); the singular spelling used
    # before matched no built-in entry, so nothing was actually disabled.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Picks a random entry from USER_AGENTS for every request.
    'liepinSpd.middlewares.MyUserAgentMiddleware': 400,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'liepinSpd.pipelines.LiepinspdPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
================================================
FILE: liepin/liepinSpd_500/liepinSpd/spiders/__init__.py
================================================
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
================================================
FILE: liepin/liepinSpd_500/liepinSpd/spiders/lpspider.py
================================================
# !/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import re
from datetime import datetime
import pandas as pd
import time
from liepinSpd.items import LiepinspdItem
class LiepinSpdier(scrapy.Spider):
    """Scrape basic company information from liepin.com search pages.

    One start URL is built per company short name found in the
    ``股票简称`` column of ``company500.csv``; ``parse`` turns each response
    into a single ``LiepinspdItem``.
    """

    name = 'liepin'
    # Raw string literals: the Windows paths contain backslash sequences
    # (\w, \y, \c, \/) that are invalid escapes in a normal literal and make
    # Python emit a warning. The resulting path text is unchanged.
    data = pd.read_csv(r'G:\workspace\y2019m02\company500.csv', encoding='utf-8')
    companylist = data['股票简称']
    start_urls = []
    for company in companylist:
        start_urls.append(f'https://www.liepin.com/zhaopin/?key={company}')

    # CSV that maps stock short names (股票简称) to stock codes (股票代码).
    _TICKER_CSV = r'G:\workspace\y2019m01\/first_lagou\company300.csv'
    # DataFrame loaded from _TICKER_CSV on first use (see _load_ticker_table).
    _ticker_table = None

    @classmethod
    def _load_ticker_table(cls):
        """Return the ticker lookup table, reading the CSV only once."""
        if cls._ticker_table is None:
            cls._ticker_table = pd.read_csv(cls._TICKER_CSV, encoding='gbk')
        return cls._ticker_table

    # Main company information
    def parse(self, response):
        """Extract the company summary from ``response`` and yield one item.

        Extraction errors are not handled here: indexing an empty XPath
        result raises ``IndexError`` and a regex that does not match raises
        ``AttributeError``.
        """
        text = response.text

        company_name = response.xpath('//div[@class="name-and-welfare"]//h1/text()')[0].extract()

        # Summary tags, read by position: stage, size, city, industry.
        comp_sum_tag = response.xpath('//div[@class="comp-summary-tag"]/a/text()').extract()
        stage = comp_sum_tag[0]
        size = comp_sum_tag[1]
        city = comp_sum_tag[2]
        industry = comp_sum_tag[3]

        # Company tags: the list of strings is stored as its str() form.
        comp_clearfix = str(response.xpath('//ul[@class="comp-tag-list clearfix"]//span/text()').extract())

        # Resume handling rate: integer percentage text converted to a fraction.
        rate_num = response.xpath('//p[@class="rate-num"]//span/text()')[0].extract()
        rate_num = int(rate_num) / 100

        job_count = int(re.search(r'<small data-selector="total">. 共([0-9]+) 个', text).group(1))

        # Registered capital (unit: 万元, i.e. 10,000 CNY); 0.0 when not shown.
        if '注册资本' in text and '万元人民币' in text:
            registered_capital = float(re.search(r'<li>注册资本:(.*?)万元人民币</li>', text).group(1))
        else:
            registered_capital = 0.0

        origin_site = re.search(r'"wapUrl":"(.*?)",', text).group(1)

        item = LiepinspdItem()

        # Ticker matching: a row matches when every character of its short
        # name occurs somewhere in the company name. No early exit, so the
        # last matching row wins.
        # NOTE(review): when no row matches, item['ticker'] is never assigned;
        # an earlier commented-out draft used '未匹配' as the fallback. Confirm
        # what downstream consumers expect.
        ticker_table = self._load_ticker_table()
        try:
            for short_name, ticker in zip(ticker_table['股票简称'], ticker_table['股票代码']):
                if all(ch in company_name for ch in short_name):
                    item['ticker'] = ticker
        except Exception:
            # Keep the crawl going, but record the traceback instead of only
            # printing a fixed message.
            self.logger.exception('ticker匹配错误')

        item['as_of_date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        item['company_name'] = company_name
        item['stage'] = stage
        item['size'] = size
        item['city'] = city
        item['industry'] = industry
        item['comp_clearfix'] = comp_clearfix
        item['rate_num'] = rate_num
        item['job_count'] = job_count
        item['registered_capital'] = registered_capital
        item['spider_time'] = datetime.now().date()
        item['origin_site'] = origin_site
        yield item
================================================
FILE: liepin/liepinSpd_500/run_liepin1.py
================================================
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# Load the Scrapy project settings (settings.py)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from liepinSpd.spiders.lpspider import LiepinSpdier
settings = get_project_settings()
process = CrawlerProcess(settings=settings)
# More than one spider class can be registered here
process.crawl(LiepinSpdier)
# Start crawling; this call blocks until the crawl has finished
process.start()
================================================
FILE: liepin/liepinSpd_500/scrapy.cfg
================================================
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = liepinSpd.settings
[deploy]
#url = http://localhost:6800/
project = liepinSpd
================================================
FILE: liepin/liepinSpecialCom/liepinSpecialCom/__init__.py
================================================
================================================
FILE: liepin/liepinSpecialCom/liepinSpecialCom/items.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class LiepinspecialcomItem(scrapy.Item):
    """Company record produced by the liepinSpecialCom project.

    The active fields are the same six names that the project's
    ``LiepinspecialcomPipeline`` inserts into ``company_base_info``.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    as_of_date = scrapy.Field()
    ticker = scrapy.Field()
    company_name = scrapy.Field()
    # stage = scrapy.Field()
    size = scrapy.Field()
    city = scrapy.Field()
    industry = scrapy.Field()
    # The fields below are disabled for this project.
    # comp_clearfix = scrapy.Field()
    # job_count = scrapy.Field()
    # rate_num = scrapy.Field()
    # registered_capital = scrapy.Field()
================================================
FILE: liepin/liepinSpecialCom/liepinSpecialCom/middlewares.py
================================================
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from scrapy import signals
import scrapy
import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
class LiepinspecialcomSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; only ``spider_opened`` has a
    visible effect (one info log line). Hooks that are not defined are
    treated by Scrapy as "do not modify the passed objects".
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of the spider's result unchanged."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Take no action on spider exceptions (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield each start request unchanged."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class LiepinspecialcomDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Requests continue unmodified, responses are handed back as received and
    download exceptions are left for other middlewares. Hooks that are not
    defined are treated by Scrapy as "do not modify the passed objects".
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the chain.

        Returning None means "continue processing this request"; the other
        options Scrapy accepts are a Response, a Request, or raising
        IgnoreRequest.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand back the downloaded response object as received."""
        return response

    def process_exception(self, request, exception, spider):
        """Take no action, so the exception keeps propagating (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class MyUserAgentMiddleware(UserAgentMiddleware):
    """Set a randomly chosen User-Agent header on each outgoing request.

    The candidate strings come from the ``USER_AGENTS`` project setting.
    """

    def __init__(self, user_agent):
        # Despite the singular name, this holds the whole pool of candidate
        # User-Agent strings (the value of the USER_AGENTS setting).
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from the crawler's ``USER_AGENTS`` setting."""
        return cls(
            user_agent=crawler.settings.get('USER_AGENTS')
        )

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent with a random pool entry.

        When the pool is missing or empty the request is left untouched;
        ``random.choice`` would otherwise raise on ``None`` or an empty
        sequence and fail every request.
        """
        if not self.user_agent:
            return None
        request.headers['User-Agent'] = random.choice(self.user_agent)
================================================
FILE: liepin/liepinSpecialCom/liepinSpecialCom/pipelines.py
================================================
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors
# class LiepinspdPipeline(object):
# def __init__(self, dbpool):
# self.dbpool = dbpool
#
# @classmethod
# def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
# """
# 数据库建立连接
# :param settings: 配置参数
# :return: 实例化参数
# """
#
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# port = settings['MYSQL_PORT'],
# cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
# )
# # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # 返回实例化参数
# return cls(dbpool)
#
# def process_item(self, item, spider):
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
#
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作,并不需要commit,twisted会自动commit
#
# insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,stage,`size`,city,industy,comp_clearfix,job_count,rate_num,registered_capital,spider_time,origin_site) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
# cursor.execute(insert_sql,
# (item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['stage']),
# str(item['size']), str(item['city']), str(item['industy']), str(item['comp_clearfix']),
# int(item['job_count']), float(item['rate_num']), float(item['registered_capital']),item['spider_time'],item['origin_site'],))
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
import pymysql
class LiepinspecialcomPipeline(object):
    """Synchronous pipeline that inserts each item into ``company_base_info``."""

    def __init__(self):
        # Open the connection.
        # NOTE(review): host, user and password are hard-coded here and the
        # MYSQL_* values in settings.py are not read; consider moving the
        # credentials out of the source.
        # Keyword arguments are used because PyMySQL 1.0+ no longer accepts
        # positional arguments to connect(). The charset is set explicitly
        # because the stored text contains Chinese characters.
        self.conn = pymysql.connect(
            host='rm-2zewagytttzk6f24xno.mysql.rds.aliyuncs.com',
            user='cn_ainvest_db',
            password='cn_ainvest_sd3a1',
            database='special_data',
            charset='utf8mb4',
        )
        # Create the cursor shared by all inserts.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert ``item`` as one row, commit, and return the item.

        The item is returned so that any later pipeline stage still receives
        it. If the insert or commit fails, the transaction is rolled back and
        the original exception is re-raised.
        """
        insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,`size`,city,industry) VALUES(%s,%s,%s,%s,%s,%s)"
        params = (
            item['as_of_date'],
            str(item['ticker']),
            str(item['company_name']),
            str(item['size']),
            str(item['city']),
            str(item['industry']),
        )
        try:
            self.cursor.execute(insert_sql, params)
            # Without a commit the row is not persisted.
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()
# class LiepinspecialcomPipeline(object):
# def __init__(self, dbpool):
# self.dbpool = dbpool
#
# @classmethod
# def from_settings(cls, settings): # 函数名固定,会被scrapy调用,直接可用settings的值
# """
# 数据库建立连接
# :param settings: 配置参数
# :return: 实例化参数
# """
#
# adbparams = dict(
# host=settings['MYSQL_HOST'],
# db=settings['MYSQL_DBNAME'],
# user=settings['MYSQL_USER'],
# password=settings['MYSQL_PASSWORD'],
# cursorclass=pymysql.cursors.DictCursor # 指定cursor类型
# )
# # 连接数据池ConnectionPool,使用pymysql或者Mysqldb连接
# dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
# # 返回实例化参数
# return cls(dbpool)
#
# def process_item(self, item, spider):
# """
# 使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
# """
# query = self.dbpool.runInteraction(self.do_insert, item) # 指定操作方法和操作数据
# # 添加异常处理
# query.addCallback(self.handle_error) # 处理异常
#
# def do_insert(self, cursor, item):
# # 对数据库进行插入操作,并不需要commit,twisted会自动commit
# insert_sql = "insert into company_base_info(as_of_date,ticker,company_name,`size`,city,industry) VALUES(%s,%s,%s,%s,%s,%s)"
# cursor.execute(insert_sql, (
# item['as_of_date'], str(item['ticker']), str(item['company_name']), str(item['size']), str(item['city']),
# str(item['industry'])))
#
# def handle_error(self, failure):
# if failure:
# # 打印错误信息
# print(failure)
================================================
FILE: liepin/liepinSpecialCom/liepinSpecialCom/settings.py
================================================
# -*- coding: utf-8 -*-
# Scrapy settings for liepinSpecialCom project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'liepinSpecialCom'
# MySQL connection parameters.
# NOTE(review): in this project's pipelines.py the only code that reads
# MYSQL_* settings is commented out, and it looks up MYSQL_PASSWORD rather than
# MYSQL_PASSWD; the active pipeline hard-codes its own connection parameters.
# Confirm whether these values are still used anywhere.
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'day0123'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123'
SPIDER_MODULES = ['liepinSpecialCom.spiders']
NEWSPIDER_MODULE = 'liepinSpecialCom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'liepinSpecialCom (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# Headers attached to requests by default, imitating a desktop Chrome browser.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    # NOTE(review): presumably replaced per request when MyUserAgentMiddleware
    # is enabled; this file's DOWNLOADER_MIDDLEWARES is not visible here - confirm.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # NOTE(review): this advertises brotli ('br'); verify that the runtime
    # environment can decode brotli-compressed responses.
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9'
}
USER_AGENTS = [
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_
gitextract_aykf45t2/
├── .gitattributes
├── .gitignore
├── 126email/
│ └── 126email.py
├── 163email/
│ └── 163email.py
├── 163youdao/
│ └── 163youdao.py
├── Github/
│ └── login.py
├── LICENSE
├── NeteaseCloudMusicDownload/
│ └── api.py
├── README-Test.md
├── README-en-us.md
├── README.md
├── baidu/
│ ├── baidu.py
│ ├── requirements.txt
│ └── util.py
├── baidu_translate/
│ ├── Baidufanyi.py
│ └── translate.js
├── bilibili/
│ └── bilibili.py
├── csdn/
│ ├── README
│ └── selenium_csdn.py
├── douban/
│ ├── douban.py
│ └── douban_spider.py
├── facebook/
│ └── facebook.py
├── guoke/
│ ├── guoke.py
│ └── guoke_spider.py
├── jd_login/
│ ├── Method_First/
│ │ ├── Try_selenium.py
│ │ ├── ban.txt
│ │ ├── choice.txt
│ │ └── config.py
│ ├── Method_Second/
│ │ ├── Config.py
│ │ ├── Truekeyword.txt
│ │ └── main.py
│ ├── README.md
│ └── login_by_selenium.py
├── lagou/
│ └── Lagou.py
├── liepin/
│ ├── README.md
│ ├── liepinSpd/
│ │ ├── liepinSpd/
│ │ │ ├── __init__.py
│ │ │ ├── dbhelper.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspider.py
│ │ ├── run_liepin1.py
│ │ └── scrapy.cfg
│ ├── liepinSpd2/
│ │ ├── liepinSpd2/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── liepinJob.py
│ │ ├── run_liepin2.py
│ │ └── scrapy.cfg
│ ├── liepinSpd_500/
│ │ ├── liepinSpd/
│ │ │ ├── __init__.py
│ │ │ ├── dbhelper.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspider.py
│ │ ├── run_liepin1.py
│ │ └── scrapy.cfg
│ ├── liepinSpecialCom/
│ │ ├── liepinSpecialCom/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspecialcom.py
│ │ ├── run_liepinspecialcom.py
│ │ └── scrapy.cfg
│ ├── liepinSpecialComJob/
│ │ ├── liepinSpecialComJob/
│ │ │ ├── __init__.py
│ │ │ ├── items.py
│ │ │ ├── middlewares.py
│ │ │ ├── pipelines.py
│ │ │ ├── settings.py
│ │ │ └── spiders/
│ │ │ ├── __init__.py
│ │ │ └── lpspecialcomjob.py
│ │ ├── run_liepinspecialjob.py
│ │ └── scrapy.cfg
│ └── liepin_login.py
├── qqmusic/
│ ├── qqmusic_spider.py
│ └── sign.js
├── qqzone/
│ └── qq_zone.py
├── qsbk/
│ └── qiushibaike.py
├── sina/
│ ├── sina.py
│ └── spider/
│ ├── Ajax_weibo.py
│ └── selenium_test.py
├── taobao/
│ ├── mac_chromedriver/
│ │ └── chromedriver
│ ├── taobao_via_username_password.py
│ └── taobao_via_weibo.py
├── tieba/
│ └── tieba_spider.py
├── tuchong/
│ └── tuchong.py
├── webWeixin/
│ └── webWeixin.py
├── xiamiMusic/
│ ├── README
│ └── api.py
└── zhaopingou/
└── zhaopingou_login.py
SYMBOL INDEX (318 symbols across 52 files)
FILE: 126email/126email.py
function login (line 5) | def login():
FILE: 163email/163email.py
function login (line 9) | def login():
FILE: 163youdao/163youdao.py
function login (line 13) | def login():
FILE: Github/login.py
class GithubLogin (line 10) | class GithubLogin(object):
method __init__ (line 12) | def __init__(self, email, password):
method login_GitHub (line 26) | def login_GitHub(self):
method get_token (line 48) | def get_token(self):
FILE: NeteaseCloudMusicDownload/api.py
class decrypt_music (line 14) | class decrypt_music(object):
method __init__ (line 15) | def __init__(self, d):
method get_random_str (line 24) | def get_random_str(self):
method aes_encrypt (line 32) | def aes_encrypt(self, text, key):
method rsa_encrypt (line 42) | def rsa_encrypt(self, value, text, modulus):
method get_data (line 48) | def get_data(self):
class Spider (line 58) | class Spider(object):
method __init__ (line 59) | def __init__(self):
method __get_songs (line 66) | def __get_songs(self, name):
method __get_mp3 (line 74) | def __get_mp3(self, id):
method __download_mp3 (line 83) | def __download_mp3(self, url, filename):
method __print_info (line 92) | def __print_info(self, songs):
method run (line 100) | def run(self):
FILE: baidu/baidu.py
class BaiduLogin (line 17) | class BaiduLogin(object):
method __init__ (line 18) | def __init__(self):
method _init_cookies (line 29) | def _init_cookies(self):
method _get_token (line 35) | def _get_token(self):
method _get_public_key (line 56) | def _get_public_key(self):
method login (line 75) | def login(self, username, password, retry=4):
class LoginError (line 151) | class LoginError(Exception):
FILE: baidu/util.py
function encrypt_pwd (line 11) | def encrypt_pwd(password, public_key):
function open_image (line 18) | def open_image(image_file):
function save_image (line 28) | def save_image(resp, image_file):
function parse_json (line 34) | def parse_json(s):
FILE: baidu_translate/Baidufanyi.py
class BaiDuTranslater (line 39) | class BaiDuTranslater(object):
method __init__ (line 44) | def __init__(self, query):
method make_sign (line 54) | def make_sign(self):
method make_data (line 64) | def make_data(self, sign):
method get_content (line 81) | def get_content(self, data):
method run (line 90) | def run(self):
FILE: baidu_translate/translate.js
function n (line 4) | function n(r, o) {
function a (line 14) | function a(r) {
FILE: bilibili/bilibili.py
class BiliBili (line 21) | class BiliBili():
method __init__ (line 27) | def __init__(self, username, password):
method get_button (line 42) | def get_button(self):
method get_screenshot (line 50) | def get_screenshot(self, button):
method get_position (line 66) | def get_position(self, button):
method get_geetest_image (line 81) | def get_geetest_image(self, button, name1='captcha1.png', name2='captc...
method login (line 100) | def login(self):
method is_pixel_equal (line 113) | def is_pixel_equal(self, img1, img2, x, y):
method get_gap (line 131) | def get_gap(self, img1, img2):
method get_track (line 146) | def get_track(self, distance):
method move_button (line 175) | def move_button(self, button, track):
method crack (line 189) | def crack(self):
FILE: csdn/selenium_csdn.py
class Api (line 20) | class Api(object):
method __init__ (line 21) | def __init__(self, account, password):
method send_key (line 28) | async def send_key(self):
method crawl (line 41) | async def crawl(self):
function main (line 68) | def main():
FILE: douban/douban.py
class DouBanLogin (line 17) | class DouBanLogin(object):
method __init__ (line 18) | def __init__(self, account, password):
method get_cookie (line 33) | def get_cookie(self):
method get_user_data (line 43) | def get_user_data(self):
method run (line 51) | def run(self):
FILE: facebook/facebook.py
function login (line 8) | def login(session, email, password):
FILE: guoke/guoke_spider.py
function get_index (line 21) | def get_index(offset):
function get_url (line 39) | def get_url(json):
function get_text (line 56) | def get_text(url):
function save_article (line 68) | def save_article(content):
function main (line 83) | def main(offset):
FILE: jd_login/Method_First/Try_selenium.py
function do_try (line 33) | def do_try(url):
function get_try (line 61) | def get_try(page):
function Control_try (line 111) | def Control_try(total_page):
function login (line 124) | def login():
function auto_showdown (line 137) | def auto_showdown():
function deal_file (line 143) | def deal_file():
function check_name (line 156) | def check_name(title):
FILE: jd_login/Method_Second/main.py
function readCookies (line 36) | def readCookies():
function writeCookies (line 48) | def writeCookies(cookies):
function closeSW (line 55) | def closeSW(iApplyNum):
function genekeys (line 79) | def genekeys():
function goodJudge (line 100) | def goodJudge(goodName, goodPrice, keys):
function do_try (line 126) | def do_try(url):
function get_try (line 171) | def get_try(cid, iApplyNum, maxApplyNum, keys):
function trycid (line 243) | def trycid():
function login (line 262) | def login():
FILE: lagou/Lagou.py
class Lagou_login (line 20) | class Lagou_login(object):
method __init__ (line 21) | def __init__(self):
method encryptPwd (line 31) | def encryptPwd(self, passwd):
method getTokenCode (line 40) | def getTokenCode(self):
method getCaptcha (line 65) | def getCaptcha(self):
method login (line 85) | def login(self, user, passwd, captchaData=None, token_code=None):
FILE: liepin/liepinSpd/liepinSpd/dbhelper.py
class DBHelper (line 4) | class DBHelper():
method __init__ (line 6) | def __init__(self):
method connectMysql (line 15) | def connectMysql(self):
method connectDatabase (line 23) | def connectDatabase(self):
method createDatabase (line 33) | def createDatabase(self):
method createTable (line 43) | def createTable(self,sql):
method insert (line 52) | def insert(self,sql,*params):
method update (line 62) | def update(self,sql,*params):
method delete (line 72) | def delete(self,sql,*params):
class TestDBHelper (line 83) | class TestDBHelper():
method __init__ (line 84) | def __init__(self):
method testCreateDatebase (line 87) | def testCreateDatebase(self):
method testCreateTable (line 90) | def testCreateTable(self):
method testInsert (line 94) | def testInsert(self):
method testUpdate (line 98) | def testUpdate(self):
method testDelete (line 103) | def testDelete(self):
FILE: liepin/liepinSpd/liepinSpd/items.py
class LiepinspdItem (line 11) | class LiepinspdItem(scrapy.Item):
FILE: liepin/liepinSpd/liepinSpd/middlewares.py
class LiepinspdSpiderMiddleware (line 13) | class LiepinspdSpiderMiddleware(object):
method from_crawler (line 19) | def from_crawler(cls, crawler):
method process_spider_input (line 25) | def process_spider_input(self, response, spider):
method process_spider_output (line 32) | def process_spider_output(self, response, result, spider):
method process_spider_exception (line 40) | def process_spider_exception(self, response, exception, spider):
method process_start_requests (line 48) | def process_start_requests(self, start_requests, spider):
method spider_opened (line 57) | def spider_opened(self, spider):
class LiepinspdDownloaderMiddleware (line 61) | class LiepinspdDownloaderMiddleware(object):
method from_crawler (line 67) | def from_crawler(cls, crawler):
method process_request (line 73) | def process_request(self, request, spider):
method process_response (line 85) | def process_response(self, request, response, spider):
method process_exception (line 94) | def process_exception(self, request, exception, spider):
method spider_opened (line 104) | def spider_opened(self, spider):
class MyUserAgentMiddleware (line 108) | class MyUserAgentMiddleware(UserAgentMiddleware):
method __init__ (line 113) | def __init__(self, user_agent):
method from_crawler (line 117) | def from_crawler(cls, crawler):
method process_request (line 122) | def process_request(self, request, spider):
FILE: liepin/liepinSpd/liepinSpd/pipelines.py
class LiepinspdPipeline (line 61) | class LiepinspdPipeline(object):
method __init__ (line 66) | def __init__(self):
method process_item (line 72) | def process_item(self, item, spider):
method close_spider (line 86) | def close_spider(self, spider):
FILE: liepin/liepinSpd/liepinSpd/spiders/lpspider.py
class LiepinSpdier (line 13) | class LiepinSpdier(scrapy.Spider):
method parse (line 21) | def parse(self, response):
FILE: liepin/liepinSpd2/liepinSpd2/items.py
class Liepinspd2Item (line 11) | class Liepinspd2Item(scrapy.Item):
FILE: liepin/liepinSpd2/liepinSpd2/middlewares.py
class Liepinspd2SpiderMiddleware (line 17) | class Liepinspd2SpiderMiddleware(object):
method from_crawler (line 23) | def from_crawler(cls, crawler):
method process_spider_input (line 29) | def process_spider_input(self, response, spider):
method process_spider_output (line 36) | def process_spider_output(self, response, result, spider):
method process_spider_exception (line 44) | def process_spider_exception(self, response, exception, spider):
method process_start_requests (line 52) | def process_start_requests(self, start_requests, spider):
method spider_opened (line 61) | def spider_opened(self, spider):
class Liepinspd2DownloaderMiddleware (line 65) | class Liepinspd2DownloaderMiddleware(object):
method from_crawler (line 71) | def from_crawler(cls, crawler):
method process_request (line 77) | def process_request(self, request, spider):
method process_response (line 89) | def process_response(self, request, response, spider):
method process_exception (line 98) | def process_exception(self, request, exception, spider):
method spider_opened (line 108) | def spider_opened(self, spider):
class MyUserAgentMiddleware (line 112) | class MyUserAgentMiddleware(UserAgentMiddleware):
method __init__ (line 117) | def __init__(self, user_agent):
method from_crawler (line 121) | def from_crawler(cls, crawler):
method process_request (line 126) | def process_request(self, request, spider):
class ProxyMiddleware (line 156) | class ProxyMiddleware(object):
method process_request (line 159) | def process_request(self, request, spider):
method process_response (line 165) | def process_response(self, request, response, spider):
method get_random_proxy (line 176) | def get_random_proxy(self):
FILE: liepin/liepinSpd2/liepinSpd2/pipelines.py
class Liepinspd2Pipeline (line 56) | class Liepinspd2Pipeline(object):
method __init__ (line 61) | def __init__(self):
method process_item (line 67) | def process_item(self, item, spider):
method close_spider (line 78) | def close_spider(self, spider):
FILE: liepin/liepinSpd2/liepinSpd2/spiders/liepinJob.py
class LiepinSpdier (line 18) | class LiepinSpdier(scrapy.Spider):
method parse (line 26) | def parse(self, response):
method parse_list (line 36) | def parse_list(self, response):
method parse_job (line 42) | def parse_job(self,response):
FILE: liepin/liepinSpd_500/liepinSpd/dbhelper.py
class DBHelper (line 4) | class DBHelper():
method __init__ (line 6) | def __init__(self):
method connectMysql (line 15) | def connectMysql(self):
method connectDatabase (line 23) | def connectDatabase(self):
method createDatabase (line 33) | def createDatabase(self):
method createTable (line 43) | def createTable(self,sql):
method insert (line 52) | def insert(self,sql,*params):
method update (line 62) | def update(self,sql,*params):
method delete (line 72) | def delete(self,sql,*params):
class TestDBHelper (line 83) | class TestDBHelper():
method __init__ (line 84) | def __init__(self):
method testCreateDatebase (line 87) | def testCreateDatebase(self):
method testCreateTable (line 90) | def testCreateTable(self):
method testInsert (line 94) | def testInsert(self):
method testUpdate (line 98) | def testUpdate(self):
method testDelete (line 103) | def testDelete(self):
FILE: liepin/liepinSpd_500/liepinSpd/items.py
class LiepinspdItem (line 11) | class LiepinspdItem(scrapy.Item):
FILE: liepin/liepinSpd_500/liepinSpd/middlewares.py
class LiepinspdSpiderMiddleware (line 13) | class LiepinspdSpiderMiddleware(object):
method from_crawler (line 19) | def from_crawler(cls, crawler):
method process_spider_input (line 25) | def process_spider_input(self, response, spider):
method process_spider_output (line 32) | def process_spider_output(self, response, result, spider):
method process_spider_exception (line 40) | def process_spider_exception(self, response, exception, spider):
method process_start_requests (line 48) | def process_start_requests(self, start_requests, spider):
method spider_opened (line 57) | def spider_opened(self, spider):
class LiepinspdDownloaderMiddleware (line 61) | class LiepinspdDownloaderMiddleware(object):
method from_crawler (line 67) | def from_crawler(cls, crawler):
method process_request (line 73) | def process_request(self, request, spider):
method process_response (line 85) | def process_response(self, request, response, spider):
method process_exception (line 94) | def process_exception(self, request, exception, spider):
method spider_opened (line 104) | def spider_opened(self, spider):
class MyUserAgentMiddleware (line 108) | class MyUserAgentMiddleware(UserAgentMiddleware):
method __init__ (line 113) | def __init__(self, user_agent):
method from_crawler (line 117) | def from_crawler(cls, crawler):
method process_request (line 122) | def process_request(self, request, spider):
FILE: liepin/liepinSpd_500/liepinSpd/pipelines.py
class LiepinspdPipeline (line 61) | class LiepinspdPipeline(object):
method __init__ (line 66) | def __init__(self):
method process_item (line 72) | def process_item(self, item, spider):
method close_spider (line 86) | def close_spider(self, spider):
FILE: liepin/liepinSpd_500/liepinSpd/spiders/lpspider.py
class LiepinSpdier (line 13) | class LiepinSpdier(scrapy.Spider):
method parse (line 22) | def parse(self, response):
FILE: liepin/liepinSpecialCom/liepinSpecialCom/items.py
class LiepinspecialcomItem (line 11) | class LiepinspecialcomItem(scrapy.Item):
FILE: liepin/liepinSpecialCom/liepinSpecialCom/middlewares.py
class LiepinspecialcomSpiderMiddleware (line 15) | class LiepinspecialcomSpiderMiddleware(object):
method from_crawler (line 21) | def from_crawler(cls, crawler):
method process_spider_input (line 27) | def process_spider_input(self, response, spider):
method process_spider_output (line 34) | def process_spider_output(self, response, result, spider):
method process_spider_exception (line 42) | def process_spider_exception(self, response, exception, spider):
method process_start_requests (line 50) | def process_start_requests(self, start_requests, spider):
method spider_opened (line 59) | def spider_opened(self, spider):
class LiepinspecialcomDownloaderMiddleware (line 63) | class LiepinspecialcomDownloaderMiddleware(object):
method from_crawler (line 69) | def from_crawler(cls, crawler):
method process_request (line 75) | def process_request(self, request, spider):
method process_response (line 87) | def process_response(self, request, response, spider):
method process_exception (line 96) | def process_exception(self, request, exception, spider):
method spider_opened (line 106) | def spider_opened(self, spider):
class MyUserAgentMiddleware (line 110) | class MyUserAgentMiddleware(UserAgentMiddleware):
method __init__ (line 115) | def __init__(self, user_agent):
method from_crawler (line 119) | def from_crawler(cls, crawler):
method process_request (line 124) | def process_request(self, request, spider):
FILE: liepin/liepinSpecialCom/liepinSpecialCom/pipelines.py
class LiepinspecialcomPipeline (line 62) | class LiepinspecialcomPipeline(object):
method __init__ (line 67) | def __init__(self):
method process_item (line 73) | def process_item(self, item, spider):
method close_spider (line 83) | def close_spider(self, spider):
FILE: liepin/liepinSpecialCom/liepinSpecialCom/spiders/lpspecialcom.py
class LiepinSpdier (line 10) | class LiepinSpdier(scrapy.Spider):
method parse (line 26) | def parse(self, response):
FILE: liepin/liepinSpecialComJob/liepinSpecialComJob/items.py
class LiepinspecialcomjobItem (line 11) | class LiepinspecialcomjobItem(scrapy.Item):
FILE: liepin/liepinSpecialComJob/liepinSpecialComJob/middlewares.py
class LiepinspecialcomjobSpiderMiddleware (line 13) | class LiepinspecialcomjobSpiderMiddleware(object):
method from_crawler (line 19) | def from_crawler(cls, crawler):
method process_spider_input (line 25) | def process_spider_input(self, response, spider):
method process_spider_output (line 32) | def process_spider_output(self, response, result, spider):
method process_spider_exception (line 40) | def process_spider_exception(self, response, exception, spider):
method process_start_requests (line 48) | def process_start_requests(self, start_requests, spider):
method spider_opened (line 57) | def spider_opened(self, spider):
class LiepinspecialcomjobDownloaderMiddleware (line 61) | class LiepinspecialcomjobDownloaderMiddleware(object):
method from_crawler (line 67) | def from_crawler(cls, crawler):
method process_request (line 73) | def process_request(self, request, spider):
method process_response (line 85) | def process_response(self, request, response, spider):
method process_exception (line 94) | def process_exception(self, request, exception, spider):
method spider_opened (line 104) | def spider_opened(self, spider):
class MyUserAgentMiddleware (line 108) | class MyUserAgentMiddleware(UserAgentMiddleware):
method __init__ (line 113) | def __init__(self, user_agent):
method from_crawler (line 117) | def from_crawler(cls, crawler):
method process_request (line 122) | def process_request(self, request, spider):
class ProxyMiddleware (line 128) | class ProxyMiddleware(object):
method process_request (line 131) | def process_request(self, request, spider):
method process_response (line 137) | def process_response(self, request, response, spider):
method get_random_proxy (line 148) | def get_random_proxy(self):
FILE: liepin/liepinSpecialComJob/liepinSpecialComJob/pipelines.py
class LiepinspecialcomjobPipeline (line 57) | class LiepinspecialcomjobPipeline(object):
method __init__ (line 62) | def __init__(self):
method process_item (line 68) | def process_item(self, item, spider):
method close_spider (line 77) | def close_spider(self, spider):
FILE: liepin/liepinSpecialComJob/liepinSpecialComJob/spiders/lpspecialcomjob.py
class LiepinSpdier (line 13) | class LiepinSpdier(scrapy.Spider):
method parse (line 28) | def parse(self, response):
method parse_list (line 54) | def parse_list(self,response):
method parse_job (line 103) | def parse_job(self,response):
FILE: liepin/liepin_login.py
class Leipin (line 14) | class Leipin(object):
method __init__ (line 15) | def __init__(self, username, password):
method _md5 (line 38) | def _md5(self):
method _getAuthcode (line 44) | def _getAuthcode(self):
method login (line 57) | def login(self):
FILE: qqmusic/qqmusic_spider.py
class Spider (line 10) | class Spider(object):
method __init__ (line 11) | def __init__(self):
method __get_songs (line 17) | def __get_songs(self, name):
method __print_info (line 23) | def __print_info(self, songs):
method __get_Sign (line 38) | def __get_Sign(self,data):
method __get_url (line 46) | def __get_url(self,data):
method __set_data (line 63) | def __set_data(self, songmid):
method __download_mp3 (line 68) | def __download_mp3(self, url, filename):
method run (line 77) | def run(self):
FILE: qqmusic/sign.js
function __sign_hash_20200305 (line 1) | function __sign_hash_20200305 (n) {
function r (line 121) | function r(f, h, c, l, g) {
function getSecuritySign (line 403) | function getSecuritySign(data){
FILE: qsbk/qiushibaike.py
class QiushiSpider (line 13) | class QiushiSpider():
method __init__ (line 15) | def __init__(self, max_page):
method get_url_list (line 29) | def get_url_list(self):
method exec_task (line 39) | def exec_task(self):
method exec_task_finished (line 63) | def exec_task_finished(self,result):
method run (line 69) | def run(self):
FILE: sina/sina.py
function get_login (line 15) | def get_login(phone, pwd):
FILE: sina/spider/Ajax_weibo.py
function create_sheet (line 30) | def create_sheet(bozhu):
function url_get (line 48) | def url_get():
function get_page (line 84) | def get_page(page):
function parse_page (line 104) | def parse_page(json):
FILE: taobao/taobao_via_username_password.py
class TaobaoSpider (line 27) | class TaobaoSpider:
method __init__ (line 29) | def __init__(self, username, password):
method login (line 42) | def login(self):
FILE: taobao/taobao_via_weibo.py
class Taobao_Spider (line 19) | class Taobao_Spider:
method __init__ (line 21) | def __init__(self, username, password):
method run (line 39) | def run(self):
FILE: tieba/tieba_spider.py
class TieBa_Spier (line 29) | class TieBa_Spier():
method __init__ (line 31) | def __init__(self, max_pn, kw):
method get_url_list (line 40) | def get_url_list(self):
method get_content (line 44) | def get_content(self, url):
method save_items (line 53) | def save_items(self, content, idx):
method run (line 59) | def run(self):
FILE: tuchong/tuchong.py
function get_imageID (line 26) | def get_imageID(term, page):
function parse_imgID (line 39) | def parse_imgID(imageID):
function get_ImageJPG (line 52) | def get_ImageJPG(id):
function parse_imgURL (line 65) | def parse_imgURL(html):
function download_image (line 76) | def download_image(url):
function save_image (line 87) | def save_image(content):
function main (line 96) | def main():
FILE: webWeixin/webWeixin.py
function getUUID (line 38) | def getUUID():
function showQRImage (line 65) | def showQRImage():
function waitForLogin (line 92) | def waitForLogin():
function login (line 129) | def login():
function webwxinit (line 163) | def webwxinit():
function webwxgetcontact (line 197) | def webwxgetcontact():
function main (line 232) | def main():
FILE: xiamiMusic/api.py
class XMLogin (line 18) | class XMLogin(object):
method __init__ (line 21) | def __init__(self, account, password):
method send_key (line 27) | async def send_key(self):
method slide (line 34) | async def slide(self):
method validate (line 45) | async def validate(self):
method crawl (line 54) | async def crawl(self):
function main (line 92) | def main():
FILE: zhaopingou/zhaopingou_login.py
class ZhaoPinGouLogin (line 16) | class ZhaoPinGouLogin(object):
method __init__ (line 18) | def __init__(self, account, password):
method get_coolie (line 35) | def get_coolie(self):
method run (line 53) | def run(self):
Condensed preview — 99 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (306K chars).
[
{
"path": ".gitattributes",
"chars": 102,
"preview": " *.js linguist-language=python\n *.css linguist-language=python\n *.html linguist-language=python\n"
},
{
"path": ".gitignore",
"chars": 1259,
"preview": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packagi"
},
{
"path": "126email/126email.py",
"chars": 846,
"preview": "import time\nfrom selenium import webdriver\nfrom getpass import getpass\n\ndef login():\n acount_num = input('请输入账号:')\n "
},
{
"path": "163email/163email.py",
"chars": 1254,
"preview": "import time\nfrom getpass import getpass\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom "
},
{
"path": "163youdao/163youdao.py",
"chars": 1232,
"preview": "import time\nfrom selenium import webdriver\n\n\nlogin_url = \"http://account.youdao.com/login?service=dict\"\n\nxpaths = {'user"
},
{
"path": "Github/login.py",
"chars": 1910,
"preview": "# -*- coding: utf-8 -*-\n# @Author: CriseLYJ\n# @Date: 2020-08-14 12:13:11\n\nimport re\nimport requests\nfrom getpass impor"
},
{
"path": "LICENSE",
"chars": 1124,
"preview": "The MIT License\n\nCopyright (c) 2018 CriseLYJ.\nhttps://github.com/CriseLYJ/awesome-python-login-model\n\nPermission is here"
},
{
"path": "NeteaseCloudMusicDownload/api.py",
"chars": 5604,
"preview": "# -*- coding: utf-8 -*-\n# @Author: CriseLYJ\n# @Date: 2020-08-14 13:48:23\n\nimport requests\nimport math\nimport random\nfr"
},
{
"path": "README-Test.md",
"chars": 1091,
"preview": "## Test\n\n### Bilibili自动登录测试正常,成功率98%\n\n\n\n\n\n### web微信\n\n {\n for (var t = 0; t < o.length - 2; t += 3) {\n var e = o.charAt"
},
{
"path": "bilibili/bilibili.py",
"chars": 6887,
"preview": "from selenium import webdriver\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.webdriver.support i"
},
{
"path": "csdn/README",
"chars": 154,
"preview": "csdn login module\n========================\n\n@upload and test date: 2020-08-17\n@use module: pyppeteer==0.2.2\n@author: Kri"
},
{
"path": "csdn/selenium_csdn.py",
"chars": 2584,
"preview": "# -*- coding: utf-8 -*-\n# @Author: Kris\n# @Mail: criselyj@163.com\n# @Date: 2020-08-14 17:40:11\nimport os\nimport random"
},
{
"path": "douban/douban.py",
"chars": 1438,
"preview": "#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\nimport requests\n\n\"\"\"\ninfo:\nauthor:CriseLYJ\ngithub:https://github.com/Cris"
},
{
"path": "douban/douban_spider.py",
"chars": 862,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\nimport json\n\nimport requests\n\n# 定义请求url\nurl = \"https://movie.douban.com/j/sea"
},
{
"path": "facebook/facebook.py",
"chars": 1367,
"preview": "from __future__ import print_function\n\nimport argparse\nimport requests\nimport pyquery\n\n\ndef login(session, email, passwo"
},
{
"path": "guoke/guoke.py",
"chars": 1924,
"preview": "import requests\nimport re\n\nheaders_login = {\n 'Accept-Language': 'zh-CN,zh;q=0.8',\n 'Cache-Control': 'no-cache',\n "
},
{
"path": "guoke/guoke_spider.py",
"chars": 2674,
"preview": "# -*- coding: utf-8 -*-\nimport requests\nfrom urllib.parse import urlencode\nfrom requests import codes\nimport os\nfrom mul"
},
{
"path": "jd_login/Method_First/Try_selenium.py",
"chars": 5185,
"preview": "#coding=utf-8\nfrom selenium import webdriver\nfrom selenium.webdriver.support.ui import WebDriverWait\nfrom selenium.commo"
},
{
"path": "jd_login/Method_First/ban.txt",
"chars": 3,
"preview": "Ȥ ˬ"
},
{
"path": "jd_login/Method_First/choice.txt",
"chars": 6,
"preview": "ž ȳ Ǯ"
},
{
"path": "jd_login/Method_First/config.py",
"chars": 437,
"preview": "#coding:utf-8\n\nsettings = {\n 'auto_shutdown':False, #是否自动关机,默认为False\n 'total_products':300, #要申请的商品个数上限,默认为300\n"
},
{
"path": "jd_login/Method_Second/Config.py",
"chars": 405,
"preview": "\"\"\"\nConfig.py\n配置文件\n\"\"\"\n\nsettings = {\n #一天申请的限制个数\n 'maxApplyNum' : 300 ,\n #试用类型\n #家用电器737 手机数码652 电脑办公670 家居家"
},
{
"path": "jd_login/Method_Second/Truekeyword.txt",
"chars": 2378,
"preview": "/ 表带\n手表 手机 华为 huawei mate vivo oppo 小米 苹果 apple MacBook 电脑 笔记本 ipad/ 套 膜 钢化 全包 壳 支架 防水袋\n自拍杆 三脚架 内存卡 /\n/ 流量卡 手机卡 不限速 上网卡"
},
{
"path": "jd_login/Method_Second/main.py",
"chars": 9501,
"preview": "\"\"\"\n京东试用自动申请程序,每天仅需执行一次即可\n\"\"\"\n\nfrom selenium import webdriver\nfrom selenium.webdriver.support.ui import WebDriverWait\nfr"
},
{
"path": "jd_login/README.md",
"chars": 16,
"preview": "#### Jd Spider.\n"
},
{
"path": "jd_login/login_by_selenium.py",
"chars": 536,
"preview": "# tested on ubuntu15.04\nimport time\nfrom selenium import webdriver\n\nlogin_url = 'https://passport.jd.com/new/login.aspx'"
},
{
"path": "lagou/Lagou.py",
"chars": 4503,
"preview": "# -*- coding:utf-8 -*-\nimport re\nimport os\nimport time\nimport json\nimport sys\nimport subprocess\nimport requests\nimport h"
},
{
"path": "liepin/README.md",
"chars": 39,
"preview": "# scrapy_liepin\n\nscrapy爬猎聘,通过公司名搜索公司职位\n"
},
{
"path": "liepin/liepinSpd/liepinSpd/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "liepin/liepinSpd/liepinSpd/dbhelper.py",
"chars": 3121,
"preview": "import pymysql\nfrom scrapy.utils.project import get_project_settings#引入settings配置\n\nclass DBHelper():\n\n def __init__(s"
},
{
"path": "liepin/liepinSpd/liepinSpd/items.py",
"chars": 659,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# https://doc.scrapy."
},
{
"path": "liepin/liepinSpd/liepinSpd/middlewares.py",
"chars": 4142,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://doc.scr"
},
{
"path": "liepin/liepinSpd/liepinSpd/pipelines.py",
"chars": 3543,
"preview": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES s"
},
{
"path": "liepin/liepinSpd/liepinSpd/settings.py",
"chars": 18509,
"preview": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for liepinSpd project\n#\n# For simplicity, this file contains only settings co"
},
{
"path": "liepin/liepinSpd/liepinSpd/spiders/__init__.py",
"chars": 161,
"preview": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on "
},
{
"path": "liepin/liepinSpd/liepinSpd/spiders/lpspider.py",
"chars": 4329,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nimport scrapy\nimport re\nfrom datetime import datetime\nimport pandas as p"
},
{
"path": "liepin/liepinSpd/run_liepin1.py",
"chars": 376,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# 获取settings.py模块的设置\nfrom scrapy.crawler import CrawlerProcess\nfrom scra"
},
{
"path": "liepin/liepinSpd/scrapy.cfg",
"chars": 261,
"preview": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrap"
},
{
"path": "liepin/liepinSpd2/liepinSpd2/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "liepin/liepinSpd2/liepinSpd2/items.py",
"chars": 713,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# https://doc.scrapy."
},
{
"path": "liepin/liepinSpd2/liepinSpd2/middlewares.py",
"chars": 5690,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://doc.scr"
},
{
"path": "liepin/liepinSpd2/liepinSpd2/pipelines.py",
"chars": 3191,
"preview": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES s"
},
{
"path": "liepin/liepinSpd2/liepinSpd2/settings.py",
"chars": 14989,
"preview": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for liepinSpd2 project\n#\n# For simplicity, this file contains only settings c"
},
{
"path": "liepin/liepinSpd2/liepinSpd2/spiders/__init__.py",
"chars": 161,
"preview": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on "
},
{
"path": "liepin/liepinSpd2/liepinSpd2/spiders/liepinJob.py",
"chars": 6905,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nimport scrapy\nimport re\nimport json\nfrom datetime import datetime\nimport"
},
{
"path": "liepin/liepinSpd2/run_liepin2.py",
"chars": 378,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# 获取settings.py模块的设置\nfrom scrapy.crawler import CrawlerProcess\nfrom scra"
},
{
"path": "liepin/liepinSpd2/scrapy.cfg",
"chars": 263,
"preview": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrap"
},
{
"path": "liepin/liepinSpd_500/liepinSpd/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "liepin/liepinSpd_500/liepinSpd/dbhelper.py",
"chars": 3121,
"preview": "import pymysql\nfrom scrapy.utils.project import get_project_settings#引入settings配置\n\nclass DBHelper():\n\n def __init__(s"
},
{
"path": "liepin/liepinSpd_500/liepinSpd/items.py",
"chars": 659,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# https://doc.scrapy."
},
{
"path": "liepin/liepinSpd_500/liepinSpd/middlewares.py",
"chars": 4142,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://doc.scr"
},
{
"path": "liepin/liepinSpd_500/liepinSpd/pipelines.py",
"chars": 3543,
"preview": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES s"
},
{
"path": "liepin/liepinSpd_500/liepinSpd/settings.py",
"chars": 18509,
"preview": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for liepinSpd project\n#\n# For simplicity, this file contains only settings co"
},
{
"path": "liepin/liepinSpd_500/liepinSpd/spiders/__init__.py",
"chars": 161,
"preview": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on "
},
{
"path": "liepin/liepinSpd_500/liepinSpd/spiders/lpspider.py",
"chars": 3179,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\nimport scrapy\nimport re\nfrom datetime import datetime\nimport pandas as p"
},
{
"path": "liepin/liepinSpd_500/run_liepin1.py",
"chars": 376,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# 获取settings.py模块的设置\nfrom scrapy.crawler import CrawlerProcess\nfrom scra"
},
{
"path": "liepin/liepinSpd_500/scrapy.cfg",
"chars": 261,
"preview": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrap"
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/items.py",
"chars": 636,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# https://doc.scrapy."
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/middlewares.py",
"chars": 4185,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://doc.scr"
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/pipelines.py",
"chars": 4780,
"preview": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES s"
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/settings.py",
"chars": 12551,
"preview": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for liepinSpecialCom project\n#\n# For simplicity, this file contains only sett"
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/spiders/__init__.py",
"chars": 161,
"preview": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on "
},
{
"path": "liepin/liepinSpecialCom/liepinSpecialCom/spiders/lpspecialcom.py",
"chars": 4631,
"preview": "import scrapy\nimport re\nfrom datetime import datetime\nimport pandas as pd\nimport time\n\nfrom liepinSpd.items import Liepi"
},
{
"path": "liepin/liepinSpecialCom/run_liepinspecialcom.py",
"chars": 387,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# 获取settings.py模块的设置\nfrom scrapy.crawler import CrawlerProcess\nfrom scra"
},
{
"path": "liepin/liepinSpecialCom/scrapy.cfg",
"chars": 275,
"preview": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrap"
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/items.py",
"chars": 593,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your scraped items\n#\n# See documentation in:\n# https://doc.scrapy."
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/middlewares.py",
"chars": 5193,
"preview": "# -*- coding: utf-8 -*-\n\n# Define here the models for your spider middleware\n#\n# See documentation in:\n# https://doc.scr"
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/pipelines.py",
"chars": 4823,
"preview": "# -*- coding: utf-8 -*-\n\n# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES s"
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/settings.py",
"chars": 12791,
"preview": "# -*- coding: utf-8 -*-\n\n# Scrapy settings for liepinSpecialComJob project\n#\n# For simplicity, this file contains only s"
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/spiders/__init__.py",
"chars": 161,
"preview": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on "
},
{
"path": "liepin/liepinSpecialComJob/liepinSpecialComJob/spiders/lpspecialcomjob.py",
"chars": 6041,
"preview": "import json\n\nimport scrapy\nimport re\nfrom datetime import datetime\nimport pandas as pd\nimport time\nfrom common.util impo"
},
{
"path": "liepin/liepinSpecialComJob/run_liepinspecialjob.py",
"chars": 393,
"preview": "# !/usr/bin/env python\n# -*- coding: utf-8 -*-\n\n# 获取settings.py模块的设置\nfrom scrapy.crawler import CrawlerProcess\nfrom scra"
},
{
"path": "liepin/liepinSpecialComJob/scrapy.cfg",
"chars": 281,
"preview": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# https://scrap"
},
{
"path": "liepin/liepin_login.py",
"chars": 2975,
"preview": "# -*- coding: utf-8 -*-\n'''\nRequired\n- requests \n- bs4\n'''\n# 输入密码不可见模块导入\nimport getpass\nimport hashlib\nimport requests\nf"
},
{
"path": "qqmusic/qqmusic_spider.py",
"chars": 3704,
"preview": "# -*- coding: utf-8 -*-\n# @Author: MediocrityXT\n# @Github: https://github.com/MediocrityXT\n\n\nimport requests\nimport exec"
},
{
"path": "qqmusic/sign.js",
"chars": 12787,
"preview": "function __sign_hash_20200305 (n) {\n function l(n, t) {\n var o = (65535 & n) + (65535 & t);\n return (n "
},
{
"path": "qqzone/qq_zone.py",
"chars": 1126,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\ninfo:\nauthor:CriseLYJ\ngithub:https://github.com/CriseLYJ/\nupdate_time:"
},
{
"path": "qsbk/qiushibaike.py",
"chars": 2034,
"preview": "# !/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n# 1. 导入线程池模块\n# 线程池\nimport gevent.monkey\ngevent.monkey.patch_all()\nfrom geve"
},
{
"path": "sina/sina.py",
"chars": 1232,
"preview": "# 这里需要使用getpass模块才能使输入密码不可见\nimport getpass\nimport requests\nimport hashlib\nimport time\n\n\"\"\"\ninfo:\nauthor:CriseLYJ\ngithub:"
},
{
"path": "sina/spider/Ajax_weibo.py",
"chars": 4814,
"preview": "# -*- coding: utf-8 -*-\nfrom urllib.parse import urlencode\nimport requests, pymysql\nfrom pyquery import PyQuery as pq\nfr"
},
{
"path": "sina/spider/selenium_test.py",
"chars": 2562,
"preview": "# -*- coding: utf-8 -*-\nfrom urllib.parse import urlencode\nimport requests,pymysql\nfrom pyquery import PyQuery as pq\nfro"
},
{
"path": "taobao/taobao_via_username_password.py",
"chars": 3165,
"preview": "import time\n\nfrom selenium import webdriver\nfrom selenium.webdriver.common.by import By\nfrom selenium.webdriver.support "
},
{
"path": "taobao/taobao_via_weibo.py",
"chars": 2603,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\"\"\"\nauthor : CriseLYJ\ngithub : https://github.com/CriseLYJ/\nupdate_time : 201"
},
{
"path": "tieba/tieba_spider.py",
"chars": 1690,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n\"\"\"\ninfo:\nauthor:CriseLYJ\ngithub:https://github.com/CriseLYJ/\nupdate_time:20"
},
{
"path": "tuchong/tuchong.py",
"chars": 3444,
"preview": "import requests\nimport re\nimport os\nfrom hashlib import md5\nfrom requests.exceptions import RequestException\n\n\"\"\"\ninfo:\n"
},
{
"path": "webWeixin/webWeixin.py",
"chars": 6698,
"preview": "import os\nimport re\nimport time\nimport sys\nimport subprocess\nimport requests\nimport xml.dom.minidom\nimport json\n\n\"\"\"\ninf"
},
{
"path": "xiamiMusic/README",
"chars": 129,
"preview": "xiami music login module\n========================\n\n@upload and test date: 2020-08-17\n@use module: pyppeteer==0.2.2\n@auth"
},
{
"path": "xiamiMusic/api.py",
"chars": 3221,
"preview": "# -*- coding: utf-8 -*-\n# @Author: Kris\n# @Mail: criselyj@163.com\n# @Date: 2020-08-14 17:40:11\nimport os\nimport random"
},
{
"path": "zhaopingou/zhaopingou_login.py",
"chars": 1589,
"preview": "#!/usr/bin/ python3\n# -*- coding: utf-8 -*-\nimport requests\n\n\"\"\"\ninfo:\nauthor:CriseLYJ\ngithub:https://github.com/CriseLY"
}
]
// ... and 1 more files (download for full content)
About this extraction
This page contains the full source code of the Kr1s77/awesome-python-login-model GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 99 files (14.4 MB), approximately 96.0k tokens, and a symbol index with 318 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.