Repository: lining0806/PythonSpiderNotes
Branch: master
Commit: da645036061f
Files: 39
Total size: 67.4 KB
Directory structure:
gitextract_ojeilycn/
├── Captcha1/
│ ├── !Test.bat
│ ├── ReadMe.md
│ ├── pic/
│ │ └── fnord.tif
│ ├── pytesser_pro/
│ │ ├── __init__.py
│ │ ├── errors.py
│ │ ├── pytesser_pro.py
│ │ └── util.py
│ └── tess_test.py
├── NewsSpider/
│ ├── NewsSpider.py
│ └── ReadMe.md
├── QunarSpider/
│ ├── QunarSpider.py
│ └── ReadMe.md
├── ReadMe.md
├── Spider_Java/
│ ├── README.md
│ ├── Spider_Java1/
│ │ ├── .classpath
│ │ ├── .project
│ │ ├── lib/
│ │ │ └── mongo-java-driver-2.13.0-rc1.jar
│ │ └── src/
│ │ ├── synchronizetest/
│ │ │ └── Test.java
│ │ └── wallstreetcnsave/
│ │ └── WallstreetcnSaveTest.java
│ └── Spider_Java2/
│ ├── .classpath
│ ├── .project
│ ├── lib/
│ │ └── mongo-java-driver-2.13.0-rc1.jar
│ └── src/
│ ├── synchronizetest/
│ │ └── Test.java
│ └── wallstreetcnsave/
│ └── WallstreetcnSaveTest.java
├── Spider_Python/
│ ├── README.md
│ └── WallstreetcnSaveTest.py
├── WechatSearchProjects/
│ ├── README.md
│ ├── Spider_Main.py
│ ├── WechatSearchTest.py
│ └── Wechatproject/
│ ├── Wechatproject/
│ │ ├── __init__.py
│ │ ├── items.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders/
│ │ ├── __init__.py
│ │ └── spider.py
│ └── scrapy.cfg
└── ZhihuSpider/
├── ReadMe.md
├── ZhihuSpider.py
└── config.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: Captcha1/!Test.bat
================================================
REM Run the OCR test script on the price image, then keep the console window open.
python tess_test.py ./pic/get_price_img.png
pause
================================================
FILE: Captcha1/ReadMe.md
================================================
### 验证码识别项目第一版:Captcha1
本项目采用Tesseract V3.01版本(V3.02版本在训练时有改动,多shapeclustering过程)
**Tesseract用法:**
* 配置环境变量TESSDATA_PREFIX =“D:\Tesseract-ocr\”,即tessdata的目录,在源码中会到这个路径下查找相应的字库文件用来识别。
* 命令格式:
`tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]`
* 只识别成数字
`tesseract imagename outputbase -l eng digits`
* 解决empty page!!
**-psm N**
7 = Treat the image as a single text line
tesseract imagename outputbase -l eng -psm 7
* configfile 参数值为tessdata\configs 和 tessdata\tessconfigs 目录下的文件名:
`tesseract imagename outputbase -l eng nobatch`
**验证码识别项目使用方法1:**
* 将下载的图片放到./pic目录下,
验证码图片名称:get_random.jpg
价格图片名称:get_price_img.png
* 命令格式:
验证码图片识别:python tess_test.py ./pic/get_random.jpg
价格图片识别:python tess_test.py ./pic/get_price_img.png
打印出识别的结果
若要将结果存在临时文本文件**temp.txt**中,则修改pytesser_pro.py中代码"**cleanup_scratch_flag = True**"改为"**cleanup_scratch_flag = False**"
================================================
FILE: Captcha1/pytesser_pro/__init__.py
================================================
================================================
FILE: Captcha1/pytesser_pro/errors.py
================================================
"""Test for exceptions raised in the tesseract.exe logfile"""
class Tesser_General_Exception(Exception):
    """Base error raised when the Tesseract log file reports a failure."""
class Tesser_Invalid_Filetype(Tesser_General_Exception):
    """Specialised Tesseract error.

    NOTE(review): presumably meant for unsupported image file types; nothing
    in the code shown here raises it -- confirm before relying on it.
    """
def check_for_errors(logfile="tesseract.log"):
    """Raise if the Tesseract log file records an error.

    Args:
        logfile: Path of the log file to inspect.

    Raises:
        Tesser_General_Exception: If the text "Error" occurs anywhere in the
            log; the exception carries the complete log text.
        IOError: If the log file cannot be opened.
    """
    # ``with`` closes the file even when reading fails.
    with open(logfile) as inf:
        text = inf.read()
    # All error conditions result in "Error" somewhere in logfile
    if "Error" in text:
        raise Tesser_General_Exception(text)
================================================
FILE: Captcha1/pytesser_pro/pytesser_pro.py
================================================
import Image
import subprocess
import util
import errors
# Module-level configuration shared by the functions below.
tesseract_exe_name = "tesseract" # Name of executable to be called at command line
scratch_image_name = "temp.bmp" # This file must be .bmp or other Tesseract-compatible format
scratch_text_name_root = "temp" # Leave out the .txt extension
# Default for the ``cleanup`` argument of the functions below: when True the
# scratch files are deleted after the OCR operation; False keeps them on disk.
cleanup_scratch_flag = False # Temporary files cleaned up after OCR operation
def call_tesseract(input_filename, output_filename, bool_digits=False):
    """Run the external tesseract program on an image file.

    Tesseract writes the recognised text to ``output_filename + '.txt'``.

    Args:
        input_filename: Image file handed to tesseract.
        output_filename: Output base name, without the ``.txt`` extension.
        bool_digits: True selects the ``test_digits`` language data (price
            images); False selects ``eng`` with the ``eng_characters``
            config (English letters).

    Raises:
        Tesser_General_Exception: Via ``errors.check_for_errors`` when
            tesseract exits with a non-zero status and its log reports an
            error.
    """
    # Both modes use "-psm 7" (single text line) and the "nobatch" config.
    # Earlier variants kept for reference:
    #   price:   " -l eng -psm 7 nobatch eng_digits"
    #   letters: " -l test_eng -psm 7 nobatch"
    if bool_digits:
        options = " -l test_digits -psm 7 nobatch"
    else:
        options = " -l eng -psm 7 nobatch eng_characters"
    command = tesseract_exe_name+" "+input_filename+" "+output_filename+options
    exit_status = subprocess.call(command, shell=True)
    if exit_status != 0:
        errors.check_for_errors()
def image_to_string(im, cleanup = cleanup_scratch_flag, bool_digits=False):
    """Run OCR on an in-memory image and return the recognised text.

    The image is saved to the scratch image file, tesseract is run on it and
    the resulting scratch text file is read back.

    Args:
        im: Image object accepted by ``util.image_to_scratch``.
        cleanup: When true, the scratch files are deleted afterwards, whether
            or not the OCR succeeded.
        bool_digits: Passed through to ``call_tesseract``.

    Returns:
        The text read from the scratch text file.
    """
    try:
        util.image_to_scratch(im, scratch_image_name)
        call_tesseract(scratch_image_name, scratch_text_name_root, bool_digits)
        return util.retrieve_text(scratch_text_name_root)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
def image_file_to_string(filename, cleanup = cleanup_scratch_flag, graceful_errors=True, bool_digits=False):
    """Run OCR on an image file and return the recognised text.

    Tesseract is first applied to ``filename`` directly.  If that raises
    ``Tesser_General_Exception`` and ``graceful_errors`` is true, the file is
    opened as an image and retried through ``image_to_string``, which
    re-saves it under the scratch image name; otherwise the exception is
    re-raised.

    Args:
        filename: Path of the image file.
        cleanup: When true, the scratch files are deleted afterwards.
        graceful_errors: Enables the retry described above.
        bool_digits: Passed through to ``call_tesseract``.

    Returns:
        The recognised text.
    """
    try:
        try:
            call_tesseract(filename, scratch_text_name_root, bool_digits)
            return util.retrieve_text(scratch_text_name_root)
        except errors.Tesser_General_Exception:
            if not graceful_errors:
                raise
            return image_to_string(Image.open(filename), cleanup, bool_digits)
    finally:
        if cleanup:
            util.perform_cleanup(scratch_image_name, scratch_text_name_root)
================================================
FILE: Captcha1/pytesser_pro/util.py
================================================
"""Utility functions for processing images for delivery to Tesseract"""
import os
def image_to_scratch(im, scratch_image_name):
    """Write the in-memory image to the scratch file with dpi set to (200, 200).

    The original note states that the .bmp format is read correctly by
    Tesseract.
    """
    resolution = (200, 200)
    im.save(scratch_image_name, dpi=resolution)
def retrieve_text(scratch_text_name_root):
    """Return the full contents of ``scratch_text_name_root + '.txt'``.

    Args:
        scratch_text_name_root: Path of the text file without its ``.txt``
            extension.

    Raises:
        IOError: If the file cannot be opened.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and ``with``
    # closes the handle even if reading fails.
    with open(scratch_text_name_root + '.txt') as inf:
        return inf.read()
def perform_cleanup(scratch_image_name, scratch_text_name_root):
    """Delete the scratch image, the scratch text file and ``tesseract.log``.

    Files that are missing or cannot be removed are skipped silently.
    """
    leftovers = [scratch_image_name, scratch_text_name_root + '.txt', "tesseract.log"]
    for path in leftovers:
        try:
            os.remove(path)
        except OSError:
            continue
================================================
FILE: Captcha1/tess_test.py
================================================
# coding: utf-8
import os
import sys
import subprocess
from pytesser_pro.pytesser_pro import *
import Image, ImageEnhance, ImageFilter
from pylab import *
# 二值化并转格式
def binary(image_name, binary_image_name):
    """Binarize an image with ImageMagick so that it is black text on white.

    Runs ``convert -monochrome`` to write ``binary_image_name`` (binarize and
    convert format).  If all four corner pixels of the result are black (0),
    the image is taken to be white-on-black and is inverted in place with
    ``convert -negate``.

    Args:
        image_name: Path of the source image.
        binary_image_name: Path of the binarized image to create.
    """
    # Target: white background, black text.
    subprocess.call("convert -monochrome "+image_name+" "+binary_image_name, shell=True)
    im = Image.open(binary_image_name)
    w, h = im.size
    # Only the four corners are inspected, so read them directly instead of
    # copying every pixel into a list first.
    corners = [im.getpixel(xy) for xy in ((0, 0), (w-1, 0), (0, h-1), (w-1, h-1))]
    if all(value == 0 for value in corners): # 0 = black, 255 = white
        # Not black-on-white: invert the grey levels.
        subprocess.call("convert -negate "+binary_image_name+" "+binary_image_name, shell=True)
# 计算范围内点的个数
def numpoint(im):
    """Return the number of black pixels (value 0) in the image.

    Args:
        im: Image-like object whose ``getdata()`` yields one value per pixel
            (0 = black, 255 = white).
    """
    return sum(1 for pixel in im.getdata() if pixel == 0)
# 投影法去干扰线
def pointmidu(binary_image_name, midu_image_name):
    """Remove sparse interference by projection and crop to the inked columns.

    The image is processed in vertical strips 5 pixels wide; a strip holding
    fewer than 20 black pixels is painted white.  The result is then cropped
    horizontally to the span between the first and the last column that
    still contains a black pixel, and saved.

    Args:
        binary_image_name: Path of the binarized input image
            (0 = black, 255 = white).
        midu_image_name: Path the processed image is saved to.
    """
    im = Image.open(binary_image_name)
    w, h = im.size
    strip_width = 5  # formerly named ``len``, which shadowed the builtin
    min_black = 20   # strips with fewer black pixels are treated as noise
    for x in range(0, w, strip_width):
        # Clamp the last strip to the image width so that putpixel is never
        # asked to write a column >= w when w is not a multiple of the
        # strip width.
        right = min(x+strip_width, w)
        if numpoint(im.crop((x, 0, right, h))) < min_black:
            for i in range(x, right):
                for j in range(h):
                    im.putpixel((i, j), 255) # 0 = black, 255 = white
    data = list(im.getdata())
    # Number of black pixels in every column (vertical projection).
    data_column = [sum(1 for y in range(h) if data[y*w+x] == 0) for x in range(w)]
    inked = [x for x, count in enumerate(data_column) if count != 0]
    if inked:
        im = im.crop((inked[0], 0, inked[-1]+1, h))
    # With no black pixel left there is nothing to crop to; the all-white
    # image is saved unchanged instead of building an inverted crop box.
    im.save(midu_image_name)
# 图像增强
def filter_enhance(midu_image_name, midu_image_name_pro1):
    """Denoise the image, raise its contrast and save it as a 1-bit image.

    Args:
        midu_image_name: Path of the input image.
        midu_image_name_pro1: Path the enhanced image is saved to.
    """
    source = Image.open(midu_image_name)
    # Denoise with a median filter.
    denoised = source.filter(ImageFilter.MedianFilter())
    # Enhance by a factor of 2.  The original comment called this
    # "brightness", but the enhancer actually used is Contrast.
    boosted = ImageEnhance.Contrast(denoised).enhance(2)
    boosted.convert('1').save(midu_image_name_pro1)
# 字符分割
def seg(midu_image_name_pro1, midu_image_name_pro2, num):
    """Separate characters by painting white bands between ``num`` equal cells.

    Args:
        midu_image_name_pro1: Path of the input image.
        midu_image_name_pro2: Path the segmented image is saved to.
        num: Number of equal-width cells; ``num - 1`` separator bands are
            painted.
    """
    im = Image.open(midu_image_name_pro1)
    w, h = im.size
    band = 2  # formerly named ``len``, which shadowed the builtin
    for i in range(num-1):
        # Floor division keeps the column index an int on Python 3 as well.
        start = (i+1)*w//num
        # The band covers columns start..start+band inclusive, clipped so it
        # never runs past the right edge of the image.
        stop = min(start+band+1, w)
        for m in range(start, stop):
            for n in range(h):
                im.putpixel((m, n), 255) # 0 = black, 255 = white
    im.save(midu_image_name_pro2)
def get_aim1_point(im):
    """Return the topmost black pixel of every column that has one.

    Args:
        im: Image-like object with ``size`` (width, height) and ``getdata()``
            yielding pixels row by row (0 = black, 255 = white).

    Returns:
        A list of ``(x, y)`` tuples ordered by ``x``; columns without a black
        pixel are omitted.
    """
    w, h = im.size
    data = list(im.getdata())
    aim = []
    for x in range(w):
        # First black pixel when scanning the column from top to bottom.
        y = next((row for row in range(h) if data[row*w+x] == 0), None)
        if y is not None:
            aim.append((x, y))
    return aim
def get_aim2_point(im):
    """Return the bottommost black pixel of every column that has one.

    Args:
        im: Image-like object with ``size`` (width, height) and ``getdata()``
            yielding pixels row by row (0 = black, 255 = white).

    Returns:
        A list of ``(x, y)`` tuples ordered by ``x``; columns without a black
        pixel are omitted.
    """
    w, h = im.size
    data = list(im.getdata())
    aim = []
    for x in range(w):
        # First black pixel when scanning the column from bottom to top.
        y = next((row for row in range(h-1, -1, -1) if data[row*w+x] == 0), None)
        if y is not None:
            aim.append((x, y))
    return aim
if __name__=='__main__':
    # Script entry point: choose the input image and the recognition mode,
    # binarize the image, then either OCR it directly (price digits) or
    # clean it up first (captcha letters).
    if len(sys.argv) == 1:
        image_name = "./pic/get_random.jpg" # captcha image name
        digits = False
        # image_name = "./pic/get_price_img.png" # price image name
        # digits = True
    elif len(sys.argv) == 2:
        # The mode is derived from the file name given on the command line:
        # "get_random" -> captcha letters, "get_price_img" -> price digits.
        if sys.argv[1].find("get_random") != -1:
            image_name = sys.argv[1]
            digits = False
        elif sys.argv[1].find("get_price_img") != -1:
            image_name = sys.argv[1]
            digits = True
        else:
            print "Please Input the Correct Image Name!"
            sys.exit(0)
    else:
        print "Too Many Arguments!"
        sys.exit(0)
    # Binarize and convert the format
    binary_image_name = os.path.splitext(image_name)[0]+"_binary.png"
    binary(image_name, binary_image_name)
    im = Image.open(binary_image_name)
    print im.format, im.size, im.mode
    if digits:
        # Price images are recognised straight from the binarized file.
        text = image_file_to_string(binary_image_name, bool_digits=digits)
        print text.replace("\n", "")
    else:
        # Remove interference lines with the projection method
        fpathandname , fext = os.path.splitext(binary_image_name)
        midu_image_name = fpathandname+"_midu"+fext
        pointmidu(binary_image_name, midu_image_name)
        fpathandname , fext = os.path.splitext(midu_image_name)
        # Remove interference lines (disabled experiment)
        # im = Image.open(midu_image_name)
        # w, h = im.size
        # data = list(im.getdata())
        # aim1 = get_aim1_point(im)
        # for x, y in aim1:
        #     curr = data[y*w+x]
        #     prev = data[(y-1)*w+x]
        #     next = data[(y+1)*w+x]
        #
        #     if prev == 0 and next == 0: # 0 = black, 255 = white
        #         continue
        #     if prev == 0:
        #         im.putpixel((x, y), 255)
        #         im.putpixel((x, y-1), 255)
        #     elif next == 0:
        #         im.putpixel((x, y), 255)
        #         im.putpixel((x, y+1), 255)
        #     else:
        #         im.putpixel((x, y), 255)
        # data = list(im.getdata())
        # aim2 = get_aim2_point(im)
        # for x, y in aim2:
        #     curr = data[y*w+x]
        #     prev = data[(y-1)*w+x]
        #     next = data[(y+1)*w+x]
        #
        #     if prev == 0 and next == 0: # 0 = black, 255 = white
        #         continue
        #     if prev == 0:
        #         im.putpixel((x, y), 255)
        #         im.putpixel((x, y-1), 255)
        #     elif next == 0:
        #         im.putpixel((x, y), 255)
        #         im.putpixel((x, y+1), 255)
        #     else:
        #         im.putpixel((x, y), 255)
        # midu_image_name_new = fpathandname+"_new"+fext
        # im.save(midu_image_name_new)
        # Image enhancement
        midu_image_name_pro1 = fpathandname+"_pro1"+fext
        filter_enhance(midu_image_name, midu_image_name_pro1)
        # Character segmentation (disabled)
        # num = 4
        # midu_image_name_pro2 = fpathandname+"_pro2"+fext
        # seg(midu_image_name_pro1, midu_image_name_pro2, num)
        # im = Image.open(midu_image_name)
        # text = image_to_string(im)
        # print text.replace("\n", "")
        # OCR the enhanced image and print the text without line breaks.
        text = image_file_to_string(midu_image_name_pro1, bool_digits=digits)
        print text.replace("\n", "")
================================================
FILE: NewsSpider/NewsSpider.py
================================================
# -*- coding: utf-8 -*-
import os
import sys
import urllib2
import requests
import re
from lxml import etree
def StringListSave(save_path, filename, slist):
    """Write pairs of strings to ``save_path/filename.txt`` encoded as UTF-8.

    Each pair becomes one line: first element, two tab characters, second
    element.  The directory is created when it does not exist yet.

    Args:
        save_path: Directory to write into.
        filename: File name without the ``.txt`` extension.
        slist: Iterable of two-element sequences of text.
    """
    import io  # local import: ``io`` is not among this file's top-level imports

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    path = save_path+"/"+filename+".txt"
    # Encode once at the file boundary instead of per field; this produces the
    # same bytes on Python 2 and also behaves correctly on Python 3.
    with io.open(path, "w", encoding="utf8") as fp:
        for s in slist:
            fp.write(u"%s\t\t%s\n" % (s[0], s[1]))
def Page_Info(myPage):
    '''Extract the entries of the ranking page with a regular expression.

    Returns the list produced by re.findall; the caller in this file unpacks
    every entry as an (item, url) pair.
    '''
    # NOTE(review): the pattern below is an empty raw string split across two
    # lines, which is not valid Python.  The original pattern, which needs two
    # capture groups to match how the caller unpacks the result, appears to
    # have been lost -- restore it from version control.  TODO confirm.
    mypage_Info = re.findall(r'
', myPage, re.S)
    return mypage_Info
def New_Page_Info(new_page):
    '''Extract (title, url) pairs from a ranking sub-page with XPath.

    XPath is used instead of a regular expression because, as the original
    note puts it, the regex is slow and XPath is fast.  The texts and the
    href attributes of all ``tr/td/a`` links are collected separately and
    paired up by position; the two lists must have the same length.
    '''
    tree = etree.HTML(new_page)
    titles = tree.xpath('//tr/td/a/text()')
    links = tree.xpath('//tr/td/a/@href')
    assert len(titles) == len(links)
    return zip(titles, links)
def Spider(url):
    """Crawl the ranking page at ``url`` and every section page it links to.

    The page is downloaded and decoded as GBK, and the (item, url) pairs
    found by ``Page_Info`` are saved as file ``0_...``.  Each listed section
    page is then downloaded and its (title, url) pairs saved as
    ``<n>_<item>``, numbered from 1.

    Args:
        url: Address of the ranking start page.
    """
    print("downloading  %s" % url)
    myPage = requests.get(url).content.decode("gbk")
    myPageResults = Page_Info(myPage)
    save_path = u"网易新闻抓取"
    filename = str(0)+"_"+u"新闻排行榜"
    StringListSave(save_path, filename, myPageResults)
    # ``item_url`` keeps the ``url`` argument from being overwritten by the
    # loop, and ``enumerate`` replaces the hand-maintained counter.
    for i, (item, item_url) in enumerate(myPageResults, 1):
        print("downloading  %s" % item_url)
        new_page = requests.get(item_url).content.decode("gbk")
        newPageResults = New_Page_Info(new_page)
        filename = str(i)+"_"+item
        StringListSave(save_path, filename, newPageResults)
if __name__ == '__main__':
    # Crawl the NetEase news ranking page and the pages it links to.
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")
================================================
FILE: NewsSpider/ReadMe.md
================================================
### 网络爬虫之最基本的爬虫:爬取[网易新闻排行榜](http://news.163.com/rank/)
**一些说明:**
* 使用urllib2或requests包来爬取页面。
* 使用正则表达式分析一级页面,使用Xpath来分析二级页面。
* 将得到的标题和链接,保存为本地文件。
================================================
FILE: QunarSpider/QunarSpider.py
================================================
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import time
import datetime
import codecs
import multiprocessing as mp
from os import makedirs
from os.path import exists
from selenium import webdriver
from selenium.webdriver.common.proxy import *
# Start page that every browser instance opens before searching.
site = 'http://flight.qunar.com'
# Cities to crawl (Shanghai, Beijing, Guangzhou, Shenzhen); the workers below
# search from one of them to each of the others.
hot_city_list = [u'上海', u'北京', u'广州', u'深圳']
# Number of cities in hot_city_list.
num = len(hot_city_list)
def one_driver_ticket(driver, from_city, to_city):
    """Search tomorrow's flights for one city pair and save every result page.

    The search form is filled in and submitted.  The HTML of each result
    page is written to ``./ticket/<from>,<to><date>,<page>.html`` as UTF-8,
    and the element with id ``nextXI3`` is clicked to move on, until that
    element is missing or cannot be clicked.

    Args:
        driver: Selenium WebDriver that already shows the search page.
        from_city: Departure city name.
        to_city: Destination city name.
    """
    tomorrow = datetime.date.today()+datetime.timedelta(days=1)
    # Convert the date to a string.
    tomorrow_string = tomorrow.strftime('%Y-%m-%d')

    # Fill in the search form: every field is cleared before typing into it.
    form_values = (('fromCity', from_city),
                   ('toCity', to_city),
                   ('fromDate', tomorrow_string))
    for field_name, value in form_values:
        driver.find_element_by_name(field_name).clear()
        driver.find_element_by_name(field_name).send_keys(value)
    driver.find_element_by_xpath('//button[@type="submit"]').click()
    time.sleep(5) # pause to give the browser time to react

    dstdir = u'./ticket/'
    page_num = 0
    while True:
        # Save the current page.
        source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML")
        print(type(source_code))
        if not exists(dstdir):
            makedirs(dstdir)
        page_path = u'%s%s,%s%s,%d.html' % (
            dstdir, from_city, to_city, tomorrow_string, page_num+1)
        # ``with`` closes the file even if writing fails.
        with codecs.open(page_path, 'w+', 'utf8') as f:
            f.write(source_code)

        next_page = None
        try:
            next_page = driver.find_element_by_id('nextXI3')
        except Exception as e:
            # No "next" element found: report it and treat this as the last page.
            print(e)
        print("page: %d" % (page_num+1))
        if not next_page:
            break
        try:
            next_page.click()
            time.sleep(2) # pause to give the browser time to react
            page_num += 1
        except Exception as e:
            print('next_page could not be clicked')
            print(e)
            break
def get_proxy_list(file_path):
    """Read a proxy list file with one proxy per line.

    Args:
        file_path: Path of the text file.

    Returns:
        A list holding every line with its carriage-return and newline
        characters removed.  If the file cannot be read, the error is
        printed and an empty list is returned.
    """
    try:
        # ``with`` closes the file even if reading fails.
        with open(file_path, 'r') as f:
            return [line.replace('\r', '').replace('\n', '') for line in f]
    except (IOError, OSError) as e:
        print(e)
        return []
def ticket_worker_proxy(city_proxy):
    """Crawl all routes leaving one city through a proxied Firefox.

    Args:
        city_proxy: String of the form ``"<city>,<proxy>"``; the proxy part
            is used for the HTTP, FTP and SSL proxy settings.
    """
    parts = city_proxy.split(',')
    city = parts[0]
    proxy_address = parts[1]
    proxy = Proxy({
        'proxyType': ProxyType.MANUAL,
        'httpProxy': proxy_address,
        'ftpProxy': proxy_address,
        'sslProxy': proxy_address,
        'noProxy': '' # addresses that should bypass the proxy
    })
    driver = webdriver.Firefox(proxy=proxy)
    try:
        driver.get(site)
        driver.maximize_window() # maximise the browser window
        for to_city in hot_city_list:
            if to_city == city:
                continue
            one_driver_ticket(driver, city, to_city)
    finally:
        # Close the browser even when a crawl step raises.
        driver.close()
def all_ticket_proxy():
    """Crawl every hot city through its own proxy using a process pool.

    The i-th proxy read from ``./proxy/proxy.txt`` is paired with the i-th
    hot city, and each ``"<city>,<proxy>"`` string is handed to
    ``ticket_worker_proxy``.

    Raises:
        IndexError: If the proxy file provides fewer proxies than there are
            hot cities.
    """
    # "./" is the current directory, "../" the parent directory.
    proxy_list = get_proxy_list('./proxy/proxy.txt')
    if len(proxy_list) < num:
        # Same exception type as the bare list lookup raised before, but with
        # a message that names the cause.
        raise IndexError("need %d proxies in ./proxy/proxy.txt, found %d"
                         % (num, len(proxy_list)))
    hot_city_proxy_list = [city+','+proxy
                           for city, proxy in zip(hot_city_list, proxy_list)]
    pool = mp.Pool(processes=1)
    pool.map(ticket_worker_proxy, hot_city_proxy_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]
    pool.close()
    pool.join()
def ticket_worker_no_proxy(city):
    """Crawl all routes leaving one city with a plain (unproxied) Firefox.

    Args:
        city: Departure city; it is skipped as a destination.
    """
    driver = webdriver.Firefox()
    # Chrome alternative:
    # chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    # os.environ['webdriver.chrome.driver'] = chromedriver
    # driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(site)
        driver.maximize_window() # maximise the browser window
        time.sleep(5) # pause to give the browser time to react
        for to_city in hot_city_list:
            if to_city == city:
                continue
            one_driver_ticket(driver, city, to_city)
    finally:
        # Close the browser even when a crawl step raises.
        driver.close()
def all_ticket_no_proxy():
    """Crawl every hot city without a proxy using a one-process pool.

    Each city in ``hot_city_list`` is handed to ``ticket_worker_no_proxy``.
    """
    workers = mp.Pool(processes=1)
    # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]
    workers.map(ticket_worker_no_proxy, hot_city_list)
    workers.close()
    workers.join()
if __name__ == '__main__':
    # Run the crawl without proxies and report how long it took.
    print("start")
    start = datetime.datetime.now()
    # all_ticket_proxy() # proxy
    all_ticket_no_proxy() # no proxy
    end = datetime.datetime.now()
    print("end")
    print("time:  %s" % (end-start))
================================================
FILE: QunarSpider/ReadMe.md
================================================
### 网络爬虫之Selenium使用代理登陆:爬取[去哪儿](http://flight.qunar.com/)网站
**一些说明:**
* 使用selenium模拟浏览器登陆,获取翻页操作。
* 代理可以存入一个文件,程序读取并使用。
* 支持多进程抓取。
================================================
FILE: ReadMe.md
================================================
# [Python入门网络爬虫之精华版](https://github.com/lining0806/PythonSpiderNotes)
***
Python学习网络爬虫主要分3个大的版块:**抓取**,**分析**,**存储**
另外,比较常用的爬虫框架[Scrapy](http://scrapy.org/),这里最后也详细介绍一下。
首先列举一下本人总结的相关文章,这些覆盖了入门网络爬虫需要的基本概念和技巧:[宁哥的小站-网络爬虫](http://www.lining0806.com/category/spider/)
***
当我们在浏览器中输入一个url后回车,后台会发生什么?比如说你输入[http://www.lining0806.com/](http://www.lining0806.com/),你就会看到宁哥的小站首页。
简单来说这段过程发生了以下四个步骤:
* 查找域名对应的IP地址。
* 向IP对应的服务器发送请求。
* 服务器响应请求,发回网页内容。
* 浏览器解析网页内容。
网络爬虫要做的,简单来说,就是实现浏览器的功能。通过指定url,直接返回给用户所需要的数据,而不需要一步步人工去操纵浏览器获取。
## 抓取
这一步,你要明确要得到的内容是什么?是HTML源码,还是Json格式的字符串等。
### 1. 最基本的抓取
抓取大多数情况属于get请求,即直接从对方服务器上获取数据。
首先,Python中自带urllib及urllib2这两个模块,基本上能满足一般的页面抓取。另外,[requests](https://github.com/kennethreitz/requests)也是非常有用的包,与此类似的,还有[httplib2](https://github.com/jcgregorio/httplib2)等等。
```
Requests:
import requests
response = requests.get(url)
content = requests.get(url).content
print "response headers:", response.headers
print "content:", content
Urllib2:
import urllib2
response = urllib2.urlopen(url)
content = urllib2.urlopen(url).read()
print "response headers:", response.headers
print "content:", content
Httplib2:
import httplib2
http = httplib2.Http()
response_headers, content = http.request(url, 'GET')
print "response headers:", response_headers
print "content:", content
```
此外,对于带有查询字段的url,get请求一般会将请求的数据附在url之后,以?分割url和传输数据,多个参数用&连接。
```
data = {'data1':'XXXXX', 'data2':'XXXXX'}
Requests:data为dict,json
import requests
response = requests.get(url=url, params=data)
Urllib2:data为string
import urllib, urllib2
data = urllib.urlencode(data)
full_url = url+'?'+data
response = urllib2.urlopen(full_url)
```
相关参考:[网易新闻排行榜抓取回顾](http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/)
参考项目:[网络爬虫之最基本的爬虫:爬取网易新闻排行榜](https://github.com/lining0806/PythonSpiderNotes/blob/master/NewsSpider)
### 2. 对于登陆情况的处理
**2.1 使用表单登陆**
这种情况属于post请求,即先向服务器发送表单数据,服务器再将返回的cookie存入本地。
```
data = {'data1':'XXXXX', 'data2':'XXXXX'}
Requests:data为dict,json
import requests
response = requests.post(url=url, data=data)
Urllib2:data为string
import urllib, urllib2
data = urllib.urlencode(data)
req = urllib2.Request(url=url, data=data)
response = urllib2.urlopen(req)
```
**2.2 使用cookie登陆**
使用cookie登陆,服务器会认为你是一个已登陆的用户,所以就会返回给你一个已登陆的内容。因此,需要验证码的情况可以使用带验证码登陆的cookie解决。
```
import requests
requests_session = requests.session()
response = requests_session.post(url=url_login, data=data)
```
若存在验证码,此时采用response = requests_session.post(url=url_login, data=data)是不行的,做法应该如下:
```
response_captcha = requests_session.get(url=url_login, cookies=cookies)
response1 = requests.get(url_login) # 未登陆
response2 = requests_session.get(url_login) # 已登陆,因为之前拿到了Response Cookie!
response3 = requests_session.get(url_results) # 已登陆,因为之前拿到了Response Cookie!
```
相关参考:[网络爬虫-验证码登陆](http://www.lining0806.com/6-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB-%E9%AA%8C%E8%AF%81%E7%A0%81%E7%99%BB%E9%99%86/)
参考项目:[网络爬虫之用户名密码及验证码登陆:爬取知乎网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/ZhihuSpider)
### 3. 对于反爬虫机制的处理
**3.1 使用代理**
适用情况:限制IP地址情况,也可解决由于“频繁点击”而需要输入验证码登陆的情况。
这种情况最好的办法就是维护一个代理IP池,网上有很多免费的代理IP,良莠不齐,可以通过筛选找到能用的。对于“频繁点击”的情况,我们还可以通过限制爬虫访问网站的频率来避免被网站禁掉。
```
proxies = {'http':'http://XX.XX.XX.XX:XXXX'}
Requests:
import requests
response = requests.get(url=url, proxies=proxies)
Urllib2:
import urllib2
proxy_support = urllib2.ProxyHandler(proxies)
opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
urllib2.install_opener(opener) # 安装opener,此后调用urlopen()时都会使用安装过的opener对象
response = urllib2.urlopen(url)
```
**3.2 时间设置**
适用情况:限制频率情况。
Requests,Urllib2都可以使用time库的sleep()函数:
```
import time
time.sleep(1)
```
**3.3 伪装成浏览器,或者反“反盗链”**
有些网站会检查你是不是真的浏览器访问,还是机器自动访问的。这种情况,加上User-Agent,表明你是浏览器访问即可。有时网站还会检查请求是否带有Referer信息,以及Referer是否合法,这种情况一般再加上Referer即可。
```
headers = {'User-Agent':'XXXXX'} # 伪装成浏览器访问,适用于拒绝爬虫的网站
headers = {'Referer':'XXXXX'}
headers = {'User-Agent':'XXXXX', 'Referer':'XXXXX'}
Requests:
response = requests.get(url=url, headers=headers)
Urllib2:
import urllib, urllib2
req = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(req)
```
### 4. 对于断线重连
不多说。
```
def multi_session(session, *arg):
retryTimes = 20
while retryTimes>0:
try:
return session.post(*arg)
except:
print '.',
retryTimes -= 1
```
或者
```
def multi_open(opener, *arg):
retryTimes = 20
while retryTimes>0:
try:
return opener.open(*arg)
except:
print '.',
retryTimes -= 1
```
这样我们就可以使用multi_session或multi_open对爬虫抓取的session或opener进行保持。
### 5. 多进程抓取
这里针对[华尔街见闻](http://live.wallstreetcn.com/)进行并行抓取的实验对比:[Python多进程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Python) 与 [Java单线程和多线程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Java)
相关参考:[关于Python和Java的多进程多线程计算方法对比](http://www.lining0806.com/%E5%85%B3%E4%BA%8Epython%E5%92%8Cjava%E7%9A%84%E5%A4%9A%E8%BF%9B%E7%A8%8B%E5%A4%9A%E7%BA%BF%E7%A8%8B%E8%AE%A1%E7%AE%97%E6%96%B9%E6%B3%95%E5%AF%B9%E6%AF%94/)
### 6. 对于Ajax请求的处理
对于“加载更多”情况,使用Ajax来传输很多数据。
它的工作原理是:从网页的url加载网页的源代码之后,会在浏览器里执行JavaScript程序。这些程序会加载更多的内容,“填充”到网页里。这就是为什么如果你直接去爬网页本身的url,你会找不到页面的实际内容。
这里,若使用Google Chrome分析“请求”对应的链接(方法:右键→审查元素→Network→清空,点击“加载更多”,出现对应的GET链接寻找Type为text/html的,点击,查看get参数或者复制Request URL),循环过程。
* 如果“请求”之前有页面,依据上一步的网址进行分析推导第1页。以此类推,抓取Ajax地址的数据。
* 对返回的json格式数据(str)进行正则匹配。json格式数据中,需从'\\uxxxx'形式的unicode_escape编码转换成u'\uxxxx'的unicode编码。
### 7. 自动化测试工具Selenium
Selenium是一款自动化测试工具。它能实现操纵浏览器,包括字符填充、鼠标点击、获取元素、页面切换等一系列操作。总之,凡是浏览器能做的事,Selenium都能够做到。
这里列出在给定城市列表后,使用selenium来动态抓取[去哪儿网](http://flight.qunar.com/)的票价信息的代码。
参考项目:[网络爬虫之Selenium使用代理登陆:爬取去哪儿网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/QunarSpider)
### 8. 验证码识别
对于网站有验证码的情况,我们有三种办法:
* 使用代理,更新IP。
* 使用cookie登陆。
* 验证码识别。
使用代理和使用cookie登陆之前已经讲过,下面讲一下验证码识别。
可以利用开源的Tesseract-OCR系统进行验证码图片的下载及识别,将识别的字符传到爬虫系统进行模拟登陆。当然也可以将验证码图片上传到打码平台上进行识别。如果不成功,可以再次更新验证码识别,直到成功为止。
参考项目:[验证码识别项目第一版:Captcha1](https://github.com/lining0806/PythonSpiderNotes/blob/master/Captcha1)
**爬取有两个需要注意的问题:**
* 如何监控一系列网站的更新情况,也就是说,如何进行增量式爬取?
* 对于海量数据,如何实现分布式爬取?
## 分析
抓取之后就是对抓取的内容进行分析,你需要什么内容,就从中提炼出相关的内容来。
常见的分析工具有[正则表达式](http://deerchao.net/tutorials/regex/regex.htm),[BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/),[lxml](http://lxml.de/)等等。
## 存储
分析出我们需要的内容之后,接下来就是存储了。
我们可以选择存入文本文件,也可以选择存入[MySQL](http://www.mysql.com/)或[MongoDB](https://www.mongodb.org/)数据库等。
**存储有两个需要注意的问题:**
* 如何进行网页去重?
* 内容以什么形式存储?
## Scrapy
Scrapy是一个基于Twisted的开源的Python爬虫框架,在工业中应用非常广泛。
相关内容可以参考[基于Scrapy网络爬虫的搭建](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/),同时给出这篇文章介绍的[微信搜索](http://weixin.sogou.com/weixin)爬取的项目代码,给大家作为学习参考。
参考项目:[使用Scrapy或Requests递归抓取微信搜索结果](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects)
## Robots协议
好的网络爬虫,首先需要遵守**Robots协议**。Robots协议(也称为爬虫协议、机器人协议等)的全称是“网络爬虫排除标准”(Robots Exclusion Protocol),网站通过Robots协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取。
在网站根目录下放一个robots.txt文本文件(如 https://www.taobao.com/robots.txt ),里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面,指定的页面由正则表达式表示。网络爬虫在采集这个网站之前,首先获取到这个robots.txt文本文件,然后解析到其中的规则,然后根据规则来采集网站的数据。
### 1. Robots协议规则
User-agent: 指定对哪些爬虫生效
Disallow: 指定不允许访问的网址
Allow: 指定允许访问的网址
注意: 每行的第一个英文字母要大写,冒号要使用英文(半角)冒号,冒号后面有一个空格,"/"代表整个网站
### 2. Robots协议举例
禁止所有机器人访问
User-agent: *
Disallow: /
允许所有机器人访问
User-agent: *
Disallow:
禁止特定机器人访问
User-agent: BadBot
Disallow: /
允许特定机器人访问
User-agent: GoodBot
Disallow:
禁止访问特定目录
User-agent: *
Disallow: /images/
仅允许访问特定目录
User-agent: *
Allow: /images/
Disallow: /
禁止访问特定文件
User-agent: *
Disallow: /*.html$
仅允许访问特定文件
User-agent: *
Allow: /*.html$
Disallow: /
================================================
FILE: Spider_Java/README.md
================================================
### Spider_Java
抓取网址:[华尔街见闻](http://live.wallstreetcn.com/)
单线程抓取 Spider_Java1
多线程抓取 Spider_Java2
================================================
FILE: Spider_Java/Spider_Java1/.classpath
================================================
================================================
FILE: Spider_Java/Spider_Java1/.project
================================================
Spider
org.eclipse.jdt.core.javabuilder
org.eclipse.jdt.core.javanature
================================================
FILE: Spider_Java/Spider_Java1/src/synchronizetest/Test.java
================================================
/**
*
*/
package synchronizetest;
/**
 * Entry point of the ticket-selling demo: three booths sell tickets from
 * one shared reservoir of 100.
 *
 * @author FIRELING
 */
public class Test
{
    public static void main(String[] args)
    {
        Reservoir shared = new Reservoir(100);
        // Booth's constructor starts its own thread, so creating the
        // objects is all that is needed here.
        for (int i = 0; i < 3; i++) {
            new Booth(shared);
        }
    }
}
/**
 * Holds the shared resource: the number of tickets still available.
 */
class Reservoir {
    private int total;

    public Reservoir(int t)
    {
        this.total = t;
    }

    /**
     * Sells one ticket if any remain.
     * The whole method is synchronized, so access to Reservoir.total is
     * serialized across threads.
     *
     * @return true if a ticket was sold, false if none are left
     */
    public synchronized boolean sellTicket()
    {
        if (this.total <= 0) {
            return false; // no more tickets
        }
        this.total--;
        return true; // successfully sold one
    }
}
/**
 * A ticket booth that runs as its own thread (by inheriting Thread) and
 * sells tickets from a shared reservoir.
 */
class Booth extends Thread {
    private static int threadID = 0; // owned by the class; used to name the threads
    private Reservoir release; // the reservoir this booth sells from
    private int count = 0; // tickets sold by this booth

    /**
     * Names the thread "ID:n", stores the shared reservoir and starts the
     * thread immediately.
     *
     * @param r reservoir shared by all booths
     */
    public Booth(Reservoir r) {
        super("ID:"+(++threadID));
        this.release = r; // all threads share the same reservoir
        this.start();
    }

    /**
     * Returns the thread name.
     */
    public String toString() {
        return super.getName();
    }

    /**
     * Sells tickets one at a time, pausing for a random interval after each
     * sale, until the reservoir is empty; then prints this booth's total.
     */
    public void run() {
        while(true) { // loop until no ticket is left
            if(this.release.sellTicket()) {
                this.count = this.count+1;
                System.out.println(this.getName()+":sell 1");
                try {
                    // Multiply before casting: "(int) Math.random()*100"
                    // casts first and therefore always sleeps for 0 ms.
                    sleep((int) (Math.random()*100)); // random interval of 0-99 ms
                    // sleep(100); // with identical sleep times every booth sells about the same number
                }
                catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
            else {
                break;
            }
        }
        System.out.println(this.getName()+" I sold:"+count);
    }
}
================================================
FILE: Spider_Java/Spider_Java1/src/wallstreetcnsave/WallstreetcnSaveTest.java
================================================
package wallstreetcnsave;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.Mongo;
public class WallstreetcnSaveTest implements Runnable {
// NOTE(review): presumably the MongoDB database and collection the results
// are written to; their use lies outside this excerpt -- confirm.
private static String DataBaseName = "textclassify";
private static String CollectionName = "WallstreetSaveJava";
// Base URL of the live-news API. It ends in "page=", so a page number is
// presumably appended by the caller -- confirm.
private static String url = "http://api.wallstreetcn.com/v2/livenews?&page=";
// Pattern with three capture groups: the values following "type",
// "contentHtml" and "categorySet" in one news item.
private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?";
// NOTE(review): these look like the keys under which the three captured
// values are stored; their use is not visible here -- confirm.
private static final String REGEXSTRING1 = "type";
private static final String REGEXSTRING2 = "content";
private static final String REGEXSTRING3 = "categoryset";
// Builds the lookup table from category id to category name.
// A new HashMap is created on every call.
public static Map GetMap() {
    String[][] entries = {
        { "1", "外汇" },
        { "2", "股市" },
        { "3", "商品" },
        { "4", "债市" },
        { "9", "中国" },
        { "10", "美国" },
        { "11", "欧元区" },
        { "12", "日本" },
        { "13", "英国" },
        { "14", "澳洲" },
        { "15", "加拿大" },
        { "16", "瑞士" },
        { "17", "其他地区" },
        { "5", "央行" },
    };
    Map table = new HashMap();
    for (int i = 0; i < entries.length; i++) {
        table.put(entries[i][0], entries[i][1]);
    }
    return table;
}
// Category ids grouped by kind; the ids are keys of the table built by GetMap().
// Ids 9-17: countries and regions.
private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" };
// Ids 1-4: forex, stock market, commodities, bond market.
private static String[] ruleList_property = { "1", "2", "3", "4" };
// Id 5: central bank.
private static String[] ruleList_centralbank = { "5" };
// NOTE(review): presumably the first and last page number to fetch; their
// use lies outside this excerpt -- confirm.
private static final int start = 1;
private static final int end = 3000;
// Splits a comma-separated id list ("x,x,x") and keeps only the ids that
// appear in ruleList, returning their mapped names joined by commas.
//
// categorySet: comma-separated category ids
// ruleList:    ids to keep
// map:         id -> name table; an id without an entry is rendered as "null"
// returns:     the selected names in input order, joined by "," with no
//              trailing separator; "" when nothing matches
public static String setCategory(String categorySet, String[] ruleList, Map map) {
    StringBuilder disStr = new StringBuilder();
    // Tracks whether a name has been written yet, so the separator goes
    // between names only. The former "append a comma, trim it when
    // length > 1" approach left a lone "," behind when the only name was
    // an empty string.
    boolean first = true;
    for (String s : categorySet.split(",")) {
        for (String rule : ruleList) {
            if (s.equals(rule)) {
                if (!first) {
                    disStr.append(",");
                }
                disStr.append(map.get(s));
                first = false;
                break;
            }
        }
    }
    return disStr.toString();
}
// Fetches a URL with a GET request and returns the response body decoded as
// UTF-8, with line terminators removed (lines are appended back to back).
//
// requestUrl: address to fetch
// returns:    the text read; if the request fails, the stack trace is
//             printed and whatever was read up to that point is returned
//             (an empty string when nothing was read)
private static String httpRequest(String requestUrl) {
    // Created up front so a failure before the first read returns "" rather
    // than throwing a NullPointerException on toString().
    StringBuffer buffer = new StringBuffer();
    BufferedReader bufferedReader = null;
    HttpURLConnection httpUrlConn = null;
    try {
        // Set up the GET request.
        URL url = new URL(requestUrl);
        httpUrlConn = (HttpURLConnection) url.openConnection();
        httpUrlConn.setDoInput(true);
        httpUrlConn.setRequestMethod("GET");
        // Open the input stream; the reader already decodes it as UTF-8.
        bufferedReader = new BufferedReader(
                new InputStreamReader(httpUrlConn.getInputStream(), "UTF-8"));
        // Read the response line by line. The lines are used as decoded:
        // re-encoding them with the platform default charset and decoding
        // again as UTF-8 corrupts the text wherever that default is not UTF-8.
        String line = null;
        while ((line = bufferedReader.readLine()) != null) {
            buffer.append(line);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Closing the outermost reader also closes the streams it wraps.
        if (bufferedReader != null) {
            try {
                bufferedReader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (httpUrlConn != null) {
            httpUrlConn.disconnect();
        }
    }
    return buffer.toString();
}
// 过滤掉无用的信息
public static List