Repository: lining0806/PythonSpiderNotes Branch: master Commit: da645036061f Files: 39 Total size: 67.4 KB Directory structure: gitextract_ojeilycn/ ├── Captcha1/ │ ├── !Test.bat │ ├── ReadMe.md │ ├── pic/ │ │ └── fnord.tif │ ├── pytesser_pro/ │ │ ├── __init__.py │ │ ├── errors.py │ │ ├── pytesser_pro.py │ │ └── util.py │ └── tess_test.py ├── NewsSpider/ │ ├── NewsSpider.py │ └── ReadMe.md ├── QunarSpider/ │ ├── QunarSpider.py │ └── ReadMe.md ├── ReadMe.md ├── Spider_Java/ │ ├── README.md │ ├── Spider_Java1/ │ │ ├── .classpath │ │ ├── .project │ │ ├── lib/ │ │ │ └── mongo-java-driver-2.13.0-rc1.jar │ │ └── src/ │ │ ├── synchronizetest/ │ │ │ └── Test.java │ │ └── wallstreetcnsave/ │ │ └── WallstreetcnSaveTest.java │ └── Spider_Java2/ │ ├── .classpath │ ├── .project │ ├── lib/ │ │ └── mongo-java-driver-2.13.0-rc1.jar │ └── src/ │ ├── synchronizetest/ │ │ └── Test.java │ └── wallstreetcnsave/ │ └── WallstreetcnSaveTest.java ├── Spider_Python/ │ ├── README.md │ └── WallstreetcnSaveTest.py ├── WechatSearchProjects/ │ ├── README.md │ ├── Spider_Main.py │ ├── WechatSearchTest.py │ └── Wechatproject/ │ ├── Wechatproject/ │ │ ├── __init__.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders/ │ │ ├── __init__.py │ │ └── spider.py │ └── scrapy.cfg └── ZhihuSpider/ ├── ReadMe.md ├── ZhihuSpider.py └── config.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: Captcha1/!Test.bat ================================================ python tess_test.py ./pic/get_price_img.png pause ================================================ FILE: Captcha1/ReadMe.md ================================================ ### 验证码识别项目第一版:Captcha1 本项目采用Tesseract V3.01版本(V3.02版本在训练时有改动,多shapeclustering过程) **Tesseract用法:** * 配置环境变量TESSDATA_PREFIX =“D:\Tesseract-ocr\”,即tessdata的目录,在源码中会到这个路径下查找相应的字库文件用来识别。 * 命令格式: `tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]` * 只识别成数字 `tesseract imagename outputbase -l eng digits` * 解决empty page!! **-psm N** 7 = Treat the image as a single text line tesseract imagename outputbase -l eng -psm 7 * configfile 参数值为tessdata\configs 和 tessdata\tessconfigs 目录下的文件名: `tesseract imagename outputbase -l eng nobatch` **验证码识别项目使用方法1:** * 将下载的图片放到./pic目录下, 验证码图片名称:get_random.jpg 价格图片名称:get_price_img.png * 命令格式: 验证码图片识别:python tess_test.py ./pic/get_random.jpg 价格图片识别:python tess_test.py ./pic/get_price_img.png 打印出识别的结果 若要将结果存在临时文本文件**temp.txt**中,则修改pytessr_pro.py中代码"**cleanup_scratch_flag = True**"改为"**cleanup_scratch_flag = False**" ================================================ FILE: Captcha1/pytesser_pro/__init__.py ================================================ ================================================ FILE: Captcha1/pytesser_pro/errors.py ================================================ """Test for exceptions raised in the tesseract.exe logfile""" class Tesser_General_Exception(Exception): pass class Tesser_Invalid_Filetype(Tesser_General_Exception): pass def check_for_errors(logfile = "tesseract.log"): inf = file(logfile) text = inf.read() inf.close() # All error conditions result in "Error" somewhere in logfile if text.find("Error") != -1: raise Tesser_General_Exception, text ================================================ FILE: Captcha1/pytesser_pro/pytesser_pro.py ================================================ import Image import subprocess import util import errors tesseract_exe_name = "tesseract" # Name of executable to be called at command line scratch_image_name = "temp.bmp" # This file must be .bmp or other Tesseract-compatible format scratch_text_name_root = "temp" # Leave out the .txt extension cleanup_scratch_flag = False # Temporary files cleaned up after OCR operation def call_tesseract(input_filename, output_filename, bool_digits=False): """Calls external tesseract.exe on input file (restrictions on types), outputting output_filename+'txt'""" # args = [tesseract_exe_name, input_filename, output_filename] if bool_digits: # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_digits" # price args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_digits -psm 7 nobatch" # price else: args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_characters" # English letters # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_eng -psm 7 nobatch" # English letters # print args proc = subprocess.Popen(args, shell=True) retcode = proc.wait() if retcode != 0: errors.check_for_errors() def image_to_string(im, cleanup = cleanup_scratch_flag, bool_digits=False): """Converts im to file, applies tesseract, and fetches resulting text. If cleanup=True, delete scratch files after operation.""" try: util.image_to_scratch(im, scratch_image_name) call_tesseract(scratch_image_name, scratch_text_name_root, bool_digits) text = util.retrieve_text(scratch_text_name_root) finally: if cleanup: util.perform_cleanup(scratch_image_name, scratch_text_name_root) return text def image_file_to_string(filename, cleanup = cleanup_scratch_flag, graceful_errors=True, bool_digits=False): """Applies tesseract to filename; or, if image is incompatible and graceful_errors=True, converts to compatible format and then applies tesseract. Fetches resulting text. If cleanup=True, delete scratch files after operation.""" try: try: call_tesseract(filename, scratch_text_name_root, bool_digits) text = util.retrieve_text(scratch_text_name_root) except errors.Tesser_General_Exception: if graceful_errors: im = Image.open(filename) text = image_to_string(im, cleanup, bool_digits) else: raise finally: if cleanup: util.perform_cleanup(scratch_image_name, scratch_text_name_root) return text ================================================ FILE: Captcha1/pytesser_pro/util.py ================================================ """Utility functions for processing images for delivery to Tesseract""" import os def image_to_scratch(im, scratch_image_name): """Saves image in memory to scratch file. .bmp format will be read correctly by Tesseract""" im.save(scratch_image_name, dpi=(200,200)) def retrieve_text(scratch_text_name_root): inf = file(scratch_text_name_root + '.txt') text = inf.read() inf.close() return text def perform_cleanup(scratch_image_name, scratch_text_name_root): """Clean up temporary files from disk""" for name in (scratch_image_name, scratch_text_name_root + '.txt', "tesseract.log"): try: os.remove(name) except OSError: pass ================================================ FILE: Captcha1/tess_test.py ================================================ # coding: utf-8 import os import sys import subprocess from pytesser_pro.pytesser_pro import * import Image, ImageEnhance, ImageFilter from pylab import * # 二值化并转格式 def binary(image_name, binary_image_name): # 白底黑字 args = "convert -monochrome "+image_name+" "+binary_image_name # print args proc = subprocess.Popen(args, shell=True) proc.wait() im = Image.open(binary_image_name) w, h = im.size data = list(im.getdata()) if (data[0], data[w-1], data[(h-1)*w], data[h*w-1]) == (0, 0, 0, 0): # 0-黑色,255-白色 # 若非白底黑字则灰度反转 args1 = "convert -negate "+binary_image_name+" "+binary_image_name proc1 = subprocess.Popen(args1, shell=True) proc1.wait() # 计算范围内点的个数 def numpoint(im): w, h = im.size # print w, h data = list(im.getdata()) mumpoint = 0 for x in range(w): for y in range(h): if data[y*w+x] == 0: # 0-黑色,255-白色 mumpoint += 1 return mumpoint # 投影法去干扰线 def pointmidu(binary_image_name, midu_image_name): im = Image.open(binary_image_name) w, h = im.size # print w, h len = 5 for x in range(0, w, len): box = (x, 0, x+len, h) im_box = im.crop(box) num = numpoint(im_box) # print num if num < 20: for i in range(x, x+len): for j in range(h): im.putpixel((i, j), 255) # 0-黑色,255-白色 data = list(im.getdata()) data_column = [] for x in range(w): temp = 0 for y in range(h): if data[y*w+x] == 0: # 0-黑色,255-白色 temp += 1 data_column.append(temp) # print data_column start = 0 for i in range(0, w, 1): if data_column[i] != 0: break else: start += 1 # print start end = w-1 for j in range(w-1, -1, -1): if data_column[j] != 0: break else: end += -1 # print end box_new = (start, 0, end+1, h) im_box_new = im.crop(box_new) im_box_new.save(midu_image_name) # 图像增强 def filter_enhance(midu_image_name, midu_image_name_pro1): im = Image.open(midu_image_name) # 去噪 im = im.filter(ImageFilter.MedianFilter()) # 亮度加强 enhancer = ImageEnhance.Contrast(im) im = enhancer.enhance(2) im = im.convert('1') # im.show() im.save(midu_image_name_pro1) # 字符分割 def seg(midu_image_name_pro1, midu_image_name_pro2, num): im = Image.open(midu_image_name_pro1) w, h = im.size # print w, h, w/num len = 2 for i in range(num-1): start = (i+1)*w/num end = start+len for m in range(start, end+1): for n in range(h): im.putpixel((m, n), 255) # 0-黑色,255-白色 im.save(midu_image_name_pro2) def get_aim1_point(im): aim = [] w, h = im.size # print w, h data = list(im.getdata()) for x in range(0, w, 1): for y in range(0, h, 1): if data[y*w+x] == 0: # 0-黑色,255-白色 start_point = (x, y) # print start_point aim.append(start_point) break return aim def get_aim2_point(im): aim = [] w, h = im.size # print w, h data = list(im.getdata()) for x in range(0, w, 1): for y in range(h-1, -1, -1): if data[y*w+x] == 0: # 0-黑色,255-白色 start_point = (x, y) # print start_point aim.append(start_point) break return aim if __name__=='__main__': if len(sys.argv) == 1: image_name = "./pic/get_random.jpg" # 验证码图片名称 digits = False # image_name = "./pic/get_price_img.png" # 价格图片名称 # digits = True elif len(sys.argv) == 2: if sys.argv[1].find("get_random") != -1: image_name = sys.argv[1] digits = False elif sys.argv[1].find("get_price_img") != -1: image_name = sys.argv[1] digits = True else: print "Please Input the Correct Image Name!" sys.exit(0) else: print "Too Many Arguments!" sys.exit(0) # 二值化并转格式 binary_image_name = os.path.splitext(image_name)[0]+"_binary.png" binary(image_name, binary_image_name) im = Image.open(binary_image_name) print im.format, im.size, im.mode if digits: text = image_file_to_string(binary_image_name, bool_digits=digits) print text.replace("\n", "") else: # 投影法去干扰线 fpathandname , fext = os.path.splitext(binary_image_name) midu_image_name = fpathandname+"_midu"+fext pointmidu(binary_image_name, midu_image_name) fpathandname , fext = os.path.splitext(midu_image_name) # 去干扰线 # im = Image.open(midu_image_name) # w, h = im.size # data = list(im.getdata()) # aim1 = get_aim1_point(im) # for x, y in aim1: # curr = data[y*w+x] # prev = data[(y-1)*w+x] # next = data[(y+1)*w+x] # # if prev == 0 and next == 0: # 0-黑色,255-白色 # continue # if prev == 0: # im.putpixel((x, y), 255) # im.putpixel((x, y-1), 255) # elif next == 0: # im.putpixel((x, y), 255) # im.putpixel((x, y+1), 255) # else: # im.putpixel((x, y), 255) # data = list(im.getdata()) # aim2 = get_aim2_point(im) # for x, y in aim2: # curr = data[y*w+x] # prev = data[(y-1)*w+x] # next = data[(y+1)*w+x] # # if prev == 0 and next == 0: # 0-黑色,255-白色 # continue # if prev == 0: # im.putpixel((x, y), 255) # im.putpixel((x, y-1), 255) # elif next == 0: # im.putpixel((x, y), 255) # im.putpixel((x, y+1), 255) # else: # im.putpixel((x, y), 255) # midu_image_name_new = fpathandname+"_new"+fext # im.save(midu_image_name_new) # 图像增强 midu_image_name_pro1 = fpathandname+"_pro1"+fext filter_enhance(midu_image_name, midu_image_name_pro1) # 字符分割 # num = 4 # midu_image_name_pro2 = fpathandname+"_pro2"+fext # seg(midu_image_name_pro1, midu_image_name_pro2, num) # im = Image.open(midu_image_name) # text = image_to_string(im) # print text.replace("\n", "") text = image_file_to_string(midu_image_name_pro1, bool_digits=digits) print text.replace("\n", "") ================================================ FILE: NewsSpider/NewsSpider.py ================================================ # -*- coding: utf-8 -*- import os import sys import urllib2 import requests import re from lxml import etree def StringListSave(save_path, filename, slist): if not os.path.exists(save_path): os.makedirs(save_path) path = save_path+"/"+filename+".txt" with open(path, "w+") as fp: for s in slist: fp.write("%s\t\t%s\n" % (s[0].encode("utf8"), s[1].encode("utf8"))) def Page_Info(myPage): '''Regex''' mypage_Info = re.findall(r'

(.*?)

.*?
', myPage, re.S) return mypage_Info def New_Page_Info(new_page): '''Regex(slowly) or Xpath(fast)''' # new_page_Info = re.findall(r'.*?(.*?)', new_page, re.S) # # new_page_Info = re.findall(r'.*?(.*?)', new_page, re.S) # bugs # results = [] # for url, item in new_page_Info: # results.append((item, url+".html")) # return results dom = etree.HTML(new_page) new_items = dom.xpath('//tr/td/a/text()') new_urls = dom.xpath('//tr/td/a/@href') assert(len(new_items) == len(new_urls)) return zip(new_items, new_urls) def Spider(url): i = 0 print "downloading ", url myPage = requests.get(url).content.decode("gbk") # myPage = urllib2.urlopen(url).read().decode("gbk") myPageResults = Page_Info(myPage) save_path = u"网易新闻抓取" filename = str(i)+"_"+u"新闻排行榜" StringListSave(save_path, filename, myPageResults) i += 1 for item, url in myPageResults: print "downloading ", url new_page = requests.get(url).content.decode("gbk") # new_page = urllib2.urlopen(url).read().decode("gbk") newPageResults = New_Page_Info(new_page) filename = str(i)+"_"+item StringListSave(save_path, filename, newPageResults) i += 1 if __name__ == '__main__': print "start" start_url = "http://news.163.com/rank/" Spider(start_url) print "end" ================================================ FILE: NewsSpider/ReadMe.md ================================================ ### 网络爬虫之最基本的爬虫:爬取[网易新闻排行榜](http://news.163.com/rank/) **一些说明:** * 使用urllib2或requests包来爬取页面。 * 使用正则表达式分析一级页面,使用Xpath来分析二级页面。 * 将得到的标题和链接,保存为本地文件。 ================================================ FILE: QunarSpider/QunarSpider.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- import os import time import datetime import codecs import multiprocessing as mp from os import makedirs from os.path import exists from selenium import webdriver from selenium.webdriver.common.proxy import * site = 'http://flight.qunar.com' hot_city_list = [u'上海', u'北京', u'广州', u'深圳'] num = len(hot_city_list) def one_driver_ticket(driver, from_city, to_city): # time = datetime.datetime.now() date = datetime.date.today() tomorrow = date+datetime.timedelta(days=1) # date格式转为string格式 tomorrow_string = tomorrow.strftime('%Y-%m-%d') driver.find_element_by_name('fromCity').clear() driver.find_element_by_name('fromCity').send_keys(from_city) driver.find_element_by_name('toCity').clear() driver.find_element_by_name('toCity').send_keys(to_city) driver.find_element_by_name('fromDate').clear() driver.find_element_by_name('fromDate').send_keys(tomorrow_string) driver.find_element_by_xpath('//button[@type="submit"]').click() time.sleep(5) # 控制间隔时间,等待浏览器反映 flag = True page_num = 0 while flag: # 保存页面 # print driver.page_source source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML") print type(source_code) dstdir = u'./ticket/' if not exists(dstdir): makedirs(dstdir) f = codecs.open(dstdir+from_city+u','+to_city+unicode(tomorrow_string)+u','+unicode(str(page_num+1))+u'.html', 'w+', 'utf8') f.write(source_code) f.close() next_page = None try: next_page = driver.find_element_by_id('nextXI3') except Exception as e: print e pass print "page: %d" % (page_num+1) if next_page: try: next_page.click() time.sleep(2) # 控制间隔时间,等待浏览器反映 page_num += 1 except Exception as e: print 'next_page could not be clicked' print e flag = False else: flag = False def get_proxy_list(file_path): proxy_list = [] try: f = open(file_path, 'r') all_lines = f.readlines() # readlines()每次按行读取整个文件内容,将读取到的内容放到一个列表中,返回list类型。 for line in all_lines: proxy_list.append(line.replace('\r', '').replace('\n', '')) f.close() except Exception as e: print e return proxy_list def ticket_worker_proxy(city_proxy): city = city_proxy.split(',')[0] proxy = city_proxy.split(',')[1] proxy = Proxy({ 'proxyType': ProxyType.MANUAL, 'httpProxy': proxy, 'ftpProxy': proxy, 'sslProxy': proxy, 'noProxy': '' # 过滤不需要代理的地址 }) driver = webdriver.Firefox(proxy=proxy) driver.get(site) driver.maximize_window() # 将浏览器最大化显示 for i in xrange(num): if city == hot_city_list[i]: continue from_city = city to_city = hot_city_list[i] one_driver_ticket(driver, from_city, to_city) driver.close() def all_ticket_proxy(): hot_city_proxy_list = [] proxy_list = get_proxy_list('./proxy/proxy.txt') # ./表示当前目录,../表示上一级目录 for i in xrange(num): hot_city_proxy_list.append(hot_city_list[i]+','+proxy_list[i]) pool = mp.Pool(processes=1) pool.map(ticket_worker_proxy, hot_city_proxy_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)] pool.close() pool.join() def ticket_worker_no_proxy(city): driver = webdriver.Firefox() # chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe' # os.environ['webdriver.chrome.driver'] = chromedriver # driver = webdriver.Chrome(chromedriver) driver.get(site) driver.maximize_window() # 将浏览器最大化显示 time.sleep(5) # 控制间隔时间,等待浏览器反映 for i in xrange(num): if city == hot_city_list[i]: continue from_city = city to_city = hot_city_list[i] one_driver_ticket(driver, from_city, to_city) driver.close() def all_ticket_no_proxy(): pool = mp.Pool(processes=1) pool.map(ticket_worker_no_proxy, hot_city_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)] pool.close() pool.join() if __name__ == '__main__': print "start" start = datetime.datetime.now() # all_ticket_proxy() # proxy all_ticket_no_proxy() # no proxy end = datetime.datetime.now() print "end" print "time: ", end-start ================================================ FILE: QunarSpider/ReadMe.md ================================================ ### 网络爬虫之Selenium使用代理登陆:爬取[去哪儿](http://flight.qunar.com/)网站 **一些说明:** * 使用selenium模拟浏览器登陆,获取翻页操作。 * 代理可以存入一个文件,程序读取并使用。 * 支持多进程抓取。 ================================================ FILE: ReadMe.md ================================================ # [Python入门网络爬虫之精华版](https://github.com/lining0806/PythonSpiderNotes) *** Python学习网络爬虫主要分3个大的版块:**抓取**,**分析**,**存储** 另外,比较常用的爬虫框架[Scrapy](http://scrapy.org/),这里最后也详细介绍一下。 首先列举一下本人总结的相关文章,这些覆盖了入门网络爬虫需要的基本概念和技巧:[宁哥的小站-网络爬虫](http://www.lining0806.com/category/spider/) *** 当我们在浏览器中输入一个url后回车,后台会发生什么?比如说你输入[http://www.lining0806.com/](http://www.lining0806.com/),你就会看到宁哥的小站首页。 简单来说这段过程发生了以下四个步骤: * 查找域名对应的IP地址。 * 向IP对应的服务器发送请求。 * 服务器响应请求,发回网页内容。 * 浏览器解析网页内容。 网络爬虫要做的,简单来说,就是实现浏览器的功能。通过指定url,直接返回给用户所需要的数据,而不需要一步步人工去操纵浏览器获取。 ## 抓取 这一步,你要明确要得到的内容是什么?是HTML源码,还是Json格式的字符串等。 #### 1. 最基本的抓取 抓取大多数情况属于get请求,即直接从对方服务器上获取数据。 首先,Python中自带urllib及urllib2这两个模块,基本上能满足一般的页面抓取。另外,[requests](https://github.com/kennethreitz/requests)也是非常有用的包,与此类似的,还有[httplib2](https://github.com/jcgregorio/httplib2)等等。 ``` Requests: import requests response = requests.get(url) content = requests.get(url).content print "response headers:", response.headers print "content:", content Urllib2: import urllib2 response = urllib2.urlopen(url) content = urllib2.urlopen(url).read() print "response headers:", response.headers print "content:", content Httplib2: import httplib2 http = httplib2.Http() response_headers, content = http.request(url, 'GET') print "response headers:", response_headers print "content:", content ``` 此外,对于带有查询字段的url,get请求一般会将来请求的数据附在url之后,以?分割url和传输数据,多个参数用&连接。 ``` data = {'data1':'XXXXX', 'data2':'XXXXX'} Requests:data为dict,json import requests response = requests.get(url=url, params=data) Urllib2:data为string import urllib, urllib2 data = urllib.urlencode(data) full_url = url+'?'+data response = urllib2.urlopen(full_url) ``` 相关参考:[网易新闻排行榜抓取回顾](http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/) 参考项目:[网络爬虫之最基本的爬虫:爬取网易新闻排行榜](https://github.com/lining0806/PythonSpiderNotes/blob/master/NewsSpider) ### 2. 对于登陆情况的处理 **2.1 使用表单登陆** 这种情况属于post请求,即先向服务器发送表单数据,服务器再将返回的cookie存入本地。 ``` data = {'data1':'XXXXX', 'data2':'XXXXX'} Requests:data为dict,json import requests response = requests.post(url=url, data=data) Urllib2:data为string import urllib, urllib2 data = urllib.urlencode(data) req = urllib2.Request(url=url, data=data) response = urllib2.urlopen(req) ``` **2.2 使用cookie登陆** 使用cookie登陆,服务器会认为你是一个已登陆的用户,所以就会返回给你一个已登陆的内容。因此,需要验证码的情况可以使用带验证码登陆的cookie解决。 ``` import requests requests_session = requests.session() response = requests_session.post(url=url_login, data=data) ``` 若存在验证码,此时采用response = requests_session.post(url=url_login, data=data)是不行的,做法应该如下: ``` response_captcha = requests_session.get(url=url_login, cookies=cookies) response1 = requests.get(url_login) # 未登陆 response2 = requests_session.get(url_login) # 已登陆,因为之前拿到了Response Cookie! response3 = requests_session.get(url_results) # 已登陆,因为之前拿到了Response Cookie! ``` 相关参考:[网络爬虫-验证码登陆](http://www.lining0806.com/6-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB-%E9%AA%8C%E8%AF%81%E7%A0%81%E7%99%BB%E9%99%86/) 参考项目:[网络爬虫之用户名密码及验证码登陆:爬取知乎网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/ZhihuSpider) ### 3. 对于反爬虫机制的处理 **3.1 使用代理** 适用情况:限制IP地址情况,也可解决由于“频繁点击”而需要输入验证码登陆的情况。 这种情况最好的办法就是维护一个代理IP池,网上有很多免费的代理IP,良莠不齐,可以通过筛选找到能用的。对于“频繁点击”的情况,我们还可以通过限制爬虫访问网站的频率来避免被网站禁掉。 ``` proxies = {'http':'http://XX.XX.XX.XX:XXXX'} Requests: import requests response = requests.get(url=url, proxies=proxies) Urllib2: import urllib2 proxy_support = urllib2.ProxyHandler(proxies) opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler) urllib2.install_opener(opener) # 安装opener,此后调用urlopen()时都会使用安装过的opener对象 response = urllib2.urlopen(url) ``` **3.2 时间设置** 适用情况:限制频率情况。 Requests,Urllib2都可以使用time库的sleep()函数: ``` import time time.sleep(1) ``` **3.3 伪装成浏览器,或者反“反盗链”** 有些网站会检查你是不是真的浏览器访问,还是机器自动访问的。这种情况,加上User-Agent,表明你是浏览器访问即可。有时还会检查是否带Referer信息还会检查你的Referer是否合法,一般再加上Referer。 ``` headers = {'User-Agent':'XXXXX'} # 伪装成浏览器访问,适用于拒绝爬虫的网站 headers = {'Referer':'XXXXX'} headers = {'User-Agent':'XXXXX', 'Referer':'XXXXX'} Requests: response = requests.get(url=url, headers=headers) Urllib2: import urllib, urllib2 req = urllib2.Request(url=url, headers=headers) response = urllib2.urlopen(req) ``` ### 4. 对于断线重连 不多说。 ``` def multi_session(session, *arg): retryTimes = 20 while retryTimes>0: try: return session.post(*arg) except: print '.', retryTimes -= 1 ``` 或者 ``` def multi_open(opener, *arg): retryTimes = 20 while retryTimes>0: try: return opener.open(*arg) except: print '.', retryTimes -= 1 ``` 这样我们就可以使用multi_session或multi_open对爬虫抓取的session或opener进行保持。 ### 5. 多进程抓取 这里针对[华尔街见闻](http://live.wallstreetcn.com/ )进行并行抓取的实验对比:[Python多进程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Python) 与 [Java单线程和多线程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Java) 相关参考:[关于Python和Java的多进程多线程计算方法对比](http://www.lining0806.com/%E5%85%B3%E4%BA%8Epython%E5%92%8Cjava%E7%9A%84%E5%A4%9A%E8%BF%9B%E7%A8%8B%E5%A4%9A%E7%BA%BF%E7%A8%8B%E8%AE%A1%E7%AE%97%E6%96%B9%E6%B3%95%E5%AF%B9%E6%AF%94/) ### 6. 对于Ajax请求的处理 对于“加载更多”情况,使用Ajax来传输很多数据。 它的工作原理是:从网页的url加载网页的源代码之后,会在浏览器里执行JavaScript程序。这些程序会加载更多的内容,“填充”到网页里。这就是为什么如果你直接去爬网页本身的url,你会找不到页面的实际内容。 这里,若使用Google Chrome分析”请求“对应的链接(方法:右键→审查元素→Network→清空,点击”加载更多“,出现对应的GET链接寻找Type为text/html的,点击,查看get参数或者复制Request URL),循环过程。 * 如果“请求”之前有页面,依据上一步的网址进行分析推导第1页。以此类推,抓取抓Ajax地址的数据。 * 对返回的json格式数据(str)进行正则匹配。json格式数据中,需从'\\uxxxx'形式的unicode_escape编码转换成u'\uxxxx'的unicode编码。 ### 7. 自动化测试工具Selenium Selenium是一款自动化测试工具。它能实现操纵浏览器,包括字符填充、鼠标点击、获取元素、页面切换等一系列操作。总之,凡是浏览器能做的事,Selenium都能够做到。 这里列出在给定城市列表后,使用selenium来动态抓取[去哪儿网](http://flight.qunar.com/)的票价信息的代码。 参考项目:[网络爬虫之Selenium使用代理登陆:爬取去哪儿网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/QunarSpider) ### 8. 验证码识别 对于网站有验证码的情况,我们有三种办法: * 使用代理,更新IP。 * 使用cookie登陆。 * 验证码识别。 使用代理和使用cookie登陆之前已经讲过,下面讲一下验证码识别。 可以利用开源的Tesseract-OCR系统进行验证码图片的下载及识别,将识别的字符传到爬虫系统进行模拟登陆。当然也可以将验证码图片上传到打码平台上进行识别。如果不成功,可以再次更新验证码识别,直到成功为止。 参考项目:[验证码识别项目第一版:Captcha1](https://github.com/lining0806/PythonSpiderNotes/blob/master/Captcha1) **爬取有两个需要注意的问题:** * 如何监控一系列网站的更新情况,也就是说,如何进行增量式爬取? * 对于海量数据,如何实现分布式爬取? ## 分析 抓取之后就是对抓取的内容进行分析,你需要什么内容,就从中提炼出相关的内容来。 常见的分析工具有[正则表达式](http://deerchao.net/tutorials/regex/regex.htm),[BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/),[lxml](http://lxml.de/)等等。 ## 存储 分析出我们需要的内容之后,接下来就是存储了。 我们可以选择存入文本文件,也可以选择存入[MySQL](http://www.mysql.com/)或[MongoDB](https://www.mongodb.org/)数据库等。 **存储有两个需要注意的问题:** * 如何进行网页去重? * 内容以什么形式存储? ## Scrapy Scrapy是一个基于Twisted的开源的Python爬虫框架,在工业中应用非常广泛。 相关内容可以参考[基于Scrapy网络爬虫的搭建](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/),同时给出这篇文章介绍的[微信搜索](http://weixin.sogou.com/weixin)爬取的项目代码,给大家作为学习参考。 参考项目:[使用Scrapy或Requests递归抓取微信搜索结果](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects) ## Robots协议 好的网络爬虫,首先需要遵守**Robots协议**。Robots协议(也称为爬虫协议、机器人协议等)的全称是“网络爬虫排除标准”(Robots Exclusion Protocol),网站通过Robots协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取。 在网站根目录下放一个robots.txt文本文件(如 https://www.taobao.com/robots.txt ),里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面,指定的页面由正则表达式表示。网络爬虫在采集这个网站之前,首先获取到这个robots.txt文本文件,然后解析到其中的规则,然后根据规则来采集网站的数据。 ### 1. Robots协议规则 User-agent: 指定对哪些爬虫生效 Disallow: 指定不允许访问的网址 Allow: 指定允许访问的网址 注意: 一个英文要大写,冒号是英文状态下,冒号后面有一个空格,"/"代表整个网站 ### 2. Robots协议举例 禁止所有机器人访问 User-agent: * Disallow: / 允许所有机器人访问 User-agent: * Disallow: 禁止特定机器人访问 User-agent: BadBot Disallow: / 允许特定机器人访问 User-agent: GoodBot Disallow: 禁止访问特定目录 User-agent: * Disallow: /images/ 仅允许访问特定目录 User-agent: * Allow: /images/ Disallow: / 禁止访问特定文件 User-agent: * Disallow: /*.html$ 仅允许访问特定文件 User-agent: * Allow: /*.html$ Disallow: / ================================================ FILE: Spider_Java/README.md ================================================ ### Spider_Java 抓取网址:[华尔街见闻](http://live.wallstreetcn.com/) 单线程抓取 Spider_Java1 多线程抓取 Spider_Java2 ================================================ FILE: Spider_Java/Spider_Java1/.classpath ================================================ ================================================ FILE: Spider_Java/Spider_Java1/.project ================================================ Spider org.eclipse.jdt.core.javabuilder org.eclipse.jdt.core.javanature ================================================ FILE: Spider_Java/Spider_Java1/src/synchronizetest/Test.java ================================================ /** * */ package synchronizetest; /** * @author FIRELING * */ public class Test { public static void main(String[] args) { Reservoir r = new Reservoir(100); Booth b1 = new Booth(r); Booth b2 = new Booth(r); Booth b3 = new Booth(r); } } /** * contain shared resource */ class Reservoir { private int total; public Reservoir(int t) { this.total = t; } /** * Thread safe method * serialized access to Booth.total */ public synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法 { if(this.total > 0) { this.total = this.total-1; return true; // successfully sell one } else { return false; // no more tickets } } } /** * create new thread by inheriting Thread */ class Booth extends Thread { private static int threadID = 0; // owned by Class object private Reservoir release; // sell this reservoir private int count = 0; // owned by this thread object /** * constructor */ public Booth(Reservoir r) { super("ID:"+(++threadID)); this.release = r; // all threads share the same reservoir this.start(); } /** * convert object to string */ public String toString() { return super.getName(); } /** * what does the thread do? */ public void run() { while(true) { // 循环体!!! if(this.release.sellTicket()) { this.count = this.count+1; System.out.println(this.getName()+":sell 1"); try { sleep((int) Math.random()*100); // random intervals // sleep(100); // 若sleep时间相同,则每个窗口买票相当 } catch (InterruptedException e) { throw new RuntimeException(e); } } else { break; } } System.out.println(this.getName()+" I sold:"+count); } } ================================================ FILE: Spider_Java/Spider_Java1/src/wallstreetcnsave/WallstreetcnSaveTest.java ================================================ package wallstreetcnsave; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.text.DateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.Mongo; public class WallstreetcnSaveTest implements Runnable { private static String DataBaseName = "textclassify"; private static String CollectionName = "WallstreetSaveJava"; private static String url = "http://api.wallstreetcn.com/v2/livenews?&page="; private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"

(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?"; private static final String REGEXSTRING1 = "type"; private static final String REGEXSTRING2 = "content"; private static final String REGEXSTRING3 = "categoryset"; //map表的存放 public static Map GetMap() { Map map = new HashMap(); map.put("1", "外汇"); map.put("2", "股市"); map.put("3", "商品"); map.put("4", "债市"); map.put("9", "中国"); map.put("10", "美国"); map.put("11", "欧元区"); map.put("12", "日本"); map.put("13", "英国"); map.put("14", "澳洲"); map.put("15", "加拿大"); map.put("16", "瑞士"); map.put("17", "其他地区"); map.put("5", "央行"); return map; } private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" }; private static String[] ruleList_property = { "1", "2", "3", "4" }; private static String[] ruleList_centralbank = { "5" }; private static final int start = 1; private static final int end = 3000; //对x,x,x格式的内容进行分隔筛选 public static String setCategory(String categorySet, String[] ruleList, Map map) { StringBuffer disStr = new StringBuffer(); String[] strArray = null; strArray = categorySet.split(","); // 拆分字符为",",然后把结果交给数组strArray // 获取需要的信息 int length_strArray = strArray.length; int length_ruleList = ruleList.length; if (length_strArray > 0) { for (int iArr = 0; iArr < length_strArray; iArr++) { String s = strArray[iArr]; for (int iRul=0; iRul < length_ruleList; iRul++) { if (s.equals(ruleList[iRul])) { disStr.append(map.get(s)); disStr.append(","); break; } } } } if(disStr.length()>1) { disStr = disStr.deleteCharAt(disStr.length()-1); } return disStr.toString(); } //读取整个页面,返回html字符串 private static String httpRequest(String requestUrl) { StringBuffer buffer = null; BufferedReader bufferedReader = null; InputStreamReader inputStreamReader = null; InputStream inputStream = null; HttpURLConnection httpUrlConn = null; try { // 建立get请求 URL url = new URL(requestUrl); httpUrlConn = (HttpURLConnection) url.openConnection(); httpUrlConn.setDoInput(true); httpUrlConn.setRequestMethod("GET"); // 获取输入流 inputStream = httpUrlConn.getInputStream(); inputStreamReader = new InputStreamReader(inputStream, "UTF-8"); bufferedReader = new BufferedReader(inputStreamReader); // 从输入流获取结果 buffer = new StringBuffer(); String str = null; while ((str = bufferedReader.readLine()) != null) { str = new String(str.getBytes(), "UTF-8"); buffer.append(str); } } catch (Exception e) { e.printStackTrace(); } finally { if (bufferedReader != null) { try { bufferedReader.close(); } catch (IOException e) { e.printStackTrace(); } } if (inputStreamReader != null) { try { inputStreamReader.close(); } catch (IOException e) { e.printStackTrace(); } } if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { e.printStackTrace(); } } if (httpUrlConn != null) { httpUrlConn.disconnect(); } } return buffer.toString(); } // 过滤掉无用的信息 public static List> htmlFiter(String html, String Regex) { List> list = new ArrayList>(); // 查找目标 Pattern p = Pattern.compile(Regex); Matcher m = p.matcher(html); while (m.find()) { Map map_save = new HashMap(); // 可修改部分 map_save.put(REGEXSTRING1, m.group(1)); map_save.put(REGEXSTRING2, m.group(2)); map_save.put(REGEXSTRING3, m.group(3)); list.add(map_save); } return list; } //unicode格式转中文 public static String UnicodeToString(String str) { Pattern pattern = Pattern.compile("(\\\\u(\\p{XDigit}{4}))"); // XDigit表示16进制数字,正则里的\p表示Unicode块 Matcher matcher = pattern.matcher(str); char ch; while (matcher.find()) { ch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码,再char转为字符 str = str.replace(matcher.group(1), ch + ""); } return str; } public void run() { // 链接数据库 try { Mongo mongo = new Mongo("localhost", 27017); DB db = mongo.getDB(DataBaseName); DBCollection collection = db.getCollection(CollectionName); // 调用抓取的方法获取内容 for (int i = start; i <= end; i++) { String requestUrl = url + i; System.out.println(requestUrl); String html = httpRequest(requestUrl); List> resultList = htmlFiter(html, Regex); if (resultList.isEmpty()) { System.out.printf("The end url: %s", requestUrl); break; } else { for (Map result : resultList) { BasicDBObject dbObject = new BasicDBObject(); String type = result.get(REGEXSTRING1); String content = UnicodeToString(result.get(REGEXSTRING2)); // String content = result.get(REGEXSTRING2); Map map = GetMap(); String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); Date date = new Date(); DateFormat time = DateFormat.getDateTimeInstance(); String time_str = time.format(date); String source = "wangstreetcn"; dbObject.put("content", content); // 具体内容 dbObject.put("createdtime", time_str); // 创建时间 dbObject.put("source", source); // 信息来源 dbObject.put("district", district); // 所属地区 dbObject.put("property", property); // 资产类别 dbObject.put("centralbank", centralbank); // 资产类别 dbObject.put("type", type); //信息类型 collection.insert(dbObject); } } } } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) throws InterruptedException { WallstreetcnSaveTest wallstreetcnsave = new WallstreetcnSaveTest(); wallstreetcnsave.run(); } } ================================================ FILE: Spider_Java/Spider_Java2/.classpath ================================================ ================================================ FILE: Spider_Java/Spider_Java2/.project ================================================ Spider org.eclipse.jdt.core.javabuilder org.eclipse.jdt.core.javanature ================================================ FILE: Spider_Java/Spider_Java2/src/synchronizetest/Test.java ================================================ /** * */ package synchronizetest; /** * @author FIRELING * */ public class Test { public static void main(String[] args) { Reservoir r = new Reservoir(100); Booth b1 = new Booth(r); Booth b2 = new Booth(r); Booth b3 = new Booth(r); } } /** * contain shared resource */ class Reservoir { private int total; public Reservoir(int t) { this.total = t; } /** * Thread safe method * serialized access to Booth.total */ public synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法 { if(this.total > 0) { this.total = this.total-1; return true; // successfully sell one } else { return false; // no more tickets } } } /** * create new thread by inheriting Thread */ class Booth extends Thread { private static int threadID = 0; // owned by Class object private Reservoir release; // sell this reservoir private int count = 0; // owned by this thread object /** * constructor */ public Booth(Reservoir r) { super("ID:"+(++threadID)); this.release = r; // all threads share the same reservoir this.start(); } /** * convert object to string */ public String toString() { return super.getName(); } /** * what does the thread do? */ public void run() { while(true) { // 循环体!!! if(this.release.sellTicket()) { this.count = this.count+1; System.out.println(this.getName()+":sell 1"); try { sleep((int) Math.random()*100); // random intervals // sleep(100); // 若sleep时间相同,则每个窗口买票相当 } catch (InterruptedException e) { throw new RuntimeException(e); } } else { break; } } System.out.println(this.getName()+" I sold:"+count); } } ================================================ FILE: Spider_Java/Spider_Java2/src/wallstreetcnsave/WallstreetcnSaveTest.java ================================================ package wallstreetcnsave; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.URL; import java.text.DateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.mongodb.BasicDBObject; import com.mongodb.DB; import com.mongodb.DBCollection; import com.mongodb.Mongo; class WallstreetcnSave implements Runnable { private GetrequestUrl release; public WallstreetcnSave(GetrequestUrl url) { this.release = url; // all threads share the same GetrequestUrl } private static String DataBaseName = "textclassify"; private static String CollectionName = "WallstreetSaveJava"; private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"

(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?"; private static final String REGEXSTRING1 = "type"; private static final String REGEXSTRING2 = "content"; private static final String REGEXSTRING3 = "categoryset"; //map表的存放 public static Map GetMap() { Map map = new HashMap(); map.put("1", "外汇"); map.put("2", "股市"); map.put("3", "商品"); map.put("4", "债市"); map.put("9", "中国"); map.put("10", "美国"); map.put("11", "欧元区"); map.put("12", "日本"); map.put("13", "英国"); map.put("14", "澳洲"); map.put("15", "加拿大"); map.put("16", "瑞士"); map.put("17", "其他地区"); map.put("5", "央行"); return map; } private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" }; private static String[] ruleList_property = { "1", "2", "3", "4" }; private static String[] ruleList_centralbank = { "5" }; //对x,x,x格式的内容进行分隔筛选 public static String setCategory(String categorySet, String[] ruleList, Map map) { StringBuffer disStr = new StringBuffer(); String[] strArray = null; strArray = categorySet.split(","); // 拆分字符为",",然后把结果交给数组strArray // 获取需要的信息 int length_strArray = strArray.length; int length_ruleList = ruleList.length; if (length_strArray > 0) { for (int iArr = 0; iArr < length_strArray; iArr++) { String s = strArray[iArr]; for (int iRul=0; iRul < length_ruleList; iRul++) { if (s.equals(ruleList[iRul])) { disStr.append(map.get(s)); disStr.append(","); break; } } } } if(disStr.length()>1) { disStr = disStr.deleteCharAt(disStr.length()-1); } return disStr.toString(); } //读取整个页面,返回html字符串 private static String httpRequest(String requestUrl) { StringBuffer buffer = null; BufferedReader bufferedReader = null; InputStreamReader inputStreamReader = null; InputStream inputStream = null; HttpURLConnection httpUrlConn = null; try { // 建立get请求 URL url = new URL(requestUrl); httpUrlConn = (HttpURLConnection) url.openConnection(); httpUrlConn.setDoInput(true); httpUrlConn.setRequestMethod("GET"); // 获取输入流 inputStream = httpUrlConn.getInputStream(); inputStreamReader = new InputStreamReader(inputStream, "UTF-8"); bufferedReader = new BufferedReader(inputStreamReader); // 从输入流获取结果 buffer = new StringBuffer(); String str = null; while ((str = bufferedReader.readLine()) != null) { str = new String(str.getBytes(), "UTF-8"); buffer.append(str); } } catch (Exception e) { e.printStackTrace(); } finally { if (bufferedReader != null) { try { bufferedReader.close(); } catch (IOException e) { e.printStackTrace(); } } if (inputStreamReader != null) { try { inputStreamReader.close(); } catch (IOException e) { e.printStackTrace(); } } if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { e.printStackTrace(); } } if (httpUrlConn != null) { httpUrlConn.disconnect(); } } return buffer.toString(); } // 过滤掉无用的信息 public static List> htmlFiter(String html, String Regex) { List> list = new ArrayList>(); // 查找目标 Pattern p = Pattern.compile(Regex); Matcher m = p.matcher(html); while (m.find()) { Map map_save = new HashMap(); // 可修改部分 map_save.put(REGEXSTRING1, m.group(1)); map_save.put(REGEXSTRING2, m.group(2)); map_save.put(REGEXSTRING3, m.group(3)); list.add(map_save); } return list; } //unicode格式转中文 public static String UnicodeToString(String str) { Pattern pattern = Pattern.compile("(\\\\u(\\p{XDigit}{4}))"); // XDigit表示16进制数字,正则里的\p表示Unicode块 Matcher matcher = pattern.matcher(str); char ch; while (matcher.find()) { ch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码,再char转为字符 str = str.replace(matcher.group(1), ch + ""); } return str; } public void run() { while(true) { // 循环体!!! // 链接数据库 try { Mongo mongo = new Mongo("localhost", 27017); DB db = mongo.getDB(DataBaseName); DBCollection collection = db.getCollection(CollectionName); // 调用抓取的方法获取内容 String requestUrl = this.release.GetMethod(); if(requestUrl.equals("")) { break; } else { System.out.println(requestUrl); String html = httpRequest(requestUrl); List> resultList = htmlFiter(html, Regex); if (resultList.isEmpty()) { System.out.printf("The end url: %s", requestUrl); break; } else { for (Map result : resultList) { BasicDBObject dbObject = new BasicDBObject(); String type = result.get(REGEXSTRING1); String content = UnicodeToString(result.get(REGEXSTRING2)); Map map = GetMap(); String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); Date date = new Date(); DateFormat time = DateFormat.getDateTimeInstance(); String time_str = time.format(date); String source = "wangstreetcn"; dbObject.put("content", content); // 具体内容 dbObject.put("createdtime", time_str); // 创建时间 dbObject.put("source", source); // 信息来源 dbObject.put("district", district); // 所属地区 dbObject.put("property", property); // 资产类别 dbObject.put("centralbank", centralbank); // 资产类别 dbObject.put("type", type); //信息类型 collection.insert(dbObject); } } } } catch (Exception e) { e.printStackTrace(); } } } public void run1() { while(true) { // 循环体!!! // 链接数据库 try { Mongo mongo = new Mongo("localhost", 27017); DB db = mongo.getDB(DataBaseName); DBCollection collection = db.getCollection(CollectionName); // 调用抓取的方法获取内容 String requestUrl = this.release.GetMethod(); if(requestUrl.equals("")) { break; } else { System.out.println(requestUrl); String html = httpRequest(requestUrl); List> resultList = htmlFiter(html, Regex); if (resultList.isEmpty()) { System.out.printf("The end url: %s\n", requestUrl); break; } else { for (Map result : resultList) { BasicDBObject dbObject = new BasicDBObject(); String type = result.get(REGEXSTRING1); String content = UnicodeToString(result.get(REGEXSTRING2)); Map map = GetMap(); String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); Date date = new Date(); DateFormat time = DateFormat.getDateTimeInstance(); String time_str = time.format(date); String source = "wangstreetcn"; dbObject.put("content", content); // 具体内容 dbObject.put("createdtime", time_str); // 创建时间 dbObject.put("source", source); // 信息来源 dbObject.put("district", district); // 所属地区 dbObject.put("property", property); // 资产类别 dbObject.put("centralbank", centralbank); // 资产类别 dbObject.put("type", type); //信息类型 collection.insert(dbObject); } } } } catch (Exception e) { e.printStackTrace(); } } } } /** * contain shared resource */ class GetrequestUrl { private String url = "http://api.wallstreetcn.com/v2/livenews?&page="; private int start; private int end = 5000; public GetrequestUrl(int start) { this.start = start; } public GetrequestUrl(int start, int end) { this.start = start; this.end = end; } /** * Thread safe method */ public synchronized String GetMethod() // 利用synchronized修饰符同步了整个方法 { if(this.start <= this.end) { String requestUrl = this.url+this.start; this.start = this.start+1; return requestUrl; } else { return ""; } } } public class WallstreetcnSaveTest { public static void main(String[] args) { // 多线程抓取 int start = 1; GetrequestUrl url = new GetrequestUrl(start); // int start = 1, end = 3000; // GetrequestUrl url = new GetrequestUrl(start, end); int thread_num = 1; while(true) { if(thread_num++ > 8) break; Thread thread = new Thread(new WallstreetcnSave(url)); thread.start(); } } } ================================================ FILE: Spider_Python/README.md ================================================ ### Spider_Python 抓取网址:[华尔街见闻](http://live.wallstreetcn.com/) 多进程抓取 ================================================ FILE: Spider_Python/WallstreetcnSaveTest.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- import sys import re import urllib, urllib2 import requests import pymongo import datetime import multiprocessing as mp Category_Map = { "1":u"外汇", "2":u"股市", "3":u"商品", "4":u"债市", "5":u"央行", "9":u"中国", "10":u"美国", "11":u"欧元区", "12":u"日本", "13":u"英国", "14":u"澳洲", "15":u"加拿大", "16":u"瑞士", "17":u"其他地区" } def num2name(category_num): if Category_Map.has_key(category_num): return Category_Map[category_num] else: return "" class MongoDBIO: # 申明相关的属性 def __init__(self, host, port, name, password, database, collection): self.host = host self.port = port self.name = name self.password = password self.database = database self.collection = collection # 连接数据库,db和posts为数据库和集合的游标 def Connection(self): # connection = pymongo.Connection() # 连接本地数据库 connection = pymongo.Connection(host=self.host, port=self.port) # db = connection.datas db = connection[self.database] if self.name or self.password: db.authenticate(name=self.name, password=self.password) # 验证用户名密码 # print "Database:", db.name # posts = db.cn_live_news posts = db[self.collection] # print "Collection:", posts.name return posts # 保存操作 # def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents): # posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() # for save_content in save_contents: # posts.save(save_content) def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content): posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() posts.save(save_content) def Spider(url, data): # # 方法1:requests get content = requests.get(url=url, params=data).content # GET请求发送 # # 方法2:urllib2 get # data = urllib.urlencode(data) # 编码工作,由dict转为string # full_url = url+'?'+data # print full_url # content = urllib2.urlopen(full_url).read() # GET请求发送 # # content = requests.get(full_url).content # GET请求发送 # print type(content) # str return content def ContentSave(item): # 保存配置 save_host = "localhost" save_port = 27017 save_name = "" save_password = "" save_database = "textclassify" save_collection = "WallstreetcnSave" source = "wallstreetcn" createdtime = datetime.datetime.now() type = item[0] content = item[1].decode("unicode_escape") # json格式数据中,需从'\\uxxxx'形式的unicode_escape编码转换成u'\uxxxx'的unicode编码 content = content.encode("utf-8") # print content # district的筛选 categorySet = item[2] category_num = categorySet.split(",") category_name = map(num2name, category_num) districtset = set(category_name)&{u"中国", u"美国", u"欧元区", u"日本", u"英国", u"澳洲", u"加拿大", u"瑞士", u"其他地区"} district = ",".join(districtset) propertyset = set(category_name)&{u"外汇", u"股市", u"商品", u"债市"} property = ",".join(propertyset) centralbankset = set(category_name)&{u"央行"} centralbank = ",".join(centralbankset) save_content = { "source":source, "createdtime":createdtime, "content":content, "type":type, "district":district, "property":property, "centralbank":centralbank } ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content) def func(page): url = "http://api.wallstreetcn.com/v2/livenews" # get参数 data = { "page":page } content = Spider(url, data) items = re.findall(r'"type":"(.*?)","codeType".*?"contentHtml":"(.*?)","data".*?"categorySet":"(.*?)","hasMore"', content) # 正则匹配 if len(items) == 0: print "The End Page:", page data = urllib.urlencode(data) # 编码工作,由dict转为string full_url = url+'?'+data print full_url sys.exit(0) # 无错误退出 else: print "The Page:", page, "Downloading..." for item in items: ContentSave(item) if __name__ == '__main__': start = datetime.datetime.now() start_page = 1 end_page = 3300 # 多进程抓取 pages = [i for i in range(start_page, end_page)] p = mp.Pool() p.map_async(func, pages) p.close() p.join() # 单进程抓取 page = end_page while 1: url = "http://api.wallstreetcn.com/v2/livenews" # get参数 data = { "page":page } content = Spider(url, data) items = re.findall(r'"type":"(.*?)","codeType".*?"contentHtml":"(.*?)","data".*?"categorySet":"(.*?)","hasMore"', content) # 正则匹配 if len(items) == 0: print "The End Page:", page data = urllib.urlencode(data) # 编码工作,由dict转为string full_url = url+'?'+data print full_url break else: print "The Page:", page, "Downloading..." for item in items: ContentSave(item) page += 1 end = datetime.datetime.now() print "last time: ", end-start ================================================ FILE: WechatSearchProjects/README.md ================================================ ### 使用Scrapy或Requests递归抓取[微信搜索](http://weixin.sogou.com/weixin)结果 使用Scrapy方法 或者 使用Requests+BeautifulSoup **使用Scrapy方法:** * 将querystring替换为你要查询的单词 * type可以选择 * i的范围可以调整,对应查询的搜索结果页面数目 ================================================ FILE: WechatSearchProjects/Spider_Main.py ================================================ #coding: utf-8 from scrapy.cmdline import execute import os if __name__ == '__main__': project_name = "Wechatproject" spider_name = "wechat" results_name = "results/results.json" if not os.path.exists(project_name): print "Please Edit the project files and Run again!!!" s = "scrapy startproject %s" % project_name execute(s.split()) else: print "Start Crawling!!!" path = os.getcwd() # 获取当前路径 os.chdir(path+"/"+project_name) # 修改当前路径 if os.path.exists(results_name): os.remove(results_name) s = "scrapy crawl %s" % spider_name # s = "scrapy crawl %s -o %s -t json" % (spider_name, results_name) execute(s.split()) ================================================ FILE: WechatSearchProjects/WechatSearchTest.py ================================================ #!/usr/bin/env python # -*- coding:utf-8 -*- import sys import re import urllib, urllib2 import requests import pymongo import datetime from bs4 import BeautifulSoup import multiprocessing as mp class MongoDBIO: # 申明相关的属性 def __init__(self, host, port, name, password, database, collection): self.host = host self.port = port self.name = name self.password = password self.database = database self.collection = collection # 连接数据库,db和posts为数据库和集合的游标 def Connection(self): # connection = pymongo.Connection() # 连接本地数据库 connection = pymongo.Connection(host=self.host, port=self.port) # db = connection.datas db = connection[self.database] if self.name or self.password: db.authenticate(name=self.name, password=self.password) # 验证用户名密码 # print "Database:", db.name # posts = db.cn_live_news posts = db[self.collection] # print "Collection:", posts.name return posts # # 保存操作 # def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents): # posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() # # for save_content in save_contents: # posts.save(save_content) # 保存操作 def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content): posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() posts.save(save_content) def GetTitleUrl(url, data): content = requests.get(url=url, params=data).content # GET请求发送 soup = BeautifulSoup(content) tags = soup.findAll("h4") titleurl = [] for tag in tags: item = {"title":tag.text.strip(), "link":tag.find("a").get("href"), "content":""} titleurl.append(item) return titleurl def GetContent(url): soup = BeautifulSoup(requests.get(url=url).content) tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # 提取第一个标签 content_list = [tag_i.text for tag_i in tag.findAll("p")] content = "".join(content_list) return content def ContentSave(item): # 保存配置 save_host = "localhost" save_port = 27017 save_name = "" save_password = "" save_database = "testwechat" save_collection = "result" save_content = { "title":item["title"], "link":item["link"], "content":item["content"] } ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content) def func(tuple): querystring, type, page = tuple[0], tuple[1], tuple[2] url = "http://weixin.sogou.com/weixin" # get参数 data = { "query":querystring, "type":type, "page":page } titleurl = GetTitleUrl(url, data) for item in titleurl: url = item["link"] print "url:", url content = GetContent(url) item["content"] = content ContentSave(item) if __name__ == '__main__': start = datetime.datetime.now() querystring = u"清华" type = 2 # 2-文章,1-微信号 # 多进程抓取 p = mp.Pool() p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)]) p.close() p.join() # # 单进程抓取 # for page in range(1, 50, 1): # tuple = (querystring, type, page) # func(tuple) end = datetime.datetime.now() print "last time: ", end-start ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/__init__.py ================================================ ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/items.py ================================================ # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html from scrapy.item import Item, Field class WechatprojectItem(Item): # define the fields for your item here like: # name = Field() title = Field() link = Field() content = Field() pass ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py ================================================ # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html # class WechatprojectPipeline(object): # def process_item(self, item, spider): # return item # # MySQL Database # from twisted.enterprise import adbapi # import twisted package # class WechatprojectPipeline(object): # # connnect databases # def __init__(self): # self.dbpool = adbapi.ConnectionPool("MySQLdb", # host = "localhost", # db = "testwechat", # you must build database named testwechat # user = "root", # passwd = "testpasswd", # charset = "utf8") # # pipeline default function # def process_item(self, item, spider): # query = self.dbpool.runInteraction(self._conditional_insert, item) # return item # # insert the data to databases # def _conditional_insert(self, tx, item): # item dictionary # # you must build table named result in database testwechat # tx.execute("insert into result values (%s, %s, %s)", (item["title"], item["link"], item["content"])) # MongoDB Database import pymongo class WechatprojectPipeline(object): # connnect databases def __init__(self): connection = pymongo.Connection(host = "localhost", port = 27017) db = connection["testwechat"] # you need no build database named testdouban # db.authenticate(name = "root", password = "testpasswd") # no name and password for localhost self.posts = db["result"] # you need not build collection named book # pipeline default function def process_item(self, item, spider): self.posts.insert(dict(item)) # convert json to dict return item # # Json File # import json # import codecs # class WechatprojectPipeline(object): # def __init__(self): # self.file = codecs.open('results.json', 'w', 'utf-8') # def process_item(self, item, spider): # line = json.dumps(dict(item))+'\n' # self.file.write(line) # return item ############################################################################################# # '''if you want to download images''' # from scrapy.http.request import Request # from scrapy.contrib.pipeline.images import ImagesPipeline # class MyImagesPipeline(ImagesPipeline): # #@TODO # def get_media_requests(self, item, info): # for image_url in item['image_urls']: # item['image_urls'] contains the image urls # # yield Request(image_url) # yield Request(image_url, meta={'name': item['name']}) # item['name'] contains the images name # def item_completed(self, results, item, info): # return super(MyImagesPipeline, self).item_completed(results, item, info) # def file_path(self, request, response=None, info=None): # f_path = super(MyImagesPipeline, self).file_path(request, response, info) # f_path = f_path.replace('full', request.meta['name']) # return f_path # ########################################################## # # import hashlib # # image_guid = hashlib.sha1(request.url).hexdigest() # change to request.url after deprecation # # return '%s/%s.jpg' % (request.meta['name'], image_guid) # pass # # from scrapy.contrib.pipeline.media import MediaPipeline # # class MyMediaPipeline(MediaPipeline): # # #@TODO # # pass ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/settings.py ================================================ # Scrapy settings for Wechatproject project # # For simplicity, this file contains only the most important settings by # default. All the other settings are documented here: # # http://doc.scrapy.org/en/latest/topics/settings.html # BOT_NAME = 'Wechatproject' SPIDER_MODULES = ['Wechatproject.spiders'] NEWSPIDER_MODULE = 'Wechatproject.spiders' ITEM_PIPELINES = ['Wechatproject.pipelines.WechatprojectPipeline'] # add settings ############################################################################################# # '''if you want to download images''' # ITEM_PIPELINES = {'Wechatproject.pipelines.WechatprojectPipeline':1, 'Wechatproject.pipelines.MyImagesPipeline':2 # add settings # IMAGES_STORE = './images' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'Wechatproject (+http://www.yourdomain.com)' ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/spiders/__init__.py ================================================ # This package will contain the spiders of your Scrapy project # # Please refer to the documentation for information on how to create and manage # your spiders. ================================================ FILE: WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py ================================================ #coding: utf-8 from scrapy.spider import BaseSpider from scrapy.selector import Selector from Wechatproject.items import WechatprojectItem from bs4 import BeautifulSoup from scrapy.http import Request class WechatSpider(BaseSpider): ############################################################################################# '''微信搜索程序''' name = "wechat" start_urls = [] querystring = u"清华" type = 2 # 2-文章,1-微信号 for i in range(1, 50, 1): start_urls.append("http://weixin.sogou.com/weixin?type=%d&query=%s&page=%d" % (type, querystring, i)) # print start_urls ############################################################################################# ## 递归抓取 ## 使用xpath()方法,注意item中键对值为string类型,extract()方法返回list def parse(self, response): # print response.body sel = Selector(response) sites = sel.xpath('//div[@class="txt-box"]/h4/a') for site in sites: item = WechatprojectItem() item["title"] = site.xpath("text()").extract() # 其中在item.py中定义了title = Field() item["link"] = site.xpath("@href").extract() # 其中在item.py中定义了link = Field() ############################################################################################# # yield item ## 只抓取当前页数据 next_url = item["link"][0] # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据 yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据 ## 使用BeautifulSoup方法,注意item中键对值为string类型 def parse(self, response): # print response.body soup = BeautifulSoup(response.body) tags = soup.findAll("h4") for tag in tags: item = WechatprojectItem() item["title"] = tag.text # 其中在item.py中定义了title = Field() item["link"] = tag.find("a").get("href") # 其中在item.py中定义了link = Field() ############################################################################################# # yield item ## 只抓取当前页数据 next_url = item["link"] # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据 yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据 def parse2(self, response): soup = BeautifulSoup(response.body) tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # 提取第一个标签 content_list = [tag_i.text for tag_i in tag.findAll("p")] content = "".join(content_list) # print content # item = WechatprojectItem() ## 只抓取二级页面数据 item = response.meta['item'] ## 抓取当前页数和二级页面数据 item["content"] = content return item ================================================ FILE: WechatSearchProjects/Wechatproject/scrapy.cfg ================================================ # Automatically created by: scrapy startproject # # For more information about the [deploy] section see: # http://doc.scrapy.org/en/latest/topics/scrapyd.html [settings] default = Wechatproject.settings [deploy] #url = http://localhost:6800/ project = Wechatproject ================================================ FILE: ZhihuSpider/ReadMe.md ================================================ ### 网络爬虫之用户名密码及验证码登陆:爬取[知乎](http://www.zhihu.com/)网站 **一些说明:** * 使用requests包来爬取。首先尝试用用户名密码自动登陆,如果失败,则需要采用cookie登陆。 * 配置文件config.ini,其中包括用户名密码信息,如果有验证码情况,需要手动登陆一次网站获取cookie信息。 * 判断登陆成功与否,看生成的html文件中有没有用户信息。 ================================================ FILE: ZhihuSpider/ZhihuSpider.py ================================================ # -*- coding: utf-8 -*- ''' 网络爬虫之用户名密码及验证码登陆:爬取知乎网站 ''' import requests import ConfigParser def create_session(): cf = ConfigParser.ConfigParser() cf.read('config.ini') cookies = cf.items('cookies') cookies = dict(cookies) from pprint import pprint pprint(cookies) email = cf.get('info', 'email') password = cf.get('info', 'password') session = requests.session() login_data = {'email': email, 'password': password} header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36', 'Host': 'www.zhihu.com', 'Referer': 'http://www.zhihu.com/' } r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header) if r.json()['r'] == 1: print 'Login Failed, reason is:', for m in r.json()['data']: print r.json()['data'][m] print 'So we use cookies to login in...' has_cookies = False for key in cookies: if key != '__name__' and cookies[key] != '': has_cookies = True break if has_cookies is False: raise ValueError('请填写config.ini文件中的cookies项.') else: # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆 r = session.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆 with open('login.html', 'w') as fp: fp.write(r.content) return session, cookies if __name__ == '__main__': requests_session, requests_cookies = create_session() # url = 'http://www.zhihu.com/login/email' url = 'http://www.zhihu.com/topic/19552832' # content = requests_session.get(url).content # 未登陆 # content = requests.get(url, cookies=requests_cookies).content # 已登陆 content = requests_session.get(url, cookies=requests_cookies).content # 已登陆 with open('url.html', 'w') as fp: fp.write(content) ================================================ FILE: ZhihuSpider/config.ini ================================================ [info] email = xxxx@163.com password = xxxx [cookies] q_c1 = cap_id = _za = __utmt = __utma = __utmb = __utmc = __utmz = __utmv = z_c0 = unlock_ticket =