[
  {
    "path": "Captcha1/!Test.bat",
    "content": "python tess_test.py ./pic/get_price_img.png\r\npause"
  },
  {
    "path": "Captcha1/ReadMe.md",
    "content": "### 验证码识别项目第一版：Captcha1\r\n\r\n本项目采用Tesseract V3.01版本(V3.02版本在训练时有改动，多了shapeclustering过程)  \r\n\r\n**Tesseract用法：** \r\n* 配置环境变量TESSDATA_PREFIX =“D:\\Tesseract-ocr\\”，即tessdata的目录，在源码中会到这个路径下查找相应的字库文件用来识别。  \r\n* 命令格式：  \r\n`tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]`  \r\n* 只识别成数字   \r\n`tesseract imagename outputbase -l eng digits`  \r\n* 解决empty page!!  \r\n**-psm N** \r\n\r\n\t7 = Treat the image as a single text line  \r\n\ttesseract imagename outputbase -l eng -psm 7  \r\n* configfile 参数值为tessdata\\configs 和 tessdata\\tessconfigs 目录下的文件名：   \r\n`tesseract imagename outputbase -l eng nobatch`  \r\n\r\n\r\n**验证码识别项目使用方法1：**  \r\n \r\n* 将下载的图片放到./pic目录下，  \r\n\r\n\t验证码图片名称：get_random.jpg  \r\n\t价格图片名称：get_price_img.png \r\n\r\n* 命令格式：  \r\n\r\n\t验证码图片识别：python tess_test.py ./pic/get_random.jpg  \r\n\t价格图片识别：python tess_test.py ./pic/get_price_img.png\r\n  \r\n打印出识别的结果\r\n\r\n若要将结果存在临时文本文件**temp.txt**中，则将pytesser_pro.py中的代码\"**cleanup_scratch_flag = True**\"改为\"**cleanup_scratch_flag = False**\"\r\n"
  },
  {
    "path": "Captcha1/pytesser_pro/__init__.py",
    "content": ""
  },
  {
    "path": "Captcha1/pytesser_pro/errors.py",
    "content": "\"\"\"Test for exceptions raised in the tesseract.exe logfile\"\"\"\r\n\r\nclass Tesser_General_Exception(Exception):\r\n\tpass\r\n\r\nclass Tesser_Invalid_Filetype(Tesser_General_Exception):\r\n\tpass\r\n\r\ndef check_for_errors(logfile = \"tesseract.log\"):\r\n\tinf = file(logfile)\r\n\ttext = inf.read()\r\n\tinf.close()\r\n\t# All error conditions result in \"Error\" somewhere in logfile\r\n\tif text.find(\"Error\") != -1:\r\n\t\traise Tesser_General_Exception, text"
  },
  {
    "path": "Captcha1/pytesser_pro/pytesser_pro.py",
    "content": "import Image\r\nimport subprocess\r\n\r\nimport util\r\nimport errors\r\n\r\ntesseract_exe_name = \"tesseract\" # Name of executable to be called at command line\r\nscratch_image_name = \"temp.bmp\" # This file must be .bmp or other Tesseract-compatible format\r\nscratch_text_name_root = \"temp\" # Leave out the .txt extension\r\ncleanup_scratch_flag = False  # Temporary files cleaned up after OCR operation\r\n\r\ndef call_tesseract(input_filename, output_filename, bool_digits=False):\r\n    \"\"\"Calls external tesseract.exe on input file (restrictions on types),\r\n    outputting output_filename+'txt'\"\"\"\r\n    # args = [tesseract_exe_name, input_filename, output_filename]\r\n    if bool_digits:\r\n        # args = tesseract_exe_name+\" \"+input_filename+\" \"+output_filename+\" -l eng -psm 7 nobatch eng_digits\" # price\r\n        args = tesseract_exe_name+\" \"+input_filename+\" \"+output_filename+\" -l test_digits -psm 7 nobatch\" # price\r\n    else:\r\n        args = tesseract_exe_name+\" \"+input_filename+\" \"+output_filename+\" -l eng -psm 7 nobatch eng_characters\" # English letters\r\n        # args = tesseract_exe_name+\" \"+input_filename+\" \"+output_filename+\" -l test_eng -psm 7 nobatch\" # English letters\r\n    # print args\r\n    proc = subprocess.Popen(args, shell=True)\r\n    retcode = proc.wait()\r\n    if retcode != 0:\r\n        errors.check_for_errors()\r\n\r\ndef image_to_string(im, cleanup = cleanup_scratch_flag, bool_digits=False):\r\n    \"\"\"Converts im to file, applies tesseract, and fetches resulting text.\r\n    If cleanup=True, delete scratch files after operation.\"\"\"\r\n    try:\r\n        util.image_to_scratch(im, scratch_image_name)\r\n        call_tesseract(scratch_image_name, scratch_text_name_root, bool_digits)\r\n        text = util.retrieve_text(scratch_text_name_root)\r\n    finally:\r\n        if cleanup:\r\n            util.perform_cleanup(scratch_image_name, scratch_text_name_root)\r\n    return 
text\r\n\r\ndef image_file_to_string(filename, cleanup = cleanup_scratch_flag, graceful_errors=True, bool_digits=False):\r\n    \"\"\"Applies tesseract to filename; or, if image is incompatible and graceful_errors=True,\r\n    converts to compatible format and then applies tesseract.  Fetches resulting text.\r\n    If cleanup=True, delete scratch files after operation.\"\"\"\r\n    try:\r\n        try:\r\n            call_tesseract(filename, scratch_text_name_root, bool_digits)\r\n            text = util.retrieve_text(scratch_text_name_root)\r\n        except errors.Tesser_General_Exception:\r\n            if graceful_errors:\r\n                im = Image.open(filename)\r\n                text = image_to_string(im, cleanup, bool_digits)\r\n            else:\r\n                raise\r\n    finally:\r\n        if cleanup:\r\n            util.perform_cleanup(scratch_image_name, scratch_text_name_root)\r\n    return text\r\n"
  },
  {
    "path": "Captcha1/pytesser_pro/util.py",
    "content": "\"\"\"Utility functions for processing images for delivery to Tesseract\"\"\"\r\n\r\nimport os\r\n\r\ndef image_to_scratch(im, scratch_image_name):\r\n\t\"\"\"Saves image in memory to scratch file.  .bmp format will be read correctly by Tesseract\"\"\"\r\n\tim.save(scratch_image_name, dpi=(200,200))\r\n\r\ndef\tretrieve_text(scratch_text_name_root):\r\n\tinf = file(scratch_text_name_root + '.txt')\r\n\ttext = inf.read()\r\n\tinf.close()\r\n\treturn text\r\n\r\ndef perform_cleanup(scratch_image_name, scratch_text_name_root):\r\n\t\"\"\"Clean up temporary files from disk\"\"\"\r\n\tfor name in (scratch_image_name, scratch_text_name_root + '.txt', \"tesseract.log\"):\r\n\t\ttry:\r\n\t\t\tos.remove(name)\r\n\t\texcept OSError:\r\n\t\t\tpass\r\n"
  },
  {
    "path": "Captcha1/tess_test.py",
    "content": "# coding: utf-8\r\n\r\nimport os\r\nimport sys\r\nimport subprocess\r\nfrom pytesser_pro.pytesser_pro import *\r\nimport Image, ImageEnhance, ImageFilter\r\nfrom pylab import *\r\n\r\n\r\n\r\n# 二值化并转格式\r\ndef binary(image_name, binary_image_name):\r\n    # 白底黑字\r\n    args = \"convert -monochrome \"+image_name+\" \"+binary_image_name\r\n    # print args\r\n    proc = subprocess.Popen(args, shell=True)\r\n    proc.wait()\r\n    im = Image.open(binary_image_name)\r\n    w, h = im.size\r\n    data = list(im.getdata())\r\n    if (data[0], data[w-1], data[(h-1)*w], data[h*w-1]) == (0, 0, 0, 0): # 0-黑色，255-白色\r\n        # 若非白底黑字则灰度反转\r\n        args1 = \"convert -negate \"+binary_image_name+\" \"+binary_image_name\r\n        proc1 = subprocess.Popen(args1, shell=True)\r\n        proc1.wait()\r\n\r\n# 计算范围内点的个数\r\ndef numpoint(im):\r\n    w, h = im.size\r\n    # print w, h\r\n    data = list(im.getdata())\r\n    mumpoint = 0\r\n    for x in range(w):\r\n        for y in range(h):\r\n            if data[y*w+x] == 0: # 0-黑色，255-白色\r\n                mumpoint += 1\r\n    return mumpoint\r\n\r\n# 投影法去干扰线\r\ndef pointmidu(binary_image_name, midu_image_name):\r\n    im = Image.open(binary_image_name)\r\n    w, h = im.size\r\n    # print w, h\r\n    len = 5\r\n    for x in range(0, w, len):\r\n        box = (x, 0, x+len, h)\r\n        im_box = im.crop(box)\r\n        num = numpoint(im_box)\r\n        # print num\r\n        if num < 20:\r\n            for i in range(x, x+len):\r\n                for j in range(h):\r\n                    im.putpixel((i, j), 255) # 0-黑色，255-白色\r\n    data = list(im.getdata())\r\n    data_column = []\r\n    for x in range(w):\r\n        temp = 0\r\n        for y in range(h):\r\n            if data[y*w+x] == 0: # 0-黑色，255-白色\r\n                temp += 1\r\n        data_column.append(temp)\r\n    # print data_column\r\n    start = 0\r\n    for i in range(0, w, 1):\r\n        if data_column[i] != 0:\r\n            break\r\n        
else:\r\n            start += 1\r\n    # print start\r\n    end = w-1\r\n    for j in range(w-1, -1, -1):\r\n        if data_column[j] != 0:\r\n            break\r\n        else:\r\n            end += -1\r\n    # print end\r\n    box_new = (start, 0, end+1, h)\r\n    im_box_new = im.crop(box_new)\r\n    im_box_new.save(midu_image_name)\r\n\r\n# 图像增强\r\ndef filter_enhance(midu_image_name, midu_image_name_pro1):\r\n    im = Image.open(midu_image_name)\r\n    # 去噪\r\n    im = im.filter(ImageFilter.MedianFilter())\r\n    # 亮度加强\r\n    enhancer = ImageEnhance.Contrast(im)\r\n    im = enhancer.enhance(2)\r\n    im = im.convert('1')\r\n    # im.show()\r\n    im.save(midu_image_name_pro1)\r\n\r\n# 字符分割\r\ndef seg(midu_image_name_pro1, midu_image_name_pro2, num):\r\n    im = Image.open(midu_image_name_pro1)\r\n    w, h = im.size\r\n    # print w, h, w/num\r\n    len = 2\r\n    for i in range(num-1):\r\n        start = (i+1)*w/num\r\n        end = start+len\r\n        for m in range(start, end+1):\r\n            for n in range(h):\r\n                im.putpixel((m, n), 255) # 0-黑色，255-白色\r\n    im.save(midu_image_name_pro2)\r\n\r\ndef get_aim1_point(im):\r\n    aim = []\r\n    w, h = im.size\r\n    # print w, h\r\n    data = list(im.getdata())\r\n    for x in range(0, w, 1):\r\n        for y in range(0, h, 1):\r\n            if data[y*w+x] == 0: # 0-黑色，255-白色\r\n                start_point = (x, y)\r\n                # print start_point\r\n                aim.append(start_point)\r\n                break\r\n    return aim\r\n\r\ndef get_aim2_point(im):\r\n    aim = []\r\n    w, h = im.size\r\n    # print w, h\r\n    data = list(im.getdata())\r\n    for x in range(0, w, 1):\r\n        for y in range(h-1, -1, -1):\r\n            if data[y*w+x] == 0: # 0-黑色，255-白色\r\n                start_point = (x, y)\r\n                # print start_point\r\n                aim.append(start_point)\r\n                break\r\n    return aim\r\n\r\n\r\nif __name__=='__main__':\r\n\r\n    if 
len(sys.argv) == 1:\r\n        image_name = \"./pic/get_random.jpg\" # 验证码图片名称\r\n        digits = False\r\n        # image_name = \"./pic/get_price_img.png\" # 价格图片名称\r\n        # digits = True\r\n    elif len(sys.argv) == 2:\r\n        if sys.argv[1].find(\"get_random\") != -1:\r\n            image_name = sys.argv[1]\r\n            digits = False\r\n        elif sys.argv[1].find(\"get_price_img\") != -1:\r\n            image_name = sys.argv[1]\r\n            digits = True\r\n        else:\r\n            print \"Please Input the Correct Image Name!\"\r\n            sys.exit(0)\r\n    else:\r\n        print \"Too Many Arguments!\"\r\n        sys.exit(0)\r\n\r\n\r\n    # 二值化并转格式\r\n    binary_image_name = os.path.splitext(image_name)[0]+\"_binary.png\"\r\n    binary(image_name, binary_image_name)\r\n\r\n    im = Image.open(binary_image_name)\r\n    print im.format, im.size, im.mode\r\n\r\n\r\n    if digits:\r\n        text = image_file_to_string(binary_image_name, bool_digits=digits)\r\n        print text.replace(\"\\n\", \"\")\r\n    else:\r\n        # 投影法去干扰线\r\n        fpathandname , fext = os.path.splitext(binary_image_name)\r\n        midu_image_name = fpathandname+\"_midu\"+fext\r\n        pointmidu(binary_image_name, midu_image_name)\r\n\r\n\r\n        fpathandname , fext = os.path.splitext(midu_image_name)\r\n\r\n        # 去干扰线\r\n        # im = Image.open(midu_image_name)\r\n        # w, h = im.size\r\n        # data = list(im.getdata())\r\n        # aim1 = get_aim1_point(im)\r\n        # for x, y in aim1:\r\n        #     curr = data[y*w+x]\r\n        #     prev = data[(y-1)*w+x]\r\n        #     next = data[(y+1)*w+x]\r\n        #\r\n        #     if prev == 0 and next == 0: # 0-黑色，255-白色\r\n        #         continue\r\n        #     if prev == 0:\r\n        #         im.putpixel((x, y), 255)\r\n        #         im.putpixel((x, y-1), 255)\r\n        #     elif next == 0:\r\n        #         im.putpixel((x, y), 255)\r\n        #         im.putpixel((x, 
y+1), 255)\r\n        #     else:\r\n        #         im.putpixel((x, y), 255)\r\n        # data = list(im.getdata())\r\n        # aim2 = get_aim2_point(im)\r\n        # for x, y in aim2:\r\n        #     curr = data[y*w+x]\r\n        #     prev = data[(y-1)*w+x]\r\n        #     next = data[(y+1)*w+x]\r\n        #\r\n        #     if prev == 0 and next == 0: # 0-黑色，255-白色\r\n        #         continue\r\n        #     if prev == 0:\r\n        #         im.putpixel((x, y), 255)\r\n        #         im.putpixel((x, y-1), 255)\r\n        #     elif next == 0:\r\n        #         im.putpixel((x, y), 255)\r\n        #         im.putpixel((x, y+1), 255)\r\n        #     else:\r\n        #         im.putpixel((x, y), 255)\r\n        # midu_image_name_new = fpathandname+\"_new\"+fext\r\n        # im.save(midu_image_name_new)\r\n\r\n\r\n        # 图像增强\r\n        midu_image_name_pro1 = fpathandname+\"_pro1\"+fext\r\n        filter_enhance(midu_image_name, midu_image_name_pro1)\r\n        # 字符分割\r\n        # num = 4\r\n        # midu_image_name_pro2 = fpathandname+\"_pro2\"+fext\r\n        # seg(midu_image_name_pro1, midu_image_name_pro2, num)\r\n\r\n        # im = Image.open(midu_image_name)\r\n        # text = image_to_string(im)\r\n        # print text.replace(\"\\n\", \"\")\r\n        text = image_file_to_string(midu_image_name_pro1, bool_digits=digits)\r\n        print text.replace(\"\\n\", \"\")"
  },
  {
    "path": "NewsSpider/NewsSpider.py",
    "content": "# -*- coding: utf-8 -*-\r\nimport os\r\nimport sys\r\nimport urllib2\r\nimport requests\r\nimport re\r\nfrom lxml import etree\r\n\r\n\r\ndef StringListSave(save_path, filename, slist):\r\n    if not os.path.exists(save_path):\r\n        os.makedirs(save_path)\r\n    path = save_path+\"/\"+filename+\".txt\"\r\n    with open(path, \"w+\") as fp:\r\n        for s in slist:\r\n            fp.write(\"%s\\t\\t%s\\n\" % (s[0].encode(\"utf8\"), s[1].encode(\"utf8\")))\r\n\r\ndef Page_Info(myPage):\r\n    '''Regex'''\r\n    mypage_Info = re.findall(r'<div class=\"titleBar\" id=\".*?\"><h2>(.*?)</h2><div class=\"more\"><a href=\"(.*?)\">.*?</a></div></div>', myPage, re.S)\r\n    return mypage_Info\r\n\r\ndef New_Page_Info(new_page):\r\n    '''Regex(slowly) or Xpath(fast)'''\r\n    # new_page_Info = re.findall(r'<td class=\".*?\">.*?<a href=\"(.*?)\\.html\".*?>(.*?)</a></td>', new_page, re.S)\r\n    # # new_page_Info = re.findall(r'<td class=\".*?\">.*?<a href=\"(.*?)\">(.*?)</a></td>', new_page, re.S) # bugs\r\n    # results = []\r\n    # for url, item in new_page_Info:\r\n    #     results.append((item, url+\".html\"))\r\n    # return results\r\n    dom = etree.HTML(new_page)\r\n    new_items = dom.xpath('//tr/td/a/text()')\r\n    new_urls = dom.xpath('//tr/td/a/@href')\r\n    assert(len(new_items) == len(new_urls))\r\n    return zip(new_items, new_urls)\r\n\r\ndef Spider(url):\r\n    i = 0\r\n    print \"downloading \", url\r\n    myPage = requests.get(url).content.decode(\"gbk\")\r\n    # myPage = urllib2.urlopen(url).read().decode(\"gbk\")\r\n    myPageResults = Page_Info(myPage)\r\n    save_path = u\"网易新闻抓取\"\r\n    filename = str(i)+\"_\"+u\"新闻排行榜\"\r\n    StringListSave(save_path, filename, myPageResults)\r\n    i += 1\r\n    for item, url in myPageResults:\r\n        print \"downloading \", url\r\n        new_page = requests.get(url).content.decode(\"gbk\")\r\n        # new_page = urllib2.urlopen(url).read().decode(\"gbk\")\r\n        newPageResults 
= New_Page_Info(new_page)\r\n        filename = str(i)+\"_\"+item\r\n        StringListSave(save_path, filename, newPageResults)\r\n        i += 1\r\n\r\n\r\nif __name__ == '__main__':\r\n    print \"start\"\r\n    start_url = \"http://news.163.com/rank/\"\r\n    Spider(start_url)\r\n    print \"end\""
  },
  {
    "path": "NewsSpider/ReadMe.md",
    "content": "### 网络爬虫之最基本的爬虫：爬取[网易新闻排行榜](http://news.163.com/rank/)\r\n\r\n**一些说明：** \r\n\r\n* 使用urllib2或requests包来爬取页面。\r\n\r\n* 使用正则表达式分析一级页面，使用Xpath来分析二级页面。\r\n\r\n* 将得到的标题和链接，保存为本地文件。\r\n"
  },
  {
    "path": "QunarSpider/QunarSpider.py",
    "content": "#!/usr/bin/env python\r\n# -*- coding:utf-8 -*-\r\n\r\nimport os\r\nimport time\r\nimport datetime\r\nimport codecs\r\nimport multiprocessing as mp\r\nfrom os import makedirs\r\nfrom os.path import exists\r\nfrom selenium import webdriver\r\nfrom selenium.webdriver.common.proxy import *\r\n\r\n\r\nsite = 'http://flight.qunar.com'\r\nhot_city_list = [u'上海', u'北京', u'广州', u'深圳']\r\nnum = len(hot_city_list)\r\n\r\n\r\ndef one_driver_ticket(driver, from_city, to_city):\r\n    # time = datetime.datetime.now()\r\n    date = datetime.date.today()\r\n    tomorrow = date+datetime.timedelta(days=1)\r\n    # date格式转为string格式\r\n    tomorrow_string = tomorrow.strftime('%Y-%m-%d')\r\n\r\n    driver.find_element_by_name('fromCity').clear()\r\n    driver.find_element_by_name('fromCity').send_keys(from_city)\r\n    driver.find_element_by_name('toCity').clear()\r\n    driver.find_element_by_name('toCity').send_keys(to_city)\r\n    driver.find_element_by_name('fromDate').clear()\r\n    driver.find_element_by_name('fromDate').send_keys(tomorrow_string)\r\n    driver.find_element_by_xpath('//button[@type=\"submit\"]').click()\r\n    time.sleep(5) # 控制间隔时间，等待浏览器反映\r\n\r\n    flag = True\r\n    page_num = 0\r\n    while flag:\r\n        # 保存页面\r\n        # print driver.page_source\r\n        source_code = driver.find_element_by_xpath(\"//*\").get_attribute(\"outerHTML\")\r\n        print type(source_code)\r\n        dstdir = u'./ticket/'\r\n        if not exists(dstdir):\r\n            makedirs(dstdir)\r\n        f = codecs.open(dstdir+from_city+u','+to_city+unicode(tomorrow_string)+u','+unicode(str(page_num+1))+u'.html', 'w+', 'utf8')\r\n        f.write(source_code)\r\n        f.close()\r\n\r\n        next_page = None\r\n        try:\r\n            next_page = driver.find_element_by_id('nextXI3')\r\n        except Exception as e:\r\n            print e\r\n            pass\r\n        print \"page: %d\" % (page_num+1)\r\n        if next_page:\r\n            try:\r\n       
         next_page.click()\r\n                time.sleep(2) # 控制间隔时间，等待浏览器反映\r\n                page_num += 1\r\n            except Exception as e:\r\n                print 'next_page could not be clicked'\r\n                print e\r\n                flag = False\r\n        else:\r\n            flag = False\r\n\r\ndef get_proxy_list(file_path):\r\n    proxy_list = []\r\n    try:\r\n        f = open(file_path, 'r')\r\n        all_lines = f.readlines() # readlines()每次按行读取整个文件内容，将读取到的内容放到一个列表中，返回list类型。\r\n        for line in all_lines:\r\n            proxy_list.append(line.replace('\\r', '').replace('\\n', ''))\r\n        f.close()\r\n    except Exception as e:\r\n        print e\r\n    return proxy_list\r\n\r\ndef ticket_worker_proxy(city_proxy):\r\n    city = city_proxy.split(',')[0]\r\n    proxy = city_proxy.split(',')[1]\r\n    proxy = Proxy({\r\n        'proxyType': ProxyType.MANUAL,\r\n        'httpProxy': proxy,\r\n        'ftpProxy': proxy,\r\n        'sslProxy': proxy,\r\n        'noProxy': '' # 过滤不需要代理的地址\r\n    })\r\n    driver = webdriver.Firefox(proxy=proxy)\r\n    driver.get(site)\r\n    driver.maximize_window() # 将浏览器最大化显示\r\n    for i in xrange(num):\r\n        if city == hot_city_list[i]:\r\n            continue\r\n        from_city = city\r\n        to_city = hot_city_list[i]\r\n        one_driver_ticket(driver, from_city, to_city)\r\n    driver.close()\r\n\r\ndef all_ticket_proxy():\r\n    hot_city_proxy_list = []\r\n    proxy_list = get_proxy_list('./proxy/proxy.txt') # ./表示当前目录，../表示上一级目录\r\n    for i in xrange(num):\r\n        hot_city_proxy_list.append(hot_city_list[i]+','+proxy_list[i])\r\n    pool = mp.Pool(processes=1)\r\n    pool.map(ticket_worker_proxy, hot_city_proxy_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]\r\n    pool.close()\r\n    pool.join()\r\n\r\ndef ticket_worker_no_proxy(city):\r\n    driver = webdriver.Firefox()\r\n    # chromedriver = r'C:\\Program Files 
(x86)\\Google\\Chrome\\Application\\chromedriver.exe'\r\n    # os.environ['webdriver.chrome.driver'] = chromedriver\r\n    # driver = webdriver.Chrome(chromedriver)\r\n    driver.get(site)\r\n    driver.maximize_window() # 将浏览器最大化显示\r\n    time.sleep(5) # 控制间隔时间，等待浏览器反映\r\n    for i in xrange(num):\r\n        if city == hot_city_list[i]:\r\n            continue\r\n        from_city = city\r\n        to_city = hot_city_list[i]\r\n        one_driver_ticket(driver, from_city, to_city)\r\n    driver.close()\r\n\r\ndef all_ticket_no_proxy():\r\n    pool = mp.Pool(processes=1)\r\n    pool.map(ticket_worker_no_proxy, hot_city_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]\r\n    pool.close()\r\n    pool.join()\r\n\r\n\r\nif __name__ == '__main__':\r\n    print \"start\"\r\n    start = datetime.datetime.now()\r\n    # all_ticket_proxy() # proxy\r\n    all_ticket_no_proxy() # no proxy\r\n    end = datetime.datetime.now()\r\n    print \"end\"\r\n    print \"time: \", end-start\r\n"
  },
  {
    "path": "QunarSpider/ReadMe.md",
    "content": "### 网络爬虫之Selenium使用代理登陆：爬取[去哪儿](http://flight.qunar.com/)网站 \r\n\r\n**一些说明：**\r\n\r\n* 使用selenium模拟浏览器登陆，并实现翻页操作。\r\n\r\n* 代理可以存入一个文件，程序读取并使用。\r\n\r\n* 支持多进程抓取。"
  },
  {
    "path": "ReadMe.md",
    "content": "# [Python入门网络爬虫之精华版](https://github.com/lining0806/PythonSpiderNotes)\n\n*** \n\nPython学习网络爬虫主要分3个大的版块：**抓取**，**分析**，**存储**  \n\n另外，比较常用的爬虫框架[Scrapy](http://scrapy.org/)，这里最后也详细介绍一下。    \n\n首先列举一下本人总结的相关文章，这些覆盖了入门网络爬虫需要的基本概念和技巧：[宁哥的小站-网络爬虫](http://www.lining0806.com/category/spider/)  \n***\n\n当我们在浏览器中输入一个url后回车，后台会发生什么？比如说你输入[http://www.lining0806.com/](http://www.lining0806.com/)，你就会看到宁哥的小站首页。\n\n简单来说这段过程发生了以下四个步骤：\n\n* 查找域名对应的IP地址。\n* 向IP对应的服务器发送请求。\n* 服务器响应请求，发回网页内容。\n* 浏览器解析网页内容。\n\n网络爬虫要做的，简单来说，就是实现浏览器的功能。通过指定url，直接返回给用户所需要的数据，而不需要一步步人工去操纵浏览器获取。\n\n## 抓取  \n这一步，你要明确要得到的内容是什么？是HTML源码，还是Json格式的字符串等。  \n\n#### 1. 最基本的抓取  \n\n抓取大多数情况属于get请求，即直接从对方服务器上获取数据。  \n\n首先，Python中自带urllib及urllib2这两个模块，基本上能满足一般的页面抓取。另外，[requests](https://github.com/kennethreitz/requests)也是非常有用的包，与此类似的，还有[httplib2](https://github.com/jcgregorio/httplib2)等等。    \n\n```\nRequests：\n\timport requests\n\tresponse = requests.get(url)\n\tcontent = requests.get(url).content\n\tprint \"response headers:\", response.headers\n\tprint \"content:\", content\nUrllib2：\n\timport urllib2\n\tresponse = urllib2.urlopen(url)\n\tcontent = urllib2.urlopen(url).read()\n\tprint \"response headers:\", response.headers\n\tprint \"content:\", content\nHttplib2：\n\timport httplib2\n\thttp = httplib2.Http()\n\tresponse_headers, content = http.request(url, 'GET')\n\tprint \"response headers:\", response_headers\n\tprint \"content:\", content\n```  \n\n此外，对于带有查询字段的url，get请求一般会将来请求的数据附在url之后，以?分割url和传输数据，多个参数用&连接。  \n\n```\ndata = {'data1':'XXXXX', 'data2':'XXXXX'}\nRequests：data为dict，json\n\timport requests\n\tresponse = requests.get(url=url, params=data)\nUrllib2：data为string\n\timport urllib, urllib2    \n\tdata = urllib.urlencode(data)\n\tfull_url = url+'?'+data\n\tresponse = 
urllib2.urlopen(full_url)\n```\n\n相关参考：[网易新闻排行榜抓取回顾](http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/)\n\n参考项目：[网络爬虫之最基本的爬虫：爬取网易新闻排行榜](https://github.com/lining0806/PythonSpiderNotes/blob/master/NewsSpider)\n\n### 2. 对于登陆情况的处理  \n\n**2.1 使用表单登陆**  \n\n这种情况属于post请求，即先向服务器发送表单数据，服务器再将返回的cookie存入本地。  \n\n```\ndata = {'data1':'XXXXX', 'data2':'XXXXX'}\nRequests：data为dict，json\n\timport requests\n\tresponse = requests.post(url=url, data=data)\nUrllib2：data为string\n\timport urllib, urllib2    \n\tdata = urllib.urlencode(data)\n\treq = urllib2.Request(url=url, data=data)\n\tresponse = urllib2.urlopen(req)\n```  \n\n**2.2 使用cookie登陆**  \n\n使用cookie登陆，服务器会认为你是一个已登陆的用户，所以就会返回给你一个已登陆的内容。因此，需要验证码的情况可以使用带验证码登陆的cookie解决。  \n\n```\nimport requests\t\t\t\nrequests_session = requests.session() \nresponse = requests_session.post(url=url_login, data=data) \n```\n\n若存在验证码，此时采用response = requests_session.post(url=url_login, data=data)是不行的，做法应该如下：  \n\n```\nresponse_captcha = requests_session.get(url=url_login, cookies=cookies)\nresponse1 = requests.get(url_login) # 未登陆\nresponse2 = requests_session.get(url_login) # 已登陆，因为之前拿到了Response Cookie！\nresponse3 = requests_session.get(url_results) # 已登陆，因为之前拿到了Response Cookie！\n```\n\n相关参考：[网络爬虫-验证码登陆](http://www.lining0806.com/6-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB-%E9%AA%8C%E8%AF%81%E7%A0%81%E7%99%BB%E9%99%86/)  \n\n参考项目：[网络爬虫之用户名密码及验证码登陆：爬取知乎网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/ZhihuSpider)  \n\n### 3. 
对于反爬虫机制的处理 \n\n**3.1 使用代理** \n\n适用情况：限制IP地址情况，也可解决由于“频繁点击”而需要输入验证码登陆的情况。  \n\n这种情况最好的办法就是维护一个代理IP池，网上有很多免费的代理IP，良莠不齐，可以通过筛选找到能用的。对于“频繁点击”的情况，我们还可以通过限制爬虫访问网站的频率来避免被网站禁掉。\n\n```\nproxies = {'http':'http://XX.XX.XX.XX:XXXX'}\nRequests：\n\timport requests\n\tresponse = requests.get(url=url, proxies=proxies)\nUrllib2：\n\timport urllib2\n\tproxy_support = urllib2.ProxyHandler(proxies)\n\topener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)\n\turllib2.install_opener(opener) # 安装opener，此后调用urlopen()时都会使用安装过的opener对象\n\tresponse = urllib2.urlopen(url)\n```\n\n**3.2 时间设置** \n\n适用情况：限制频率情况。 \n\nRequests，Urllib2都可以使用time库的sleep()函数：\n\n```\nimport time\ntime.sleep(1)\n```\n\n**3.3 伪装成浏览器，或者反“反盗链”**  \n\n有些网站会检查你是不是真的浏览器访问，还是机器自动访问的。这种情况，加上User-Agent，表明你是浏览器访问即可。有时还会检查是否带Referer信息还会检查你的Referer是否合法，一般再加上Referer。\n\n```\nheaders = {'User-Agent':'XXXXX'} # 伪装成浏览器访问，适用于拒绝爬虫的网站\nheaders = {'Referer':'XXXXX'}\nheaders = {'User-Agent':'XXXXX', 'Referer':'XXXXX'}\nRequests：\n\tresponse = requests.get(url=url, headers=headers)\nUrllib2：\n\timport urllib, urllib2   \n\treq = urllib2.Request(url=url, headers=headers)\n\tresponse = urllib2.urlopen(req)\n```\n\n### 4. 对于断线重连  \n\n不多说。\n\n```\ndef multi_session(session, *arg):\n\tretryTimes = 20\n\twhile retryTimes>0:\n\t\ttry:\n\t\t\treturn session.post(*arg)\n\t\texcept:\n\t\t\tprint '.',\n\t\t\tretryTimes -= 1\n```\n\n或者  \n\n```\ndef multi_open(opener, *arg):\n\tretryTimes = 20\n\twhile retryTimes>0:\n\t\ttry:\n\t\t\treturn opener.open(*arg)\n\t\texcept:\n\t\t\tprint '.',\n\t\t\tretryTimes -= 1\n```\n\n这样我们就可以使用multi_session或multi_open对爬虫抓取的session或opener进行保持。    \n\n### 5. 
多进程抓取  \n\n这里针对[华尔街见闻](http://live.wallstreetcn.com/)进行并行抓取的实验对比：[Python多进程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Python) 与 [Java单线程和多线程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Java)  \n\n相关参考：[关于Python和Java的多进程多线程计算方法对比](http://www.lining0806.com/%E5%85%B3%E4%BA%8Epython%E5%92%8Cjava%E7%9A%84%E5%A4%9A%E8%BF%9B%E7%A8%8B%E5%A4%9A%E7%BA%BF%E7%A8%8B%E8%AE%A1%E7%AE%97%E6%96%B9%E6%B3%95%E5%AF%B9%E6%AF%94/)  \n\n### 6. 对于Ajax请求的处理  \n\n对于“加载更多”情况，使用Ajax来传输很多数据。\n\n它的工作原理是：从网页的url加载网页的源代码之后，会在浏览器里执行JavaScript程序。这些程序会加载更多的内容，“填充”到网页里。这就是为什么如果你直接去爬网页本身的url，你会找不到页面的实际内容。  \n\n这里，若使用Google Chrome分析“请求”对应的链接(方法：右键→审查元素→Network→清空，点击“加载更多”，出现对应的GET链接寻找Type为text/html的，点击，查看get参数或者复制Request URL)，重复这一过程。  \n\n* 如果“请求”之前有页面，依据上一步的网址进行分析推导第1页。以此类推，抓取Ajax地址的数据。  \n* 对返回的json格式数据(str)进行正则匹配。json格式数据中，需从'\\\\uxxxx'形式的unicode_escape编码转换成u'\\uxxxx'的unicode编码。  \n\n### 7. 自动化测试工具Selenium\n\nSelenium是一款自动化测试工具。它能实现操纵浏览器，包括字符填充、鼠标点击、获取元素、页面切换等一系列操作。总之，凡是浏览器能做的事，Selenium都能够做到。\n\n这里列出在给定城市列表后，使用selenium来动态抓取[去哪儿网](http://flight.qunar.com/)的票价信息的代码。\n\n参考项目：[网络爬虫之Selenium使用代理登陆：爬取去哪儿网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/QunarSpider) \n\n### 8. 
验证码识别  \n\n对于网站有验证码的情况，我们有三种办法：  \n\n* 使用代理，更新IP。\n* 使用cookie登陆。\n* 验证码识别。\n\n使用代理和使用cookie登陆之前已经讲过，下面讲一下验证码识别。  \n\n可以利用开源的Tesseract-OCR系统进行验证码图片的下载及识别，将识别的字符传到爬虫系统进行模拟登陆。当然也可以将验证码图片上传到打码平台上进行识别。如果不成功，可以再次更新验证码识别，直到成功为止。  \n\n参考项目：[验证码识别项目第一版：Captcha1](https://github.com/lining0806/PythonSpiderNotes/blob/master/Captcha1)\n\n**爬取有两个需要注意的问题：**\n\n* 如何监控一系列网站的更新情况，也就是说，如何进行增量式爬取？\n* 对于海量数据，如何实现分布式爬取？\n\n## 分析  \n\n抓取之后就是对抓取的内容进行分析，你需要什么内容，就从中提炼出相关的内容来。  \n\n常见的分析工具有[正则表达式](http://deerchao.net/tutorials/regex/regex.htm)，[BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/)，[lxml](http://lxml.de/)等等。  \n\n## 存储  \n\n分析出我们需要的内容之后，接下来就是存储了。  \n\n我们可以选择存入文本文件，也可以选择存入[MySQL](http://www.mysql.com/)或[MongoDB](https://www.mongodb.org/)数据库等。  \n\n**存储有两个需要注意的问题：**\n\n* 如何进行网页去重？\n* 内容以什么形式存储？\n\n\n## Scrapy  \n\nScrapy是一个基于Twisted的开源的Python爬虫框架，在工业中应用非常广泛。  \n\n相关内容可以参考[基于Scrapy网络爬虫的搭建](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/)，同时给出这篇文章介绍的[微信搜索](http://weixin.sogou.com/weixin)爬取的项目代码，给大家作为学习参考。\n\n参考项目：[使用Scrapy或Requests递归抓取微信搜索结果](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects)\n\n## Robots协议  \n\n好的网络爬虫，首先需要遵守**Robots协议**。Robots协议（也称为爬虫协议、机器人协议等）的全称是“网络爬虫排除标准”（Robots Exclusion Protocol），网站通过Robots协议告诉搜索引擎哪些页面可以抓取，哪些页面不能抓取。\n\n在网站根目录下放一个robots.txt文本文件（如 https://www.taobao.com/robots.txt ），里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面，指定的页面由正则表达式表示。网络爬虫在采集这个网站之前，首先获取到这个robots.txt文本文件，然后解析到其中的规则，然后根据规则来采集网站的数据。\n\n### 1. Robots协议规则\n\n\tUser-agent: 指定对哪些爬虫生效\n\tDisallow: 指定不允许访问的网址\n\tAllow: 指定允许访问的网址\n\n注意: 第一个英文字母要大写，冒号是英文状态下的冒号，冒号后面有一个空格，\"/\"代表整个网站\n\n### 2. 
Robots协议举例\n\n\t禁止所有机器人访问\n\t\tUser-agent: *\n\t\tDisallow: /\n\t允许所有机器人访问\n\t\tUser-agent: *\n\t\tDisallow: \n\t禁止特定机器人访问\n\t\tUser-agent: BadBot\n\t\tDisallow: /\n\t允许特定机器人访问\n\t\tUser-agent: GoodBot\n\t\tDisallow: \n\t禁止访问特定目录\n\t\tUser-agent: *\n\t\tDisallow: /images/\n\t仅允许访问特定目录\n\t\tUser-agent: *\n\t\tAllow: /images/\n\t\tDisallow: /\n\t禁止访问特定文件\n\t\tUser-agent: *\n\t\tDisallow: /*.html$\n\t仅允许访问特定文件\n\t\tUser-agent: *\n\t\tAllow: /*.html$\n\t\tDisallow: /"
  },
  {
    "path": "Spider_Java/README.md",
    "content": "### Spider_Java\n\n抓取网址：[华尔街见闻](http://live.wallstreetcn.com/)\n\n单线程抓取 Spider_Java1\n\n多线程抓取 Spider_Java2\n"
  },
  {
    "path": "Spider_Java/Spider_Java1/.classpath",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<classpath>\n\t<classpathentry kind=\"src\" path=\"src\"/>\n\t<classpathentry kind=\"con\" path=\"org.eclipse.jdt.launching.JRE_CONTAINER\"/>\n\t<classpathentry kind=\"lib\" path=\"lib/mongo-java-driver-2.13.0-rc1.jar\"/>\n\t<classpathentry kind=\"output\" path=\"bin\"/>\n</classpath>\n"
  },
  {
    "path": "Spider_Java/Spider_Java1/.project",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<projectDescription>\n\t<name>Spider</name>\n\t<comment></comment>\n\t<projects>\n\t</projects>\n\t<buildSpec>\n\t\t<buildCommand>\n\t\t\t<name>org.eclipse.jdt.core.javabuilder</name>\n\t\t\t<arguments>\n\t\t\t</arguments>\n\t\t</buildCommand>\n\t</buildSpec>\n\t<natures>\n\t\t<nature>org.eclipse.jdt.core.javanature</nature>\n\t</natures>\n</projectDescription>\n"
  },
  {
    "path": "Spider_Java/Spider_Java1/src/synchronizetest/Test.java",
    "content": "/**\n * \n */\npackage synchronizetest;\n\n/**\n * @author FIRELING\n *\n */\npublic class Test\n{\n\tpublic static void main(String[] args)\n\t{\n\t\tReservoir r = new Reservoir(100);\n\t\tBooth b1 = new Booth(r);\n\t\tBooth b2 = new Booth(r);\n\t\tBooth b3 = new Booth(r);\n\t}\n}\n/**\n * contain shared resource\n */\nclass Reservoir {\n\tprivate int total;\n\tpublic Reservoir(int t) \n\t{\n\t\tthis.total = t;\n\t}\n\t/**\n\t * Thread safe method\n\t * serialized access to Booth.total\n\t */\n\tpublic synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法\n\t{\n\t\tif(this.total > 0) {\n\t\t\tthis.total = this.total-1;\n\t\t\treturn true; // successfully sell one\n\t\t}\n\t\telse {\n\t\t\treturn false; // no more tickets\n\t\t}\n\t}\n}\n/**\n * create new thread by inheriting Thread\n */\nclass Booth extends Thread {\n\tprivate static int threadID = 0; // owned by Class object\n\n\tprivate Reservoir release; // sell this reservoir \n\tprivate int count = 0; // owned by this thread object\n\t/**\n\t * constructor\n\t */\n\tpublic Booth(Reservoir r) {\n\t\tsuper(\"ID:\"+(++threadID));\n\t\tthis.release = r; // all threads share the same reservoir\n\t\tthis.start();\n\t}\n\t/**\n\t * convert object to string\n\t */\n\tpublic String toString() {\n\t\treturn super.getName();\n\t}\n\t/**\n\t * what does the thread do?\n\t */\n\tpublic void run() {\n\t\twhile(true) { // 循环体！！！\n\t\t\tif(this.release.sellTicket()) {\n\t\t\t\tthis.count = this.count+1;\n\t\t\t\tSystem.out.println(this.getName()+\":sell 1\");\n\t\t\t\ttry {\n\t\t\t\t\tsleep((int) Math.random()*100); // random intervals\n\t\t\t\t\t// sleep(100); // 若sleep时间相同，则每个窗口买票相当\n\t\t\t\t}\n\t\t\t\tcatch (InterruptedException e) {\n\t\t\t\t\tthrow new RuntimeException(e);\n\t\t\t\t}\n\t\t\t}\n\t\t\telse {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t}\n\t\tSystem.out.println(this.getName()+\" I sold:\"+count);\n\t}\n}\n\n"
  },
  {
    "path": "Spider_Java/Spider_Java1/src/wallstreetcnsave/WallstreetcnSaveTest.java",
    "content": "package wallstreetcnsave;\n\nimport java.io.BufferedReader;\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.InputStreamReader;\nimport java.net.HttpURLConnection;\nimport java.net.URL;\nimport java.text.DateFormat;\nimport java.util.ArrayList;\nimport java.util.Date;\nimport java.util.HashMap;\nimport java.util.List;\nimport java.util.Map;\nimport java.util.regex.Matcher;\nimport java.util.regex.Pattern;\n\nimport com.mongodb.BasicDBObject;\nimport com.mongodb.DB;\nimport com.mongodb.DBCollection;\nimport com.mongodb.Mongo;\n\npublic class WallstreetcnSaveTest implements Runnable {\n\t\n\tprivate static String DataBaseName = \"textclassify\";\n\tprivate static String CollectionName = \"WallstreetSaveJava\";\n\n\tprivate static String url = \"http://api.wallstreetcn.com/v2/livenews?&page=\";\n\t\n\tprivate static String Regex = \".*?\\\"type\\\":\\\"(.*?)\\\".*?\\\"contentHtml\\\":\\\"<p>(.*?)<\\\\\\\\/p>\\\".*?\\\"categorySet\\\":\\\"(.*?)\\\".*?\";\n\tprivate static final String REGEXSTRING1 = \"type\";\n\tprivate static final String REGEXSTRING2 = \"content\";\n\tprivate static final String REGEXSTRING3 = \"categoryset\";\n\t\n\t//map表的存放\n\tpublic static Map<String, String> GetMap() {\n\t\tMap<String, String> map = new HashMap<String, String>();\n\t\tmap.put(\"1\", \"外汇\");\n\t\tmap.put(\"2\", \"股市\");\n\t\tmap.put(\"3\", \"商品\");\n\t\tmap.put(\"4\", \"债市\");\n\t\tmap.put(\"9\", \"中国\");\n\t\tmap.put(\"10\", \"美国\");\n\t\tmap.put(\"11\", \"欧元区\");\n\t\tmap.put(\"12\", \"日本\");\n\t\tmap.put(\"13\", \"英国\");\n\t\tmap.put(\"14\", \"澳洲\");\n\t\tmap.put(\"15\", \"加拿大\");\n\t\tmap.put(\"16\", \"瑞士\");\n\t\tmap.put(\"17\", \"其他地区\");\n\t\tmap.put(\"5\", \"央行\");\n\t\treturn map;\n\t}\n\tprivate static String[] ruleList_district = { \"9\", \"10\", \"11\", \"12\", \"13\", \"14\", \"15\", \"16\", \"17\" };\n\tprivate static String[] ruleList_property = { \"1\", \"2\", \"3\", \"4\" };\n\tprivate static String[] ruleList_centralbank 
= { \"5\" };\n\t\n\tprivate static final int start = 1;\n\tprivate static final int end = 3000;\n\t\n\t//对x,x,x格式的内容进行分隔筛选\n\tpublic static String setCategory(String categorySet, String[] ruleList, Map<String, String> map) {\n\t\tStringBuffer disStr = new StringBuffer(); \n\t\tString[] strArray = null;\n\t\tstrArray = categorySet.split(\",\"); // 拆分字符为\",\",然后把结果交给数组strArray\n\t\t// 获取需要的信息\n\t\tint length_strArray = strArray.length;\n\t\tint length_ruleList = ruleList.length;\n\t\t\n\t\tif (length_strArray > 0) {\n\t\t\tfor (int iArr = 0; iArr < length_strArray; iArr++) {\n\t\t\t\tString s = strArray[iArr];\n\t\t\t\t\tfor (int iRul=0; iRul < length_ruleList; iRul++) {\n\t\t\t\t\t\tif (s.equals(ruleList[iRul])) {\n\t\t\t\t\t\t\tdisStr.append(map.get(s));\n\t\t\t\t\t\t\tdisStr.append(\",\");\n\t\t\t\t\t\t\t\tbreak;\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\tif(disStr.length()>1) {\n\t\t\t\tdisStr = disStr.deleteCharAt(disStr.length()-1);\n\t\t\t}\n\t\t\treturn disStr.toString();\n\t\t}\n\t\n\t//读取整个页面，返回html字符串\n\tprivate static String httpRequest(String requestUrl) {\n\t\tStringBuffer buffer = null;\n\t\tBufferedReader bufferedReader = null;\n\t\tInputStreamReader inputStreamReader = null;\n\t\tInputStream inputStream = null;\n\t\tHttpURLConnection httpUrlConn = null;\n\t\ttry {\n\t\t\t// 建立get请求\n\t\t\tURL url = new URL(requestUrl);\n\t\t\thttpUrlConn = (HttpURLConnection) url.openConnection();\n\t\t\thttpUrlConn.setDoInput(true);\n\t\t\thttpUrlConn.setRequestMethod(\"GET\");\n\t\t\t// 获取输入流\n\t\t\tinputStream = httpUrlConn.getInputStream();\n\t\t\tinputStreamReader = new InputStreamReader(inputStream, \"UTF-8\");\n\t\t\tbufferedReader = new BufferedReader(inputStreamReader);\n\t\t\t// 从输入流获取结果\n\t\t\tbuffer = new StringBuffer();\n\t\t\tString str = null;\n\t\t\twhile ((str = bufferedReader.readLine()) != null) {\n\t\t\t\tstr = new String(str.getBytes(), \"UTF-8\");\n\t\t\t\tbuffer.append(str);\n\t\t\t}\n\t\t} catch (Exception e) 
{\n\t\t\te.printStackTrace();\n\t\t} finally {\n\t\t\tif (bufferedReader != null) {\n\t\t\t\ttry {\n\t\t\t\t\tbufferedReader.close();\n\t\t\t\t} catch (IOException e) {\n\t\t\t\t\te.printStackTrace();\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (inputStreamReader != null) {\n\t\t\t\ttry {\n\t\t\t\t\tinputStreamReader.close();\n\t\t\t\t} catch (IOException e) {\n\t\t\t\t\te.printStackTrace();\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (inputStream != null) {\n\t\t\t\ttry {\n\t\t\t\t\tinputStream.close();\n\t\t\t\t} catch (IOException e) {\n\t\t\t\t\te.printStackTrace();\n\t\t\t\t}\n\t\t\t}\n\t\t\tif (httpUrlConn != null) {\n\t\t\t\thttpUrlConn.disconnect();\n\t\t\t}\n\t\t}\n\t\treturn buffer.toString();\n\t}\n\n\t// 过滤掉无用的信息\n\tpublic static List<Map<String, String>> htmlFiter(String html, String Regex) {\n\t\tList<Map<String, String>> list = new ArrayList<Map<String, String>>();\n\t\t// 查找目标\n\t\tPattern p = Pattern.compile(Regex);\n\t\tMatcher m = p.matcher(html);\n\t\twhile (m.find()) {\n\t\t\tMap<String, String> map_save = new HashMap<String, String>();\n\t\t\t// 可修改部分\n\t\t\tmap_save.put(REGEXSTRING1, m.group(1));\n\t\t\tmap_save.put(REGEXSTRING2, m.group(2));\n\t\t\tmap_save.put(REGEXSTRING3, m.group(3));\n\t\t\t\n\t\t\tlist.add(map_save);\n\t\t}\n\t\treturn list;\n\t}\n\t\n\t//unicode格式转中文\n\tpublic static String UnicodeToString(String str) {\n\t\t\tPattern pattern = Pattern.compile(\"(\\\\\\\\u(\\\\p{XDigit}{4}))\"); // XDigit表示16进制数字，正则里的\\p表示Unicode块\n\t\t\tMatcher matcher = pattern.matcher(str);\n\t\t\tchar ch;\n\t\t\twhile (matcher.find()) {\n\t\t\t\tch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码，再char转为字符\n\t\t\t\tstr = str.replace(matcher.group(1), ch + \"\");\n\t\t\t}\n\t\t\treturn str;\n\t\t}\n\t\n\tpublic void run() {\n\t\t// 链接数据库\n\t\ttry {\n\t\t\tMongo mongo = new Mongo(\"localhost\", 27017);\n\t\t\tDB db = mongo.getDB(DataBaseName);\n\t\t\tDBCollection collection = db.getCollection(CollectionName);\n\t\t\t\n\t\t\t// 调用抓取的方法获取内容\n\t\t\tfor (int i 
= start; i <= end; i++) {\n\t\t\t\tString requestUrl = url + i;\n\t\t\t\tSystem.out.println(requestUrl);\n\t\t\t\t\n\t\t\t\tString html = httpRequest(requestUrl);\n\t\t\t\tList<Map<String, String>> resultList = htmlFiter(html, Regex);\n\t\t\t\t\n\t\t\t\tif (resultList.isEmpty()) {\n\t\t\t\t\tSystem.out.printf(\"The end url: %s\", requestUrl);\n\t\t\t\t\tbreak;\n\t\t\t\t} else {\n\t\t\t\t\tfor (Map<String, String> result : resultList) {\n\t\t\t\t\t\tBasicDBObject dbObject = new BasicDBObject();\n\t\t\t\t\t\t\n\t\t\t\t\t\tString type = result.get(REGEXSTRING1);\n\t\t\t\t\t\tString content = UnicodeToString(result.get(REGEXSTRING2));\n//\t\t\t\t\t\tString content = result.get(REGEXSTRING2);\n\t\t\t\t\t\t\n\t\t\t\t\t\tMap<String, String> map = GetMap();\n\t\t\t\t\t\tString district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); \n\t\t\t\t\t\tString property = setCategory(result.get(REGEXSTRING3), ruleList_property, map);\n\t\t\t\t\t\tString centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map);\n\t\t\t\t\t\t\n\t\t\t\t\t\tDate date = new Date();\n\t\t\t\t\t\tDateFormat time = DateFormat.getDateTimeInstance();\n\t\t\t\t\t\tString time_str = time.format(date);\n\t\t\t\t\t\t\n\t\t\t\t\t\tString source = \"wangstreetcn\";\n\n\t\t\t\t\t\tdbObject.put(\"content\", content);       // 具体内容\n\t\t\t\t\t\tdbObject.put(\"createdtime\", time_str);   // 创建时间\n\t\t\t\t\t\tdbObject.put(\"source\", source);          // 信息来源\n\t\t\t\t\t\tdbObject.put(\"district\", district);      // 所属地区\n\t\t\t\t\t\tdbObject.put(\"property\", property);      // 资产类别\n\t\t\t\t\t\tdbObject.put(\"centralbank\", centralbank); // 资产类别\n\t\t\t\t\t\tdbObject.put(\"type\", type); //信息类型\n\t\t\t\t\t\t\n\t\t\t\t\t\tcollection.insert(dbObject);\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t} catch (Exception e) {\n\t\t\te.printStackTrace();\n\t\t} \n\t}\n\t\n\t\n\tpublic static void main(String[] args) throws InterruptedException {\n\t\tWallstreetcnSaveTest wallstreetcnsave = new 
WallstreetcnSaveTest();\n\t\twallstreetcnsave.run();\n\t}\n\n}\n"
  },
  {
    "path": "Spider_Java/Spider_Java2/.classpath",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<classpath>\n\t<classpathentry kind=\"src\" path=\"src\"/>\n\t<classpathentry kind=\"con\" path=\"org.eclipse.jdt.launching.JRE_CONTAINER\"/>\n\t<classpathentry kind=\"lib\" path=\"lib/mongo-java-driver-2.13.0-rc1.jar\"/>\n\t<classpathentry kind=\"output\" path=\"bin\"/>\n</classpath>\n"
  },
  {
    "path": "Spider_Java/Spider_Java2/.project",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<projectDescription>\r\n\t<name>Spider</name>\r\n\t<comment></comment>\r\n\t<projects>\r\n\t</projects>\r\n\t<buildSpec>\r\n\t\t<buildCommand>\r\n\t\t\t<name>org.eclipse.jdt.core.javabuilder</name>\r\n\t\t\t<arguments>\r\n\t\t\t</arguments>\r\n\t\t</buildCommand>\r\n\t</buildSpec>\r\n\t<natures>\r\n\t\t<nature>org.eclipse.jdt.core.javanature</nature>\r\n\t</natures>\r\n</projectDescription>\r\n"
  },
  {
    "path": "Spider_Java/Spider_Java2/src/synchronizetest/Test.java",
    "content": "/**\r\n * \r\n */\r\npackage synchronizetest;\r\n\r\n/**\r\n * @author FIRELING\r\n *\r\n */\r\npublic class Test\r\n{\r\n\tpublic static void main(String[] args)\r\n\t{\r\n\t\tReservoir r = new Reservoir(100);\r\n\t\tBooth b1 = new Booth(r);\r\n\t\tBooth b2 = new Booth(r);\r\n\t\tBooth b3 = new Booth(r);\r\n\t}\r\n}\r\n/**\r\n * contain shared resource\r\n */\r\nclass Reservoir {\r\n\tprivate int total;\r\n\tpublic Reservoir(int t) \r\n\t{\r\n\t\tthis.total = t;\r\n\t}\r\n\t/**\r\n\t * Thread safe method\r\n\t * serialized access to Booth.total\r\n\t */\r\n\tpublic synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法\r\n\t{\r\n\t\tif(this.total > 0) {\r\n\t\t\tthis.total = this.total-1;\r\n\t\t\treturn true; // successfully sell one\r\n\t\t}\r\n\t\telse {\r\n\t\t\treturn false; // no more tickets\r\n\t\t}\r\n\t}\r\n}\r\n/**\r\n * create new thread by inheriting Thread\r\n */\r\nclass Booth extends Thread {\r\n\tprivate static int threadID = 0; // owned by Class object\r\n\r\n\tprivate Reservoir release; // sell this reservoir \r\n\tprivate int count = 0; // owned by this thread object\r\n\t/**\r\n\t * constructor\r\n\t */\r\n\tpublic Booth(Reservoir r) {\r\n\t\tsuper(\"ID:\"+(++threadID));\r\n\t\tthis.release = r; // all threads share the same reservoir\r\n\t\tthis.start();\r\n\t}\r\n\t/**\r\n\t * convert object to string\r\n\t */\r\n\tpublic String toString() {\r\n\t\treturn super.getName();\r\n\t}\r\n\t/**\r\n\t * what does the thread do?\r\n\t */\r\n\tpublic void run() {\r\n\t\twhile(true) { // 循环体！！！\r\n\t\t\tif(this.release.sellTicket()) {\r\n\t\t\t\tthis.count = this.count+1;\r\n\t\t\t\tSystem.out.println(this.getName()+\":sell 1\");\r\n\t\t\t\ttry {\r\n\t\t\t\t\tsleep((int) Math.random()*100); // random intervals\r\n\t\t\t\t\t// sleep(100); // 若sleep时间相同，则每个窗口买票相当\r\n\t\t\t\t}\r\n\t\t\t\tcatch (InterruptedException e) {\r\n\t\t\t\t\tthrow new RuntimeException(e);\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\telse 
{\r\n\t\t\t\tbreak;\r\n\t\t\t}\r\n\t\t}\r\n\t\tSystem.out.println(this.getName()+\" I sold:\"+count);\r\n\t}\r\n}\r\n\r\n"
  },
  {
    "path": "Spider_Java/Spider_Java2/src/wallstreetcnsave/WallstreetcnSaveTest.java",
    "content": "package wallstreetcnsave;\r\n\r\nimport java.io.BufferedReader;\r\nimport java.io.IOException;\r\nimport java.io.InputStream;\r\nimport java.io.InputStreamReader;\r\nimport java.net.HttpURLConnection;\r\nimport java.net.URL;\r\nimport java.text.DateFormat;\r\nimport java.util.ArrayList;\r\nimport java.util.Date;\r\nimport java.util.HashMap;\r\nimport java.util.List;\r\nimport java.util.Map;\r\nimport java.util.regex.Matcher;\r\nimport java.util.regex.Pattern;\r\n\r\nimport com.mongodb.BasicDBObject;\r\nimport com.mongodb.DB;\r\nimport com.mongodb.DBCollection;\r\nimport com.mongodb.Mongo;\r\n\r\n\r\nclass WallstreetcnSave implements Runnable {\r\n\t\r\n\tprivate GetrequestUrl release;\r\n\tpublic WallstreetcnSave(GetrequestUrl url) {\r\n\t\tthis.release = url; // all threads share the same GetrequestUrl\r\n\t}\r\n\t\r\n\tprivate static String DataBaseName = \"textclassify\";\r\n\tprivate static String CollectionName = \"WallstreetSaveJava\";\r\n\t\r\n\tprivate static String Regex = \".*?\\\"type\\\":\\\"(.*?)\\\".*?\\\"contentHtml\\\":\\\"<p>(.*?)<\\\\\\\\/p>\\\".*?\\\"categorySet\\\":\\\"(.*?)\\\".*?\";\r\n\tprivate static final String REGEXSTRING1 = \"type\";\r\n\tprivate static final String REGEXSTRING2 = \"content\";\r\n\tprivate static final String REGEXSTRING3 = \"categoryset\";\r\n\t\r\n\t//map表的存放\r\n\tpublic static Map<String, String> GetMap() {\r\n\t\tMap<String, String> map = new HashMap<String, String>();\r\n\t\tmap.put(\"1\", \"外汇\");\r\n\t\tmap.put(\"2\", \"股市\");\r\n\t\tmap.put(\"3\", \"商品\");\r\n\t\tmap.put(\"4\", \"债市\");\r\n\t\tmap.put(\"9\", \"中国\");\r\n\t\tmap.put(\"10\", \"美国\");\r\n\t\tmap.put(\"11\", \"欧元区\");\r\n\t\tmap.put(\"12\", \"日本\");\r\n\t\tmap.put(\"13\", \"英国\");\r\n\t\tmap.put(\"14\", \"澳洲\");\r\n\t\tmap.put(\"15\", \"加拿大\");\r\n\t\tmap.put(\"16\", \"瑞士\");\r\n\t\tmap.put(\"17\", \"其他地区\");\r\n\t\tmap.put(\"5\", \"央行\");\r\n\t\treturn map;\r\n\t}\r\n\tprivate static String[] ruleList_district = { \"9\", \"10\", 
\"11\", \"12\", \"13\", \"14\", \"15\", \"16\", \"17\" };\r\n\tprivate static String[] ruleList_property = { \"1\", \"2\", \"3\", \"4\" };\r\n\tprivate static String[] ruleList_centralbank = { \"5\" };\r\n\t\r\n\t//对x,x,x格式的内容进行分隔筛选\r\n\tpublic static String setCategory(String categorySet, String[] ruleList, Map<String, String> map) {\r\n\t\tStringBuffer disStr = new StringBuffer(); \r\n\t\tString[] strArray = null;\r\n\t\tstrArray = categorySet.split(\",\"); // 拆分字符为\",\",然后把结果交给数组strArray\r\n\t\t// 获取需要的信息\r\n\t\tint length_strArray = strArray.length;\r\n\t\tint length_ruleList = ruleList.length;\r\n\t\t\r\n\t\tif (length_strArray > 0) {\r\n\t\t\tfor (int iArr = 0; iArr < length_strArray; iArr++) {\r\n\t\t\t\tString s = strArray[iArr];\r\n\t\t\t\t\tfor (int iRul=0; iRul < length_ruleList; iRul++) {\r\n\t\t\t\t\t\tif (s.equals(ruleList[iRul])) {\r\n\t\t\t\t\t\t\tdisStr.append(map.get(s));\r\n\t\t\t\t\t\t\tdisStr.append(\",\");\r\n\t\t\t\t\t\t\t\tbreak;\r\n\t\t\t\t\t\t\t}\r\n\t\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif(disStr.length()>1) {\r\n\t\t\t\tdisStr = disStr.deleteCharAt(disStr.length()-1);\r\n\t\t\t}\r\n\t\t\treturn disStr.toString();\r\n\t\t}\r\n\t\r\n\t//读取整个页面，返回html字符串\r\n\tprivate static String httpRequest(String requestUrl) {\r\n\t\tStringBuffer buffer = null;\r\n\t\tBufferedReader bufferedReader = null;\r\n\t\tInputStreamReader inputStreamReader = null;\r\n\t\tInputStream inputStream = null;\r\n\t\tHttpURLConnection httpUrlConn = null;\r\n\t\ttry {\r\n\t\t\t// 建立get请求\r\n\t\t\tURL url = new URL(requestUrl);\r\n\t\t\thttpUrlConn = (HttpURLConnection) url.openConnection();\r\n\t\t\thttpUrlConn.setDoInput(true);\r\n\t\t\thttpUrlConn.setRequestMethod(\"GET\");\r\n\t\t\t// 获取输入流\r\n\t\t\tinputStream = httpUrlConn.getInputStream();\r\n\t\t\tinputStreamReader = new InputStreamReader(inputStream, \"UTF-8\");\r\n\t\t\tbufferedReader = new BufferedReader(inputStreamReader);\r\n\t\t\t// 从输入流获取结果\r\n\t\t\tbuffer = new StringBuffer();\r\n\t\t\tString str = 
null;\r\n\t\t\twhile ((str = bufferedReader.readLine()) != null) {\r\n\t\t\t\tstr = new String(str.getBytes(), \"UTF-8\");\r\n\t\t\t\tbuffer.append(str);\r\n\t\t\t}\r\n\t\t} catch (Exception e) {\r\n\t\t\te.printStackTrace();\r\n\t\t} finally {\r\n\t\t\tif (bufferedReader != null) {\r\n\t\t\t\ttry {\r\n\t\t\t\t\tbufferedReader.close();\r\n\t\t\t\t} catch (IOException e) {\r\n\t\t\t\t\te.printStackTrace();\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (inputStreamReader != null) {\r\n\t\t\t\ttry {\r\n\t\t\t\t\tinputStreamReader.close();\r\n\t\t\t\t} catch (IOException e) {\r\n\t\t\t\t\te.printStackTrace();\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (inputStream != null) {\r\n\t\t\t\ttry {\r\n\t\t\t\t\tinputStream.close();\r\n\t\t\t\t} catch (IOException e) {\r\n\t\t\t\t\te.printStackTrace();\r\n\t\t\t\t}\r\n\t\t\t}\r\n\t\t\tif (httpUrlConn != null) {\r\n\t\t\t\thttpUrlConn.disconnect();\r\n\t\t\t}\r\n\t\t}\r\n\t\treturn buffer.toString();\r\n\t}\r\n\r\n\t// 过滤掉无用的信息\r\n\tpublic static List<Map<String, String>> htmlFiter(String html, String Regex) {\r\n\t\tList<Map<String, String>> list = new ArrayList<Map<String, String>>();\r\n\t\t// 查找目标\r\n\t\tPattern p = Pattern.compile(Regex);\r\n\t\tMatcher m = p.matcher(html);\r\n\t\twhile (m.find()) {\r\n\t\t\tMap<String, String> map_save = new HashMap<String, String>();\r\n\t\t\t// 可修改部分\r\n\t\t\tmap_save.put(REGEXSTRING1, m.group(1));\r\n\t\t\tmap_save.put(REGEXSTRING2, m.group(2));\r\n\t\t\tmap_save.put(REGEXSTRING3, m.group(3));\r\n\t\t\t\r\n\t\t\tlist.add(map_save);\r\n\t\t}\r\n\t\treturn list;\r\n\t}\r\n\t\r\n\t//unicode格式转中文\r\n\tpublic static String UnicodeToString(String str) {\r\n\t\t\tPattern pattern = Pattern.compile(\"(\\\\\\\\u(\\\\p{XDigit}{4}))\"); // XDigit表示16进制数字，正则里的\\p表示Unicode块\r\n\t\t\tMatcher matcher = pattern.matcher(str);\r\n\t\t\tchar ch;\r\n\t\t\twhile (matcher.find()) {\r\n\t\t\t\tch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码，再char转为字符\r\n\t\t\t\tstr = str.replace(matcher.group(1), ch + 
\"\");\r\n\t\t\t}\r\n\t\t\treturn str;\r\n\t\t}\r\n\t\r\n\tpublic void run() {\r\n\t\twhile(true) { // 循环体！！！\r\n\t\t\t// 链接数据库\r\n\t\t\ttry {\r\n\t\t\t\tMongo mongo = new Mongo(\"localhost\", 27017);\r\n\t\t\t\tDB db = mongo.getDB(DataBaseName);\r\n\t\t\t\tDBCollection collection = db.getCollection(CollectionName);\r\n\t\t\t\t\r\n\t\t\t\t// 调用抓取的方法获取内容\r\n\t\t\t\tString requestUrl = this.release.GetMethod();\r\n\t\t\t\tif(requestUrl.equals(\"\")) {\r\n\t\t\t\t\tbreak;\r\n\t\t\t\t} else {\r\n\t\t\t\t\tSystem.out.println(requestUrl);\r\n\t\t\t\t\t\r\n\t\t\t\t\tString html = httpRequest(requestUrl);\r\n\t\t\t\t\tList<Map<String, String>> resultList = htmlFiter(html, Regex);\r\n\t\t\t\t\t\r\n\t\t\t\t\tif (resultList.isEmpty()) {\r\n\t\t\t\t\t\tSystem.out.printf(\"The end url: %s\", requestUrl);\r\n\t\t\t\t\t\tbreak;\r\n\t\t\t\t\t} else {\r\n\t\t\t\t\t\tfor (Map<String, String> result : resultList) {\r\n\t\t\t\t\t\t\tBasicDBObject dbObject = new BasicDBObject();\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tString type = result.get(REGEXSTRING1);\r\n\t\t\t\t\t\t\tString content = UnicodeToString(result.get(REGEXSTRING2));\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tMap<String, String> map = GetMap();\r\n\t\t\t\t\t\t\tString district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); \r\n\t\t\t\t\t\t\tString property = setCategory(result.get(REGEXSTRING3), ruleList_property, map);\r\n\t\t\t\t\t\t\tString centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map);\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tDate date = new Date();\r\n\t\t\t\t\t\t\tDateFormat time = DateFormat.getDateTimeInstance();\r\n\t\t\t\t\t\t\tString time_str = time.format(date);\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tString source = \"wangstreetcn\";\r\n\t\t\r\n\t\t\t\t\t\t\tdbObject.put(\"content\", content);       // 具体内容\r\n\t\t\t\t\t\t\tdbObject.put(\"createdtime\", time_str);   // 创建时间\r\n\t\t\t\t\t\t\tdbObject.put(\"source\", source);          // 
信息来源\r\n\t\t\t\t\t\t\tdbObject.put(\"district\", district);      // 所属地区\r\n\t\t\t\t\t\t\tdbObject.put(\"property\", property);      // 资产类别\r\n\t\t\t\t\t\t\tdbObject.put(\"centralbank\", centralbank); // 资产类别\r\n\t\t\t\t\t\t\tdbObject.put(\"type\", type); //信息类型\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tcollection.insert(dbObject);\r\n\t\t\t\t\t\t}\r\n\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t} catch (Exception e) {\r\n\t\t\t\te.printStackTrace();\r\n\t\t\t} \r\n\t\t}\r\n\t}\r\n\r\n\tpublic void run1() {\r\n\t\twhile(true) { // 循环体！！！\r\n\t\t\t// 链接数据库\r\n\t\t\ttry {\r\n\t\t\t\tMongo mongo = new Mongo(\"localhost\", 27017);\r\n\t\t\t\tDB db = mongo.getDB(DataBaseName);\r\n\t\t\t\tDBCollection collection = db.getCollection(CollectionName);\r\n\t\t\t\t\r\n\t\t\t\t// 调用抓取的方法获取内容\r\n\t\t\t\tString requestUrl = this.release.GetMethod();\r\n\t\t\t\tif(requestUrl.equals(\"\")) {\r\n\t\t\t\t\tbreak;\r\n\t\t\t\t} else {\r\n\t\t\t\t\tSystem.out.println(requestUrl);\r\n\t\t\t\t\t\r\n\t\t\t\t\tString html = httpRequest(requestUrl);\r\n\t\t\t\t\tList<Map<String, String>> resultList = htmlFiter(html, Regex);\r\n\t\t\t\t\t\r\n\t\t\t\t\tif (resultList.isEmpty()) {\r\n\t\t\t\t\t\tSystem.out.printf(\"The end url: %s\\n\", requestUrl);\r\n\t\t\t\t\t\tbreak;\r\n\t\t\t\t\t} else {\r\n\t\t\t\t\t\tfor (Map<String, String> result : resultList) {\r\n\t\t\t\t\t\t\tBasicDBObject dbObject = new BasicDBObject();\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tString type = result.get(REGEXSTRING1);\r\n\t\t\t\t\t\t\tString content = UnicodeToString(result.get(REGEXSTRING2));\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tMap<String, String> map = GetMap();\r\n\t\t\t\t\t\t\tString district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); \r\n\t\t\t\t\t\t\tString property = setCategory(result.get(REGEXSTRING3), ruleList_property, map);\r\n\t\t\t\t\t\t\tString centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map);\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tDate date = new 
Date();\r\n\t\t\t\t\t\t\tDateFormat time = DateFormat.getDateTimeInstance();\r\n\t\t\t\t\t\t\tString time_str = time.format(date);\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tString source = \"wangstreetcn\";\r\n\t\t\r\n\t\t\t\t\t\t\tdbObject.put(\"content\", content);       // 具体内容\r\n\t\t\t\t\t\t\tdbObject.put(\"createdtime\", time_str);   // 创建时间\r\n\t\t\t\t\t\t\tdbObject.put(\"source\", source);          // 信息来源\r\n\t\t\t\t\t\t\tdbObject.put(\"district\", district);      // 所属地区\r\n\t\t\t\t\t\t\tdbObject.put(\"property\", property);      // 资产类别\r\n\t\t\t\t\t\t\tdbObject.put(\"centralbank\", centralbank); // 资产类别\r\n\t\t\t\t\t\t\tdbObject.put(\"type\", type); //信息类型\r\n\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\tcollection.insert(dbObject);\r\n\t\t\t\t\t\t}\r\n\t\t\t\t\t}\r\n\t\t\t\t}\r\n\t\t\t} catch (Exception e) {\r\n\t\t\t\te.printStackTrace();\r\n\t\t\t} \r\n\t\t}\r\n\t}\r\n\t\r\n}\r\n\t\r\n/**\r\n * contain shared resource\r\n */\r\nclass GetrequestUrl {\r\n\t\r\n\tprivate String url = \"http://api.wallstreetcn.com/v2/livenews?&page=\";\r\n\tprivate int start;\r\n\tprivate int end = 5000;\r\n\t\r\n\tpublic GetrequestUrl(int start) \r\n\t{\r\n\t\tthis.start = start;\r\n\t}\r\n\tpublic GetrequestUrl(int start, int end) \r\n\t{\r\n\t\tthis.start = start;\r\n\t\tthis.end = end;\r\n\t}\r\n\r\n\t/**\r\n\t * Thread safe method\r\n\t */\r\n\tpublic synchronized String GetMethod() // 利用synchronized修饰符同步了整个方法\r\n\t{\r\n\t\tif(this.start <= this.end) {\r\n\t\t\tString requestUrl = this.url+this.start;\r\n\t\t\tthis.start = this.start+1;\r\n\t\t\treturn requestUrl;\r\n\t\t}\r\n\t\telse {\r\n\t\t\treturn \"\"; \r\n\t\t}\r\n\t}\r\n}\r\n\r\n\r\npublic class WallstreetcnSaveTest {\r\n\tpublic static void main(String[] args) {\t\t\r\n\t\t// 多线程抓取\r\n\t\tint start = 1;\r\n\t\tGetrequestUrl url = new GetrequestUrl(start);\r\n//\t\tint start = 1, end = 3000;\r\n//\t\tGetrequestUrl url = new GetrequestUrl(start, end);\r\n\t\t\r\n\t\tint thread_num = 1;\r\n\t\twhile(true) 
{\r\n\t\t\tif(thread_num++ > 8) break;\r\n\t\t\tThread thread = new Thread(new WallstreetcnSave(url));\r\n\t\t\tthread.start();\r\n\t\t}\r\n\t\t\r\n\t}\r\n}\r\n"
  },
  {
    "path": "Spider_Python/README.md",
    "content": "### Spider_Python\n\n抓取网址：[华尔街见闻](http://live.wallstreetcn.com/)\n\n多进程抓取\n"
  },
  {
    "path": "Spider_Python/WallstreetcnSaveTest.py",
    "content": "#!/usr/bin/env python\n# -*- coding:utf-8 -*-\n\nimport sys\nimport re\nimport urllib, urllib2\nimport requests\nimport pymongo\nimport datetime\nimport multiprocessing as mp\n\n\nCategory_Map = {\n    \"1\":u\"外汇\",\n    \"2\":u\"股市\",\n    \"3\":u\"商品\",\n    \"4\":u\"债市\",\n    \"5\":u\"央行\",\n    \"9\":u\"中国\",\n    \"10\":u\"美国\",\n    \"11\":u\"欧元区\",\n    \"12\":u\"日本\",\n    \"13\":u\"英国\",\n    \"14\":u\"澳洲\",\n    \"15\":u\"加拿大\",\n    \"16\":u\"瑞士\",\n    \"17\":u\"其他地区\"\n}\ndef num2name(category_num):\n    if Category_Map.has_key(category_num):\n        return Category_Map[category_num]\n    else:\n        return \"\"\n\nclass MongoDBIO:\n    # 申明相关的属性\n    def __init__(self, host, port, name, password, database, collection):\n        self.host = host\n        self.port = port\n        self.name = name\n        self.password = password\n        self.database = database\n        self.collection = collection\n\n    # 连接数据库，db和posts为数据库和集合的游标\n    def Connection(self):\n        # connection = pymongo.Connection() # 连接本地数据库\n        connection = pymongo.Connection(host=self.host, port=self.port)\n        # db = connection.datas\n        db = connection[self.database]\n        if self.name or self.password:\n            db.authenticate(name=self.name, password=self.password) # 验证用户名密码\n        # print \"Database:\", db.name\n        # posts = db.cn_live_news\n        posts = db[self.collection]\n        # print \"Collection:\", posts.name\n        return posts\n\n# 保存操作\n# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):\n#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()\n#     for save_content in save_contents:\n#         posts.save(save_content)\ndef ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):\n    posts = MongoDBIO(save_host, save_port, save_name, 
save_password, save_database, save_collection).Connection()\n    posts.save(save_content)\n\ndef Spider(url, data):\n    # # 方法1：requests get\n    content = requests.get(url=url, params=data).content # GET请求发送\n    # # 方法2：urllib2 get\n    # data = urllib.urlencode(data) # 编码工作，由dict转为string\n    # full_url = url+'?'+data\n    # print full_url\n    # content = urllib2.urlopen(full_url).read() # GET请求发送\n    # # content = requests.get(full_url).content # GET请求发送\n    # print type(content) # str\n    return content\n\ndef ContentSave(item):\n    # 保存配置\n    save_host = \"localhost\"\n    save_port = 27017\n    save_name = \"\"\n    save_password = \"\"\n    save_database = \"textclassify\"\n    save_collection = \"WallstreetcnSave\"\n\n    source = \"wallstreetcn\"\n    createdtime = datetime.datetime.now()\n    type = item[0]\n    content = item[1].decode(\"unicode_escape\") # json格式数据中，需从'\\\\uxxxx'形式的unicode_escape编码转换成u'\\uxxxx'的unicode编码\n    content = content.encode(\"utf-8\")\n    # print content\n    # district的筛选\n    categorySet = item[2]\n    category_num = categorySet.split(\",\")\n    category_name = map(num2name, category_num)\n    districtset = set(category_name)&{u\"中国\", u\"美国\", u\"欧元区\", u\"日本\", u\"英国\", u\"澳洲\", u\"加拿大\", u\"瑞士\", u\"其他地区\"}\n    district = \",\".join(districtset)\n    propertyset = set(category_name)&{u\"外汇\", u\"股市\", u\"商品\", u\"债市\"}\n    property = \",\".join(propertyset)\n    centralbankset = set(category_name)&{u\"央行\"}\n    centralbank = \",\".join(centralbankset)\n    save_content = {\n        \"source\":source,\n        \"createdtime\":createdtime,\n        \"content\":content,\n        \"type\":type,\n        \"district\":district,\n        \"property\":property,\n        \"centralbank\":centralbank\n    }\n    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)\n\ndef func(page):\n    url = \"http://api.wallstreetcn.com/v2/livenews\"\n    # get参数\n    data = {\n     
   \"page\":page\n    }\n    content = Spider(url, data)\n    items = re.findall(r'\"type\":\"(.*?)\",\"codeType\".*?\"contentHtml\":\"(.*?)\",\"data\".*?\"categorySet\":\"(.*?)\",\"hasMore\"', content) # 正则匹配\n    if len(items) == 0:\n        print \"The End Page:\", page\n        data = urllib.urlencode(data) # 编码工作，由dict转为string\n        full_url = url+'?'+data\n        print full_url\n        sys.exit(0) # 无错误退出\n    else:\n        print \"The Page:\", page, \"Downloading...\"\n        for item in items:\n            ContentSave(item)\n\n\nif __name__ == '__main__':\n\n    start = datetime.datetime.now()\n\n    start_page = 1\n    end_page = 3300\n\n\n    # 多进程抓取\n    pages = [i for i in range(start_page, end_page)]\n    p = mp.Pool()\n    p.map_async(func, pages)\n    p.close()\n    p.join()\n\n\n    # 单进程抓取\n    page = end_page\n\n    while 1:\n        url = \"http://api.wallstreetcn.com/v2/livenews\"\n        # get参数\n        data = {\n            \"page\":page\n        }\n        content = Spider(url, data)\n        items = re.findall(r'\"type\":\"(.*?)\",\"codeType\".*?\"contentHtml\":\"(.*?)\",\"data\".*?\"categorySet\":\"(.*?)\",\"hasMore\"', content) # 正则匹配\n        if len(items) == 0:\n            print \"The End Page:\", page\n            data = urllib.urlencode(data) # 编码工作，由dict转为string\n            full_url = url+'?'+data\n            print full_url\n            break\n        else:\n            print \"The Page:\", page, \"Downloading...\"\n            for item in items:\n                ContentSave(item)\n            page += 1\n\n    end = datetime.datetime.now()\n    print \"last time: \", end-start\n"
  },
  {
    "path": "WechatSearchProjects/README.md",
    "content": "### 使用Scrapy或Requests递归抓取[微信搜索](http://weixin.sogou.com/weixin)结果\r\n\r\n使用Scrapy方法 或者 使用Requests+BeautifulSoup\r\n\r\n**使用Scrapy方法：**  \r\n\r\n* 将querystring替换为你要查询的单词\r\n\r\n* type可以选择\r\n\r\n* i的范围可以调整，对应查询的搜索结果页面数目  \r\n"
  },
  {
    "path": "WechatSearchProjects/Spider_Main.py",
    "content": "#coding: utf-8\r\nfrom scrapy.cmdline import execute\r\nimport os\r\n\r\nif __name__ == '__main__':\r\n    project_name = \"Wechatproject\"\r\n    spider_name = \"wechat\"\r\n    results_name = \"results/results.json\"\r\n\r\n    if not os.path.exists(project_name):\r\n        print \"Please Edit the project files and Run again!!!\"\r\n        s = \"scrapy startproject %s\" % project_name\r\n        execute(s.split())\r\n    else:\r\n        print \"Start Crawling!!!\"\r\n        path = os.getcwd() # 获取当前路径\r\n        os.chdir(path+\"/\"+project_name) # 修改当前路径\r\n        if os.path.exists(results_name):\r\n            os.remove(results_name)\r\n        s = \"scrapy crawl %s\" % spider_name\r\n        # s = \"scrapy crawl %s -o %s -t json\" % (spider_name, results_name)\r\n        execute(s.split())\r\n"
  },
  {
    "path": "WechatSearchProjects/WechatSearchTest.py",
    "content": "#!/usr/bin/env python\r\n# -*- coding:utf-8 -*-\r\n\r\nimport sys\r\nimport re\r\nimport urllib, urllib2\r\nimport requests\r\nimport pymongo\r\nimport datetime\r\nfrom bs4 import BeautifulSoup\r\nimport multiprocessing as mp\r\n\r\n\r\nclass MongoDBIO:\r\n    # 申明相关的属性\r\n    def __init__(self, host, port, name, password, database, collection):\r\n        self.host = host\r\n        self.port = port\r\n        self.name = name\r\n        self.password = password\r\n        self.database = database\r\n        self.collection = collection\r\n\r\n    # 连接数据库，db和posts为数据库和集合的游标\r\n    def Connection(self):\r\n        # connection = pymongo.Connection() # 连接本地数据库\r\n        connection = pymongo.Connection(host=self.host, port=self.port)\r\n        # db = connection.datas\r\n        db = connection[self.database]\r\n        if self.name or self.password:\r\n            db.authenticate(name=self.name, password=self.password) # 验证用户名密码\r\n        # print \"Database:\", db.name\r\n        # posts = db.cn_live_news\r\n        posts = db[self.collection]\r\n        # print \"Collection:\", posts.name\r\n        return posts\r\n\r\n# # 保存操作\r\n# def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents):\r\n#     posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()\r\n#\r\n#     for save_content in save_contents:\r\n#         posts.save(save_content)\r\n# 保存操作\r\ndef ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content):\r\n    posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection()\r\n    posts.save(save_content)\r\n\r\n\r\ndef GetTitleUrl(url, data):\r\n    content = requests.get(url=url, params=data).content # GET请求发送\r\n    soup = BeautifulSoup(content)\r\n    tags = soup.findAll(\"h4\")\r\n    titleurl = []\r\n    for tag in tags:\r\n        
item = {\"title\":tag.text.strip(), \"link\":tag.find(\"a\").get(\"href\"), \"content\":\"\"}\r\n        titleurl.append(item)\r\n    return titleurl\r\n\r\ndef GetContent(url):\r\n    soup = BeautifulSoup(requests.get(url=url).content)\r\n    tag = soup.find(\"div\", attrs={\"class\":\"rich_media_content\", \"id\":\"js_content\"}) # 提取第一个标签\r\n    content_list = [tag_i.text for tag_i in tag.findAll(\"p\")]\r\n    content = \"\".join(content_list)\r\n    return content\r\n\r\ndef ContentSave(item):\r\n    # 保存配置\r\n    save_host = \"localhost\"\r\n    save_port = 27017\r\n    save_name = \"\"\r\n    save_password = \"\"\r\n    save_database = \"testwechat\"\r\n    save_collection = \"result\"\r\n\r\n    save_content = {\r\n        \"title\":item[\"title\"],\r\n        \"link\":item[\"link\"],\r\n        \"content\":item[\"content\"]\r\n    }\r\n\r\n    ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content)\r\n\r\ndef func(tuple):\r\n    querystring, type, page = tuple[0], tuple[1], tuple[2]\r\n    url = \"http://weixin.sogou.com/weixin\"\r\n    # get参数\r\n    data = {\r\n        \"query\":querystring,\r\n        \"type\":type,\r\n        \"page\":page\r\n    }\r\n\r\n    titleurl = GetTitleUrl(url, data)\r\n\r\n    for item in titleurl:\r\n        url = item[\"link\"]\r\n        print \"url:\", url\r\n        content = GetContent(url)\r\n        item[\"content\"] = content\r\n        ContentSave(item)\r\n\r\n\r\nif __name__ == '__main__':\r\n    start = datetime.datetime.now()\r\n\r\n    querystring = u\"清华\"\r\n    type = 2 # 2-文章，1-微信号\r\n\r\n    # 多进程抓取\r\n    p = mp.Pool()\r\n    p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)])\r\n    p.close()\r\n    p.join()\r\n\r\n    # # 单进程抓取\r\n    # for page in range(1, 50, 1):\r\n    #     tuple = (querystring, type, page)\r\n    #     func(tuple)\r\n\r\n    end = datetime.datetime.now()\r\n    print \"last time: \", end-start\r\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/__init__.py",
    "content": ""
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/items.py",
    "content": "# Define here the models for your scraped items\n#\n# See documentation in:\n# http://doc.scrapy.org/en/latest/topics/items.html\n\nfrom scrapy.item import Item, Field\n\nclass WechatprojectItem(Item):\n    # define the fields for your item here like:\n    # name = Field()\n    title = Field()\n    link = Field()\n    content = Field()\n    pass\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py",
    "content": "# Define your item pipelines here\n#\n# Don't forget to add your pipeline to the ITEM_PIPELINES setting\n# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html\n\n# class WechatprojectPipeline(object):\n#     def process_item(self, item, spider):\n#         return item\n\n\n\n# # MySQL Database\n# from twisted.enterprise import adbapi  # import twisted package\n# class WechatprojectPipeline(object):\n#     # connnect databases\n#     def __init__(self):\n#         self.dbpool = adbapi.ConnectionPool(\"MySQLdb\",\n#                                             host = \"localhost\",\n#                                             db = \"testwechat\", # you must build database named testwechat\n#                                             user = \"root\",\n#                                             passwd = \"testpasswd\",\n#                                             charset = \"utf8\")\n#     # pipeline default function\n#     def process_item(self, item, spider):\n#         query = self.dbpool.runInteraction(self._conditional_insert, item)\n#         return item\n#     # insert the data to databases\n#     def _conditional_insert(self, tx, item): # item dictionary\n#         # you must build table named result in database testwechat\n#         tx.execute(\"insert into result values (%s, %s, %s)\", (item[\"title\"], item[\"link\"], item[\"content\"]))\n\n\n# MongoDB Database\nimport pymongo\nclass WechatprojectPipeline(object):\n    # connnect databases\n    def __init__(self):\n        connection = pymongo.Connection(host = \"localhost\", port = 27017)\n        db = connection[\"testwechat\"] # you need no build database named testdouban\n        # db.authenticate(name = \"root\", password = \"testpasswd\") # no name and password for localhost\n        self.posts = db[\"result\"] # you need not build collection named book\n    # pipeline default function\n    def process_item(self, item, spider):\n        self.posts.insert(dict(item)) # 
convert json to dict\n        return item\n\n\n# # Json File\n# import json\n# import codecs\n# class WechatprojectPipeline(object):\n#     def __init__(self):\n#         self.file = codecs.open('results.json', 'w', 'utf-8')\n#     def process_item(self, item, spider):\n#         line = json.dumps(dict(item))+'\\n'\n#         self.file.write(line)\n#         return item\n\n\n#############################################################################################\n# '''if you want to download images'''\n# from scrapy.http.request import Request\n# from scrapy.contrib.pipeline.images import ImagesPipeline\n# class MyImagesPipeline(ImagesPipeline):\n#     #@TODO\n#     def get_media_requests(self, item, info):\n#         for image_url in item['image_urls']: # item['image_urls'] contains the image urls\n#             # yield Request(image_url)\n#             yield Request(image_url, meta={'name': item['name']}) # item['name'] contains the images name\n#     def item_completed(self, results, item, info):\n#         return super(MyImagesPipeline, self).item_completed(results, item, info)\n#     def file_path(self, request, response=None, info=None):\n#         f_path = super(MyImagesPipeline, self).file_path(request, response, info)\n#         f_path = f_path.replace('full', request.meta['name'])\n#         return f_path\n#         ##########################################################\n#         # import hashlib\n#         # image_guid = hashlib.sha1(request.url).hexdigest()  # change to request.url after deprecation\n#         # return '%s/%s.jpg' % (request.meta['name'], image_guid)\n#     pass\n# # from scrapy.contrib.pipeline.media import MediaPipeline\n# # class MyMediaPipeline(MediaPipeline):\n# #     #@TODO\n# #     pass\n\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/settings.py",
    "content": "# Scrapy settings for Wechatproject project\n#\n# For simplicity, this file contains only the most important settings by\n# default. All the other settings are documented here:\n#\n#     http://doc.scrapy.org/en/latest/topics/settings.html\n#\n\nBOT_NAME = 'Wechatproject'\n\nSPIDER_MODULES = ['Wechatproject.spiders']\nNEWSPIDER_MODULE = 'Wechatproject.spiders'\n\nITEM_PIPELINES = ['Wechatproject.pipelines.WechatprojectPipeline'] # add settings\n#############################################################################################\n# '''if you want to download images'''\n# ITEM_PIPELINES = {'Wechatproject.pipelines.WechatprojectPipeline':1, 'Wechatproject.pipelines.MyImagesPipeline':2 # add settings\n# IMAGES_STORE = './images'\n\n# Crawl responsibly by identifying yourself (and your website) on the user-agent\n#USER_AGENT = 'Wechatproject (+http://www.yourdomain.com)'\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/spiders/__init__.py",
    "content": "# This package will contain the spiders of your Scrapy project\n#\n# Please refer to the documentation for information on how to create and manage\n# your spiders.\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py",
    "content": "#coding: utf-8\r\nfrom scrapy.spider import BaseSpider\r\nfrom scrapy.selector import Selector\r\nfrom Wechatproject.items import WechatprojectItem\r\nfrom bs4 import BeautifulSoup\r\nfrom scrapy.http import Request\r\n\r\n\r\nclass WechatSpider(BaseSpider):\r\n    #############################################################################################\r\n    '''微信搜索程序'''\r\n    name = \"wechat\"\r\n\r\n    start_urls = []\r\n    querystring = u\"清华\"\r\n    type = 2 # 2-文章，1-微信号\r\n    for i in range(1, 50, 1):\r\n        start_urls.append(\"http://weixin.sogou.com/weixin?type=%d&query=%s&page=%d\" % (type, querystring, i))\r\n    # print start_urls\r\n\r\n    #############################################################################################\r\n    ## 递归抓取\r\n\r\n    ## 使用xpath()方法，注意item中键对值为string类型，extract()方法返回list\r\n    def parse(self, response):\r\n        # print response.body\r\n        sel = Selector(response)\r\n        sites = sel.xpath('//div[@class=\"txt-box\"]/h4/a')\r\n        for site in sites:\r\n            item = WechatprojectItem()\r\n            item[\"title\"] = site.xpath(\"text()\").extract() # 其中在item.py中定义了title = Field()\r\n            item[\"link\"] = site.xpath(\"@href\").extract() # 其中在item.py中定义了link = Field()\r\n            #############################################################################################\r\n            # yield item ## 只抓取当前页数据\r\n            next_url = item[\"link\"][0]\r\n            # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据\r\n            yield Request(url=next_url, meta={\"item\":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据\r\n\r\n    ## 使用BeautifulSoup方法，注意item中键对值为string类型\r\n    def parse(self, response):\r\n        # print response.body\r\n        soup = BeautifulSoup(response.body)\r\n        tags = soup.findAll(\"h4\")\r\n        for tag in tags:\r\n            item = WechatprojectItem()\r\n            item[\"title\"] = tag.text # 
其中在item.py中定义了title = Field()\r\n            item[\"link\"] = tag.find(\"a\").get(\"href\") # 其中在item.py中定义了link = Field()\r\n            #############################################################################################\r\n            # yield item ## 只抓取当前页数据\r\n            next_url = item[\"link\"]\r\n            # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据\r\n            yield Request(url=next_url, meta={\"item\":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据\r\n\r\n    def parse2(self, response):\r\n        soup = BeautifulSoup(response.body)\r\n        tag = soup.find(\"div\", attrs={\"class\":\"rich_media_content\", \"id\":\"js_content\"}) # 提取第一个标签\r\n        content_list = [tag_i.text for tag_i in tag.findAll(\"p\")]\r\n        content = \"\".join(content_list)\r\n        # print content\r\n        # item = WechatprojectItem() ## 只抓取二级页面数据\r\n        item = response.meta['item'] ## 抓取当前页数和二级页面数据\r\n        item[\"content\"] = content\r\n        return item\r\n"
  },
  {
    "path": "WechatSearchProjects/Wechatproject/scrapy.cfg",
    "content": "# Automatically created by: scrapy startproject\n#\n# For more information about the [deploy] section see:\n# http://doc.scrapy.org/en/latest/topics/scrapyd.html\n\n[settings]\ndefault = Wechatproject.settings\n\n[deploy]\n#url = http://localhost:6800/\nproject = Wechatproject\n"
  },
  {
    "path": "ZhihuSpider/ReadMe.md",
    "content": "### 网络爬虫之用户名密码及验证码登陆：爬取[知乎](http://www.zhihu.com/)网站 \r\n\r\n**一些说明：** \r\n\r\n* 使用requests包来爬取。首先尝试用用户名密码自动登陆，如果失败，则需要采用cookie登陆。  \r\n\r\n* 配置文件config.ini，其中包括用户名密码信息，如果有验证码情况，需要手动登陆一次网站获取cookie信息。  \r\n\r\n* 判断登陆成功与否，看生成的html文件中有没有用户信息。\r\n"
  },
  {
    "path": "ZhihuSpider/ZhihuSpider.py",
    "content": "# -*- coding: utf-8 -*-\r\n'''\r\n网络爬虫之用户名密码及验证码登陆：爬取知乎网站\r\n'''\r\nimport requests\r\nimport ConfigParser\r\n\r\ndef create_session():\r\n    cf = ConfigParser.ConfigParser()\r\n    cf.read('config.ini')\r\n    cookies = cf.items('cookies')\r\n    cookies = dict(cookies)\r\n    from pprint import pprint\r\n    pprint(cookies)\r\n    email = cf.get('info', 'email')\r\n    password = cf.get('info', 'password')\r\n\r\n    session = requests.session()\r\n    login_data = {'email': email, 'password': password}\r\n    header = {\r\n        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36',\r\n        'Host': 'www.zhihu.com',\r\n        'Referer': 'http://www.zhihu.com/'\r\n    }\r\n    r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header)\r\n    if r.json()['r'] == 1:\r\n        print 'Login Failed, reason is:',\r\n        for m in r.json()['data']:\r\n            print r.json()['data'][m]\r\n        print 'So we use cookies to login in...'\r\n        has_cookies = False\r\n        for key in cookies:\r\n            if key != '__name__' and cookies[key] != '':\r\n                has_cookies = True\r\n                break\r\n        if has_cookies is False:\r\n            raise ValueError('请填写config.ini文件中的cookies项.')\r\n        else:\r\n            # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆\r\n            r = session.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆\r\n\r\n    with open('login.html', 'w') as fp:\r\n        fp.write(r.content)\r\n\r\n    return session, cookies\r\n\r\n\r\nif __name__ == '__main__':\r\n    requests_session, requests_cookies = create_session()\r\n\r\n    # url = 'http://www.zhihu.com/login/email'\r\n    url = 'http://www.zhihu.com/topic/19552832'\r\n    # content = requests_session.get(url).content # 未登陆\r\n    # content = requests.get(url, 
cookies=requests_cookies).content # 已登陆\r\n    content = requests_session.get(url, cookies=requests_cookies).content # 已登陆\r\n    with open('url.html', 'w') as fp:\r\n        fp.write(content)"
  },
  {
    "path": "ZhihuSpider/config.ini",
    "content": "[info]\nemail = xxxx@163.com\npassword = xxxx\n\n[cookies]\nq_c1 =\ncap_id =\n_za =\n__utmt =\n__utma =\n__utmb =\n__utmc =\n__utmz =\n__utmv =\nz_c0 =\nunlock_ticket ="
  }
]